# Homework 3: Classification using sklearn  
- Francisco McGee
- CIS 5526, FALL 2018
- Dr. Slobodan Vucetic

# HEADS UP!!!!!
- I'm going to do all the processing for adult_data first, and then I'm going to run the adult_data through the sklearn pipeline along with the original datasets, including the iris dataset.

In [1]:
import sklearn
from sklearn import datasets
import numpy as np
from numpy import array
from numpy import argmax

import pandas as pd
from copy import deepcopy

### Helper functions, variables

In [2]:
page_break = "#" * 85


def get_columns_dict(header):
    header_dict = dict()
    counter = 1
    for item in header:
        header_dict[item] = counter
        counter += 1
    return header_dict

def get_column_index(columns, header_dict):
    index = []
    for column in columns:
        index.append(header_dict[column])
    
    index[:] = [x - 1 for x in index]
    return index

def get_columns(header_string):
    lines = header_string.splitlines()
    header = []
    for line in lines:
        sub_line = line.split(" ")
        for sub in sub_line:
            if ":" in sub:
                word = sub.replace(":", "")
                header.append(word)
    return header



def get_uniques(df, should_encode):
    uniques = dict()
    
    for column in data:
        if column in should_encode:
            uniques[column] = data[column].unique()
    
    return uniques


def get_one_hot_column(uniques_column):
    values = array(uniques_column)                                 # to numpy array
    label_encoder = LabelEncoder()                                 # create LabelEncoder
    # integer encoding
    integer_encoded = label_encoder.fit_transform(values)          # perform integer encoding
    # binary encode
    onehot_encoder = OneHotEncoder(sparse=False)                             # create one-hot encoder
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)       # reshape
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)           # perform one-hot encoding
    return onehot_encoded


# input: Pandas dataframe, and a list of columns that need to be one-hot encoded
# output: one-hot encoded Pandas dataframe
def get_one_hot_df(df, should_encode):
    for col in should_encode:    
        one_hot = pd.get_dummies(df[col])                  # generate the one-hot version of the column
        df = df.drop(col, axis=1)                          # delete the original column
        df = pd.concat([df, one_hot], axis=1, sort=False)  # add the one-hot dataframe to the original dataframe
        
    return df

# Pre-process adult_data
### the next couple blocks of code will munge the adult_data for input to the sklearn ML code
(1) get header for adult_data

(2) one-hot encode the categorical columns of adult_data in a Pandas dataframe

(3) munge the dataframe back into numpy arrays for input into the sklearn ML code

## (1) get header for adult_data
- "column_string" was copy/pasted from the adult_data description
- the following code block parses "column_string" to get the columns for the adult_data

In [3]:
columns_string = '''
age: continuous.
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
fnlwgt: continuous.
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
education-num: continuous.
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
sex: Female, Male.
capital-gain: continuous.
capital-loss: continuous.
hours-per-week: continuous.
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
'''

columns = get_columns(columns_string)
columns.append("income")
print(columns)
columns_dict = get_columns_dict(columns)
print(columns_dict)

print("\ncompleted get header")

['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
{'age': 1, 'workclass': 2, 'fnlwgt': 3, 'education': 4, 'education-num': 5, 'marital-status': 6, 'occupation': 7, 'relationship': 8, 'race': 9, 'sex': 10, 'capital-gain': 11, 'capital-loss': 12, 'hours-per-week': 13, 'native-country': 14, 'income': 15}

completed get header


## (2) one-hot encode the categorical columns of adult_data in a Pandas dataframe
- import adult_data into a Pandas dataframe
- manually identify the categorical columns in "should_encode"
- manually encode the 'income'
- the following code block parses "column_string" to get the columns for the adult_data
- one-hot encode the necessary columns in the dataframe using "get_one_hot_df"

In [4]:
file_name = "adult.data"
data = pd.read_csv(file_name, names=columns)
should_encode = ['race', 'education', 'sex', 'occupation', 'relationship', 
                 'workclass', 'native-country', 'marital-status']

# encode the target column with 0, 1 for classification
data['income'] = data['income'].map({' <=50K' : 0, ' >50K' : 1,})
data_copy = deepcopy(data)               # keep a deepcopy in case I screw up the original and need it again

one_hot_df = get_one_hot_df(data, should_encode) 

print("completed one-hot encoding")

completed one-hot encoding


## (3) munge the dataframe back into numpy arrays for input into the sklearn ML code


In [5]:
one_hot_df_copy = deepcopy(one_hot_df)
one_hot_targets = one_hot_df['income'].tolist()              # this is the y, targets
one_hot_df = one_hot_df.drop('income', axis=1)  
one_hot_data = one_hot_df.values.tolist()                   # this is the X, training data
# convert targets, data to np.array
one_hot_targets = np.asarray(one_hot_targets)
one_hot_data = np.asarray(one_hot_data)
one_hot_dataset = (one_hot_data, one_hot_targets)           # make a tuple of the data, targets

print("completed dataframe munging to lists")

completed dataframe munging to lists


# The sklearn stuff starts here

### munge the iris datasets a little bit

In [6]:
iris = sklearn.datasets.load_iris()

iris_trunc = []

for l in iris.data:
    iris_trunc.append(l[2:4].tolist())
    
iris_data = np.asarray(iris_trunc)
iris_dataset = (iris_data, iris.target)

print(iris_dataset)
print("completed munging iris dataset")

(array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.7, 0.4],
       [1.4, 0.3],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.5, 0.1],
       [1.5, 0.2],
       [1.6, 0.2],
       [1.4, 0.1],
       [1.1, 0.1],
       [1.2, 0.2],
       [1.5, 0.4],
       [1.3, 0.4],
       [1.4, 0.3],
       [1.7, 0.3],
       [1.5, 0.3],
       [1.7, 0.2],
       [1.5, 0.4],
       [1. , 0.2],
       [1.7, 0.5],
       [1.9, 0.2],
       [1.6, 0.2],
       [1.6, 0.4],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.6, 0.2],
       [1.6, 0.2],
       [1.5, 0.4],
       [1.5, 0.1],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.2, 0.2],
       [1.3, 0.2],
       [1.4, 0.1],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.3, 0.3],
       [1.3, 0.3],
       [1.3, 0.2],
       [1.6, 0.6],
       [1.9, 0.4],
       [1.4, 0.3],
       [1.6, 0.2],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [4.7, 1.4],
       [4.5, 1.5],
       [4.9

## Here is the original code from the assignment, with 2 added datasets:
### (1) iris_dataset
### (2) adult_dataset
## For convenience, I print out which dataset, classifier, and score.
* NOTE: visualization has been disabled for the iris and adult datasets because of dimensionality problems

In [7]:
# Code source: Gaël Varoquaux
#              Andreas Müller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh


names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", 
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
#    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier()]
#    GaussianNB(),
#    QuadraticDiscriminantAnalysis()]


X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)

rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable,
            iris_dataset,
            one_hot_dataset
            ]


In [8]:

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable,
            iris_dataset,
            one_hot_dataset
            ]
dataset_counter_dict = {1: "moons", 2: "circles", 3: "linearly_separable", 4: "iris_dataset", 5: "adult_data"}

dataset_counter = 1

scores = dict()

<Figure size 2700x900 with 0 Axes>

In [13]:

for ds_cnt, ds in enumerate(datasets):
    print(page_break)
    print("dataset:\t", dataset_counter_dict[dataset_counter])
    print(page_break)
    

    # preprocess dataset, split into training and test part
    X, y = ds
    #print("X.shape", X.shape)
    
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    dataset_scores = []
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        #print(page_break)
        #print("X_train shape:", X_train.shape)
        #print(page_break)

        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print("\tname:", name, "\t\t\tscore:", score)
        dataset_scores.append(score)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        #print(page_break)
        #print("that np.c_...ravel stuff: ", np.c_[xx.ravel(), yy.ravel()].shape)
        #print("xx.shape:", xx.shape, len(xx))
        #print("yy.shape:", yy.shape, len(yy))
        #print(page_break)

        try:
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            # Plot also the training points
            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
            # and testing points
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                       alpha=0.6)

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            if ds_cnt == 0:
                ax.set_title(name)
            ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=15, horizontalalignment='right')
            i += 1
        except(ValueError):
            print("dimensions cannot be visualized")
    scores[dataset_counter_dict[dataset_counter]] = dataset_scores
    dataset_counter += 1
plt.tight_layout()
plt.show()

print(page_break)
print("\n completed sklearn ML pipeline")

#####################################################################################


KeyError: 6

The code above trained several types of classifiers on 3 synthetic data sets. Among them are kNN and feedforward neural networks. There are also some other algorithms we will introduce during the course. At the moment, you do not have to worry what they are and how they work. 

**Question 1**. Study the code and try to understand what each line does. In particular, pay attention to how easy it is to train predictors of different types. Run the code. You should be able to see a nice display demonstrating performance of different algorithms on 3 data sets.

**Question 2**. Train all the listed classifiers on Iris data (you can load it using *iris = sklearn.datasets.load_iris()*) and test their accuracy. Report and discuss the results

**Question 3**. Play with the hyperparameters of each of the algorithms. Try to improve the accuracy on the test data. 

**Question 4**. Perform the exploratory data analysis of  the Adult Data Set from http://archive.ics.uci.edu/ml/datasets/Adult. Train and estimate accuracy of all of all the classifiers from Question 1. Note that you will have to preprocess your data set before training. Explain all the preprocessing steps you applied and report on the accuracy on test data. Report all EDA and classification results in a 1-page document.


# Question 4 Response:
### My steps to pre-process the adult_data were:
(1) get header for adult_data

(2) one-hot encode the categorical columns of adult_data in a Pandas dataframe

(3) munge the dataframe back into numpy arrays for input into the sklearn ML code

# Now, compare accuracy scores of the classifiers across the datasets:

In [10]:
names_dict = dict()
names_counter = 0
for name in names:
    names_dict[names_counter] = name
    names_counter += 1
    

scores_df = pd.DataFrame.from_dict(scores)
scores_df = scores_df.rename(index=names_dict)
scores_df

Unnamed: 0,moons,circles,linearly_separable,iris_dataset,adult_data
Nearest Neighbors,0.975,0.925,0.925,0.983333,0.814818
Linear SVM,0.875,0.4,0.925,0.983333,0.850672
RBF SVM,0.975,0.875,0.95,0.983333,0.787869
Decision Tree,0.95,0.775,0.95,0.983333,0.851823
Random Forest,0.925,0.75,0.95,0.983333,0.759693
Neural Net,0.9,0.75,0.95,0.983333,0.855969
AdaBoost,0.925,0.825,0.95,1.0,0.862649


In [11]:
scores_df.describe()

Unnamed: 0,moons,circles,linearly_separable,iris_dataset,adult_data
count,7.0,7.0,7.0,7.0,7.0
mean,0.932143,0.757143,0.942857,0.985714,0.826213
std,0.037401,0.170608,0.012199,0.006299,0.039775
min,0.875,0.4,0.925,0.983333,0.759693
25%,0.9125,0.75,0.9375,0.983333,0.801344
50%,0.925,0.775,0.95,0.983333,0.850672
75%,0.9625,0.85,0.95,0.983333,0.853896
max,0.975,0.925,0.95,1.0,0.862649


In [301]:
transposed = scores_df.transpose()
transposed.describe()

Unnamed: 0,Nearest Neighbors,Linear SVM,RBF SVM,Decision Tree,Random Forest,Neural Net,AdaBoost
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.92463,0.786801,0.914241,0.907016,0.878605,0.907753,0.899196
std,0.067161,0.217882,0.082544,0.077503,0.103793,0.058374,0.05304
min,0.814818,0.4,0.787869,0.8,0.759693,0.85,0.825
25%,0.925,0.850672,0.875,0.851747,0.775,0.855432,0.862649
50%,0.925,0.875,0.95,0.95,0.925,0.9,0.925
75%,0.975,0.883333,0.975,0.95,0.95,0.95,0.933333
max,0.983333,0.925,0.983333,0.983333,0.983333,0.983333,0.95
