# Learning goals:
    - Get introduced to a new machine learning library
    - Learn about a decision tree classifier
    - Practice more data manipulation techniques
    - Get introduced to bias in machine learning

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from IPython.display import Image
from sklearn import preprocessing, neighbors
print("Imported all required components")

Imported all required components


In [134]:
class Adult(object):
    
    DIRECTORY = os.path.join(os.getcwd(), 'data','adult')
    DATA_PATH = os.path.join(DIRECTORY, 'adult.data')
    TEST_PATH = os.path.join(DIRECTORY, 'adult.test')
    COLUMN_NAMES = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", 
                "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "salary"]
    CONTINUOUS_FEATURES = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    CATEGORICAL_FEATURES = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country", "salary"]
    
    def __init__(self):
        train_data = pd.read_csv(Adult.DATA_PATH, names=Adult.COLUMN_NAMES).dropna()
        test_data = pd.read_csv(Adult.TEST_PATH, names=Adult.COLUMN_NAMES).dropna()
        test_data['age'] = test_data.age.apply(lambda x: float(x)) # convert strings to floats
        self._train_data = Adult.to_one_hot(Adult.normalize(train_data))
        self._test_data = Adult.to_one_hot(Adult.normalize(test_data))
        # TODO: Fix train and test data to ensure they have the same number of features
        self.rename_columns()
    
    def rename_columns(self):
        self._train_data[['salary_<=50K', 'salary_>50K']] = self._train_data[['salary_ <=50K', 'salary_ >50K']]
        self._train_data.drop(columns=['salary_ <=50K', 'salary_ >50K'])
    @staticmethod
    def normalize(dataset):
        X = dataset[Adult.CONTINUOUS_FEATURES]
        dataset[Adult.CONTINUOUS_FEATURES] = (X - np.mean(X))/ np.std(X)
        return dataset
    
    @staticmethod
    def to_one_hot(dataset):
        dataset = pd.get_dummies(dataset, columns=Adult.CATEGORICAL_FEATURES)
        return dataset
    @staticmethod
    def split_data(data):
        data_columns = [col for col in data if not col.startswith('salary_')]
        target_columns = [col for col in data if col.startswith('salary_')]
        return data[data_columns].as_matrix(), data[target_columns].as_matrix()

    @property
    def train_data(self):
        return Adult.split_data(self._train_data)
    
    
    def align_with_train_data(self, test_dataset):
        train_columns = [col for col in self._train_data]
        test_columns=[col for col in test_dataset]
        for column in train_columns:
            if not column in test_columns and not column.startswith('salary_'):
                print("absent column: ", column)
                test_dataset[column] =  0 # Add new series with zeroes everywhere
        aligned_df = test_dataset[train_columns]
        return aligned_df
    
    @property
    def test_data(self):
        aligned_df = self.align_with_train_data(self._test_data)
        return Adult.split_data(aligned_df)
    

In [135]:
dataset = Adult()

In [137]:
Xtrain, Ytrain = dataset.train_data
Xtest, Ytest = dataset.test_data



KeyError: "['salary_ <=50K' 'salary_ >50K' 'salary_<=50K' 'salary_>50K'] not in index"

In [113]:
Xtest[:1], Ytest[:1]

(array([[-0.99412926,  0.35347399, -1.19686359, -0.14266185, -0.21806206,
         -0.03143184,  0.        ,  0.        ,  0.        ,  0.        ,
          1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
          1.        ,  0.        ,  0.



# Predict salary class

In [85]:
from sklearn import tree

In [101]:
Xtrain.shape, Xtest.shape

((32561, 108), (16281, 108))

In [108]:
n_neighbors = 10
clf = neighbors.KNeighborsClassifier(n_neighbors)

In [110]:
clf.fit(Xtrain, Ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [111]:
clf.score(Xtest, Ytest)

0.053743627541305815