TODO:
 1. Add more exploratory data analysis (and its own section)
 2. Figure out a better way to merge the columns
 3. Use cross validation in training
 4. Hyper parameter tuning for all of the classifiers

ideas
- bin education (highscool or below, bachelors, masters, doctorate, associate)
- is fnlwgt useful? -- read more on this one
- look into what '?' is for workclass, do we want to drop this attribute?
	- same issue for native country
- possibly bin martial status as well -- as this may not be useful to have so many classes
- see how useful capital loss/gains are -- this may indicate the person is active in the stock market and therefore may have more disposable income
- hours per week may also be a strong indicator of if they make over 50k a year
- what is relationship in the context of the dataset?
- can we further bin the occupation category as well?

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Data preprocessing

In [2]:
data = pd.read_csv('data/adult.data')
test = pd.read_csv('data/adult.test')

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary-class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
# get discrete class columns from data, and one hot encode
discrete_classes = ['workclass',
                    'education', 
                    'sex',
                    'marital-status',
                    'occupation',
                    'relationship', 
                    'native-country',
                    'race', 
                    'salary-class']
encoded_train = pd.get_dummies(data[discrete_classes])
encoded_test = pd.get_dummies(test[discrete_classes])

# drop old non-encoded columns from data, and add encoded data
data.drop(columns=discrete_classes, inplace=True)
data = pd.concat([data, encoded_train], axis=1)
test.drop(columns=discrete_classes, inplace=True)
test = pd.concat([test, encoded_test], axis=1)

# drop extra output column as 'salary <= 50k' -> 0, and 'salary >50k' -> 1
data.drop(columns=['salary-class_ <=50K'], inplace=True)
data.rename(columns={'salary-class_ >50K': 'salary-class'}, inplace=True)
test.drop(columns=['salary-class_ <=50K'], inplace=True)
test.rename(columns={'salary-class_ >50K': 'salary-class'}, inplace=True)

In [6]:
#TODO: figure out what to do in order to merge the columns better
x_keys = set(data.keys()) - set(['salary-class']) & set(test.keys()) - set(['salary-class'])
# x_test_keys = set(test.keys()) - set(['salary-class'])
y_keys = set(['salary-class'])

X_train = data[x_keys]
X_test = test[x_keys]

Y_train = data[y_keys]
Y_test = test[y_keys]

# Experiments

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

## Decision Tree classifier

In [8]:
dt_classifier = DecisionTreeClassifier(max_depth = 5)
dt_classifier.fit(X_train, Y_train)

DecisionTreeClassifier(max_depth=5)

In [9]:
dt_classifier.score(X_test, Y_test)

0.8519746944290891

## Support Vector Machine classifier

In [16]:
svc = SVC()
svc.fit(X_train, Y_train['salary-class'])

SVC()

In [17]:
svc.score(X_test, Y_test)

0.7986610159081138

## K Nearest Neighbors classifier

In [21]:
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, Y_train['salary-class'])

KNeighborsClassifier()

In [22]:
knn_classifier.score(X_test, Y_test)

0.7767950371598796

## Neural Network classifier

In [24]:
nn_classifier = MLPClassifier()
nn_classifier.fit(X_train, Y_train['salary-class'])

MLPClassifier()

In [25]:
nn_classifier.score(X_test, Y_test)

0.7919660954486825

## Boosted Decision Trees classifier

In [31]:
clf = xgb.XGBClassifier()
clf.fit(X_train, Y_train['salary-class'])





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [32]:
clf.score(X_test, Y_test)

0.8721822983846201