In [1]:
import numpy as np
import pandas as pd
import sklearn
%matplotlib inline

In [2]:
data_df = pd.read_csv('/home/james/anaconda3/data/censusdata.csv',header=None)

Using the Census Income data set from UCI Machine learning repository.
https://archive.ics.uci.edu/ml/datasets/Adult

The data is a mix of various continuous and categorical features to be described in the above link and the goal is predict whether the adult makes over $50,000 annually.



In [3]:
data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Labelling the columns according to their feature.

In [4]:
data_df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                   'occupation', 'relationship', 'race','sex', 'capital_gain', 'capital_loss', 
                   'hours_per_week', 'native_country', 'label']

In [5]:
data_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
categorical_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship',
                       'race', 'sex', 'native_country', 'label']

In [7]:
data_df_dum = pd.get_dummies(data_df, columns = categorical_features, drop_first=1 )

Using the pandas' method get_dummies to do a 'one-hot-encoding' of the categorical features to use in the classification model. 

In [8]:
data_df_dum.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,label_ >50K
0,39,77516,13,2174,0,40,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,50,83311,13,0,0,13,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,38,215646,9,0,0,40,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,53,234721,7,0,0,40,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,28,338409,13,0,0,40,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [10]:
X = data_df_dum.ix[:,:100].as_matrix()
Y = data_df_dum['label_ >50K'].as_matrix()

The authors used a variety of classifiers and ahve their errors posted here:
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names

They did use any ensemble methods, so I will run test the performance of a random forest classifier and a gradient boosted tree algorithm.

In [28]:
clf_1 = GradientBoostingClassifier(learning_rate=0.009, n_estimators=800, subsample=0.95, verbose=1,
                                  max_depth=6)
clf_2 = RandomForestClassifier(n_estimators=500,verbose=0, n_jobs=-1)

The authors didn't give a fixed test set but did mention they used a 2/3, 1/3 split. Will use Monte Carlo CV to estimate the expected error rate of the two classifiers on this test split. So will choose a random split to test.

In [29]:
X_tr, X_ts, y_tr, y_ts = train_test_split(X, Y, test_size=0.33, random_state=3)
scaler.fit(X_tr)
X_s_tr = scaler.transform(X_tr)
X_s_ts = scaler.transform(X_ts)
clf_1.fit(X_s_tr, y_tr)
clf_2.fit(X_s_tr, y_tr)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.0911           0.0092            2.26m
         2           1.0820           0.0083            2.27m
         3           1.0756           0.0079            2.23m
         4           1.0683           0.0082            2.23m
         5           1.0570           0.0076            2.23m
         6           1.0507           0.0073            2.25m
         7           1.0445           0.0070            2.24m
         8           1.0353           0.0075            2.25m
         9           1.0283           0.0065            2.25m
        10           1.0217           0.0068            2.26m
        20           0.9609           0.0057            2.23m
        30           0.9078           0.0048            2.20m
        40           0.8671           0.0033            2.18m
        50           0.8318           0.0032            2.15m
        60           0.8013           0.0025            2.14m
       

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [33]:
algorithms = ['C4.5', 'C4.5-auto', 'C4.5 rules', 'Voted ID3 (0.6)', 'Voted ID3 (0.8)', 
              'T2', '1R', 'NBTree', 'CN2', 'HOODG', 'FSS Naive Bayes', 'IDTM (Decision table)',
              'Naive-Bayes', 'Nearest-neighbor (1)','Nearest-neighbor (3)', 'OC1',
              'Gradient Bosoted Tree', 'Random Forest']
scores = np.array([15.54, 14.46, 14.94, 15.64, 16.47, 16.84, 19.54, 14.10, 16.00, 14.82, 14.05,
                   14.46, 16.12, 21.42, 20.35, 15.04,
                   round(100*(1-accuracy_score(y_ts, clf_1.predict(X_s_ts))),2),
                   round(100*(1-accuracy_score(y_ts, clf_2.predict(X_s_ts))),2)])
results_df = pd.DataFrame(scores)
results_df.columns = ['Error']
results_df.index = algorithms
results_df

Unnamed: 0,Error
C4.5,15.54
C4.5-auto,14.46
C4.5 rules,14.94
Voted ID3 (0.6),15.64
Voted ID3 (0.8),16.47
T2,16.84
1R,19.54
NBTree,14.1
CN2,16.0
HOODG,14.82


Results of previous authors together with new results at bottom of chart.

In [37]:
confusion_grad = pd.DataFrame(confusion_matrix(y_ts, clf_1.predict(X_s_ts)))
confusion_grad.columns = ['Predicted: Under 50K', 'Predicted: Over 50K']
confusion_grad.index = ['Actual: Under 50K', 'Actual: Over 50K']
confusion_grad.columns.name = 'GBC'

In [38]:
confusion_grad

GBC,Predicted: Under 50K,Predicted: Over 50K
Actual: Under 50K,7660,471
Actual: Over 50K,913,1702


In [39]:
confusion_rf = pd.DataFrame(confusion_matrix(y_ts, clf_2.predict(X_s_ts)))
confusion_rf.columns = ['Predicted: Under 50K', 'Predicted: Over 50K']
confusion_rf.index = ['Actual: Under 50K', 'Actual: Over 50K']
confusion_rf.columns.name = 'Random Forest'

In [40]:
confusion_rf

Random Forest,Predicted: Under 50K,Predicted: Over 50K
Actual: Under 50K,7546,585
Actual: Over 50K,972,1643
