In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, f1_score

In [2]:
# build a data frame
data_url = 'https://raw.githubusercontent.com/hcimwtc/ML2022/main/adult.data'
column_headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                  'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                  'hours-per-week', 'native-country', 'income']
df_adult_original = pd.read_csv(data_url, header = None, names = column_headers, skipinitialspace = True)

# INSPECTION STEP

This step is to check the detail of the dataset, try to understand each attributes' meaning and the relationship between each other and the target before moving on to pre-processing.

In [3]:
# make a copy of data frame for pre-processing procedure (to prevent contamination)
df_adult = df_adult_original.copy()

In [4]:
# inspect numeric data
df_adult.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
# inspect categorical data
for col in df_adult:
    if df_adult[col].dtype == object:
        print('\"', col, '\"\n', df_adult[col].value_counts(), '\n', sep='')

"workclass"
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

"education"
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

"marital-status"
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: marital-status, dtype: int64

"occupation"
Prof-specialty       4140
Craft-repair       

In [6]:
# evaluate the correlation between columns
df_adult.apply(lambda x: x.factorize()[0]).corr(method='kendall')

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
age,1.0,0.031905,0.046386,0.027262,0.027262,0.003355,0.032114,-0.005663,-0.010572,0.014617,0.010831,-0.009502,0.050615,-0.014164,-0.041404
workclass,0.031905,1.0,0.010433,-0.000562,-0.000562,0.004525,0.094169,0.025948,0.030842,0.049831,0.017783,0.004805,-0.01444,-0.004179,0.02795
fnlwgt,0.046386,0.010433,1.0,0.02456,0.02456,0.010897,0.007057,0.020303,0.038377,0.074331,-0.004823,-0.011364,0.012187,-0.009709,-0.049334
education,0.027262,-0.000562,0.02456,1.0,1.0,0.004196,0.060021,0.04742,0.014196,0.017529,-0.012832,-0.009978,0.032495,0.057953,-0.064145
education-num,0.027262,-0.000562,0.02456,1.0,1.0,0.004196,0.060021,0.04742,0.014196,0.017529,-0.012832,-0.009978,0.032495,0.057953,-0.064145
marital-status,0.003355,0.004525,0.010897,0.004196,0.004196,1.0,0.000352,-0.045543,-0.014501,0.051657,0.051529,0.023584,-0.025133,0.000561,0.119769
occupation,0.032114,0.094169,0.007057,0.060021,0.060021,0.000352,1.0,-0.01087,0.002759,-0.164242,-0.027716,-0.017522,0.00379,0.016443,-0.088427
relationship,-0.005663,0.025948,0.020303,0.04742,0.04742,-0.045543,-0.01087,1.0,0.089748,0.173256,-0.030383,-0.028163,0.018878,0.036246,-0.072213
race,-0.010572,0.030842,0.038377,0.014196,0.014196,-0.014501,0.002759,0.089748,1.0,0.097463,-0.027429,-0.018604,-0.050235,0.210846,-0.081429
sex,0.014617,0.049831,0.074331,0.017529,0.017529,0.051657,-0.164242,0.173256,0.097463,1.0,-0.062992,-0.039678,0.045239,-0.005641,-0.21598


In [7]:
# rank the correlation of columns in respect to the target ('income')
df_corr = df_adult.apply(lambda x: x.factorize()[0]).corr(method='kendall')
df_corr['income'].abs().sort_values()

hours-per-week    0.011614
workclass         0.027950
native-country    0.033885
age               0.041404
fnlwgt            0.049334
education         0.064145
education-num     0.064145
relationship      0.072213
race              0.081429
occupation        0.088427
marital-status    0.119769
capital-loss      0.133986
sex               0.215980
capital-gain      0.259145
income            1.000000
Name: income, dtype: float64

In [8]:
# inspect NAN values in data
df_adult.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

# '''DATA PRE-PROCESSING'''

Since this is a relatively huge dataset with 11 attributes and over 30 thousands of instances, it is important to do pre-processing to increase the accuracy for later. 

First I have dropped irrelevant columns such as 'fnlwgt', 'education', 'capital-gain', 'capital-loss', "hour-per-week', 'relationship' and 'native-country' because they do not have a direct influence towards the target prediction. 

- According to **.describe()** in the previous step, 
  - captial-gain and captial-loss value is 0, which shown none useful information to the income and hence are dropped. 


- As for the workclass, since the data size various, I have divided and group them into 4 groups.

   1) Private | 2) Government | 3) Self-Employment (including "?") 
without-pay and umemployed are dropped because they provide no insight to the prediction. 


- For the Age group, social norm suggests the normal age range for retirement is around 50~60. 

    - Hence, I categorised age group below 28 as society freshmen.

    48 as almost-retire.  

    48 above would be categorised as retirment age range. 

- Marital Status is anothet group that have various instances 
 - so, they will be grouped as "Married"
 - Divorced, Widowed, Seperated as "Separated" 
 - "Never-Married" as one group Signle. 


- Race is kept instead of native-country because the correlation result is better and thus, is divided into two large group since the instances vary and imbalanced. 
  - Race : White & Non-White 


In [9]:
# drop columns that are duplicate, irrelevant, or made up of unbalanced data
df_adult.drop(columns = ['fnlwgt', 'education', 'relationship', 'capital-gain', 'capital-loss',
                         'native-country', 'hours-per-week'], inplace=True)
# drop row that are irrelevant to target
df_adult = df_adult[df_adult.workclass.isin(['Without-pay', 'Never-worked']) == False]

In [10]:
# merge small subclasses of 'workclass'
df_adult.loc[df_adult.workclass.isin(['Local-gov', 'State-gov', 'Federal-gov']), 'workclass'] = 'Govt'
df_adult.loc[df_adult.workclass.isin(['Self-emp-not-inc', 'Self-emp-inc', '?']), 'workclass'] = 'SE/FL'
# merge small subclasses of 'marital-status'
df_adult.loc[df_adult['marital-status'].isin(['Never-married']), 'marital-status'] = 'Single'
df_adult.loc[df_adult['marital-status'].isin(['Divorced', 'Widowed']), 'marital-status'] = 'Separated'
df_adult.loc[df_adult['marital-status'].isin(['Married-civ-spouse', 'Married-spouse-absent', 
                                              'Married-AF-spouse']), 'marital-status'] = 'Married'
# merge small subclasses of 'occupation'
df_adult.loc[df_adult.occupation.isin(['Protective-serv', 'Priv-house-serv']), 'occupation'] = 'Other-service'
df_adult.loc[df_adult.occupation.isin(['?', 'Armed-Forces']), 'occupation'] = 'Others'
# merge small subclasses of 'race'
df_adult.loc[(df_adult['race'] != 'White'), 'race'] = 'Non-White'
# turn 'income' into boolean value
df_adult.loc[(df_adult['income'] == '<=50K'), 'inc>50k'] = False
df_adult.loc[(df_adult['income'] == '>50K'), 'inc>50k'] = True

In [11]:
# inspect the cleaned data again
df_corr = df_adult.apply(lambda x: x.factorize()[0]).corr(method='kendall')
df_corr['inc>50k'].abs().sort_values()

age               0.041246
education-num     0.064099
workclass         0.078326
race              0.085222
occupation        0.095305
marital-status    0.132602
sex               0.216070
income            1.000000
inc>50k           1.000000
Name: inc>50k, dtype: float64

'''TRANSFORMATION'''

In [13]:
# define X, y
y = df_adult['inc>50k'].astype('category')
X = df_adult.drop(columns = ['income', 'inc>50k'])

# sort columns into different categories for encoding
numeric_feats = ['age', 'education-num']
categorical_feats = ['workclass', 'marital-status', 'race', 'sex', 'occupation']

# perform both encoding procedures in one go
cT = make_column_transformer((StandardScaler(), numeric_feats), 
                             (OneHotEncoder(), categorical_feats),)

# define new X for train/test splitting
transformed = cT.fit_transform(X).toarray()
column_names = (numeric_feats + cT.named_transformers_["onehotencoder"].get_feature_names_out().tolist())
X = pd.DataFrame(transformed, columns=column_names)

'''MODEL TRAINING & CROSS VALIDATION'''

'''DECISION TREE'''

In [14]:
# train/test split with KFold (K=2)
kf = KFold(n_splits=2)
DT = DecisionTreeClassifier(random_state=77)
scoreDT = []

# start training and evaluate score
count = 0
for (train_idx, test_idx) in kf.split(X, y):
    count += 1
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    DT.fit(X_train, y_train) 
    scoreDT.append(DT.score(X_test, y_test))
    y_pred = DT.predict(X_test)
    f = f1_score(y_true = y_test, y_pred = y_pred, average = 'weighted')
    print('f1 #', count, ': ', f, sep='')
    
print('Best:', max(scoreDT))
print('Worst:', min(scoreDT))
print('Overall:', np.mean(scoreDT))

f1 #1: 0.781134940462762
f1 #2: 0.7822445232461104
Best: 0.7872157344806392
Worst: 0.7841425937307929
Overall: 0.785679164105716


In [15]:
# score by cross-validation with default function
print('Cross-validation Score:', cross_val_score(DT, X, y, cv=2))

Cross-validation Score: [0.78334358 0.78684696]


'''NEURAL NETWORK'''

In [16]:
# create a definition to train and eval score for NN model
def NN_score(hid, lr, mmt):
    scoreNN = []
    NN = MLPClassifier(hidden_layer_sizes = (hid,), learning_rate_init = lr, momentum = mmt, random_state = 77)
    
    count = 0
    for (train_idx, test_idx) in kf.split(X, y):
        count += 1
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        NN.fit(X_train, y_train) 
        scoreNN.append(NN.score(X_test, y_test))
        y_pred = NN.predict(X_test)
        f = f1_score(y_true = y_test, y_pred = y_pred, average = 'weighted')
        print('f1 #', count, ': ', f, sep='')
        
    print('Best:', max(scoreNN))
    print('Worst:', min(scoreNN))
    print('Overall:', np.mean(scoreNN))

In [17]:
# train NN with hidden-layer = 2, learning-rate = 0.1, momentum = 0.9
NN_score(2, 0.01, 0.9)

f1 #1: 0.8155326503062774
f1 #2: 0.8174442345424123
Best: 0.8266133988936694
Worst: 0.8242163491087892
Overall: 0.8254148740012293


In [18]:
#Adjusting the parameter with hidden-layer = 3, learning-rate = 0.1, momentum = 0.9
NN_score(3, 0.01, 0.9)

f1 #1: 0.8204638236545304
f1 #2: 0.8257865294236952
Best: 0.8325138291333744
Worst: 0.8290719114935464
Overall: 0.8307928703134604


In [19]:
##Adjusting the parameter with hidden-layer = 3, learning-rate = 0.007, momentum = 0.9
NN_score(3, 0.007, 0.9)

f1 #1: 0.8242585438912037
f1 #2: 0.8251983127371815
Best: 0.8330669944683466
Worst: 0.8294406883835279
Overall: 0.8312538414259373


# Conclusion

Based on the result, Decision Tree has a overall score of 0.78 whilst ANN has an overall accuracy of 0.82, adjusting the parameter slightly increase the accuracy rate. Thus, even if Decision Tree has a simplier algorithm, we can see that with a dataset contains over 30 thousand values, ANN has better result than Decision Tree.