In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Data Preprocessing

In [2]:
# data source: http://archive.ics.uci.edu/ml/datasets/Adult

dfTrain = pd.read_csv('data/adult.data',header=None, sep=',')
dfTest = pd.read_csv('data/adult.test', header=None, sep=',', skiprows=[0])

# assign column names
dfTrain.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
                   'relationship','race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                   'income']
dfTest.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
                  'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                  'income']

print(dfTrain.shape)
print(dfTest.shape)

(32561, 15)
(16281, 15)


In [3]:
# Clean data
#Remove "." from income
dfTrain["income"] = dfTrain["income"].str.replace(".","",regex=True)
dfTest["income"] = dfTest["income"].str.replace(".","",regex=True)

# Remove question mark
dfTrain = dfTrain[(dfTrain.values !=' ?').all(axis=1)]
dfTest = dfTest[(dfTest.values !=' ?').all(axis=1)]

print(dfTrain.shape)
print(dfTest.shape)

(30162, 15)
(15060, 15)


In [4]:
dfTrain.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
dfTrain.describe().round(3)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.438,189793.834,10.121,1092.008,88.372,40.931
std,13.135,105652.972,2.55,7406.346,404.298,11.98
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117627.25,9.0,0.0,0.0,40.0
50%,37.0,178425.0,10.0,0.0,0.0,40.0
75%,47.0,237628.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
# Normalize numerical attributes
features = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']

scaler = preprocessing.StandardScaler().fit(dfTrain[features])
dfTrain[features] = scaler.transform(dfTrain[features])

dfTest[features] = scaler.transform(dfTest[features])

display(dfTrain.describe().round(3))

# Discretize continous attributes based on their mean value
def numericalBinary(dataset, features):
    dataset[features] = np.where(dataset[features] >= dataset[features].mean(), 1,0)

numericalBinary(dfTrain,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])
numericalBinary(dfTest,['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week'])

display(dfTrain.describe().round(3))

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,0.0,0.0,-0.0,-0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.632,-1.666,-3.577,-0.147,-0.219,-3.333
25%,-0.795,-0.683,-0.44,-0.147,-0.219,-0.078
50%,-0.109,-0.108,-0.048,-0.147,-0.219,-0.078
75%,0.652,0.453,1.129,-0.147,-0.219,0.34
max,3.926,12.256,2.305,13.355,10.556,4.847


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,0.463,0.438,0.328,0.082,0.047,0.305
std,0.499,0.496,0.47,0.274,0.212,0.46
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Discretize categorical attributes
# for col in ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']:
#     unique_vals = pd.unique(dfTrain[col])
#     dfTrain[col].replace(to_replace=unique_vals, value=np.arange(len(unique_vals)), inplace=True)
#     dfTest[col].replace(to_replace=unique_vals, value=np.arange(len(unique_vals)), inplace=True)

# print(dfTrain.shape)
# print(dfTest.shape)

# # Encoder method 1: use one-hot encoder
def oneHotBind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[feature_to_encode])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop(feature_to_encode, axis=1)
    return(res)

dfTrain = oneHotBind(dfTrain,['workclass','education','marital-status','occupation',
                              'relationship','race','sex','native-country'])
dfTest  = oneHotBind(dfTest, ['workclass','education','marital-status','occupation',
                              'relationship','race','sex','native-country'])

# Add missing attributes
for attributes in dfTrain.keys():
    if attributes not in dfTest.keys():
        print("Adding missing feature {}".format(attributes))
        dfTest[attributes] = 0
        
print(dfTrain.shape)
print(dfTest.shape)

# Train and test set for Neural Network
dfTrain_NN = dfTrain.copy(deep=True)
dfTest_NN = dfTest.copy(deep=True)

Adding missing feature native-country_ Holand-Netherlands
(30162, 105)
(15060, 105)


In [8]:
def encode_income(dataset):
    le = preprocessing.LabelEncoder()
    le = le.fit(dataset['income'])
    dataset['income'] = le.transform(dataset['income'])

# Convert income to binary
encode_income(dfTrain)
encode_income(dfTest)
dfTrain.head()

# Train and test sets
X_train = dfTrain.loc[:,dfTrain.columns !='income'].values
Y_train = dfTrain['income'].values
X_test = dfTest.loc[:,dfTest.columns !='income'].values
Y_test = dfTest['income'].values

# Machine Learning: Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

linr_clf = LogisticRegression(penalty='l2', solver='liblinear')
linr_clf.fit(X_train, Y_train)
predictions = linr_clf.predict(X_test)

print("Logistic Regression:")
print("Accuracy: " + str(accuracy_score(Y_test, predictions)))
print("Confusion matrix:\n" + str(confusion_matrix(Y_test, predictions)))
print(classification_report(Y_test, predictions))

Logistic Regression:
Accuracy: 0.8360557768924303
Confusion matrix:
[[10868   492]
 [ 1977  1723]]
              precision    recall  f1-score   support

           0       0.85      0.96      0.90     11360
           1       0.78      0.47      0.58      3700

    accuracy                           0.84     15060
   macro avg       0.81      0.71      0.74     15060
weighted avg       0.83      0.84      0.82     15060



# Machine Learning: SVM

In [12]:
# Support Vector Machine
from sklearn.svm import SVC

svm = SVC(gamma='auto')
svm.fit(X_train, Y_train)
predictions = svm.predict(X_test)

print("SVM model:")
print("Accuracy: " + str(accuracy_score(Y_test, predictions)))
print("Confusion matrix:\n" + str(confusion_matrix(Y_test, predictions)))
print(classification_report(Y_test, predictions))

SVM model:
Accuracy: 0.8391102257636123
Confusion matrix:
[[10696   664]
 [ 1759  1941]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.90     11360
           1       0.75      0.52      0.62      3700

    accuracy                           0.84     15060
   macro avg       0.80      0.73      0.76     15060
weighted avg       0.83      0.84      0.83     15060



# Machine Learning: Neural Network

In [11]:
# Neural Network
from sklearn.neural_network import MLPClassifier

d = X_train.shape[1]
mlp = MLPClassifier(hidden_layer_sizes=(d,d,d,), activation='relu')
mlp.fit(X_train,Y_train)

predictions = mlp.predict(X_test)
print("=======================================================")
print("Neural Network: ")
print("Accuracy: " + str(accuracy_score(Y_test, predictions)))
print("Confusion matrix:\n" + str(confusion_matrix(Y_test, predictions)))
print(classification_report(Y_test,predictions))

Neural Network: 
Accuracy: 0.8110225763612218
Confusion matrix:
[[10239  1121]
 [ 1725  1975]]
              precision    recall  f1-score   support

           0       0.86      0.90      0.88     11360
           1       0.64      0.53      0.58      3700

    accuracy                           0.81     15060
   macro avg       0.75      0.72      0.73     15060
weighted avg       0.80      0.81      0.81     15060

