In [1]:
import numpy as np
import pandas as pd
import pandas_profiling
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
colnames = [
    "age"
    ,"workclass"
    ,"fnlwgt"
    ,"education"
    ,"education-num"
    ,"marital-status"
    ,"occupation"
    ,"relationship"
    ,"race"
    ,"sex"
    ,"capital-gain"
    ,"capital-loss"
    ,"hours-per-week"
    ,"native-country"
    ,"income"
]
df = pd.read_csv("adult.data", names=colnames)

In [3]:
print(df.shape)
df.head()

(32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.isnull().sum()
# missing is just recorded as "?"

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [5]:
df["income_num"] = [0.0 if y==" <=50K" else 1.0 for y in df["income"]]

In [6]:
print(df["income_num"].value_counts())
print(df["income"].value_counts())
df.head()

0.0    24720
1.0     7841
Name: income_num, dtype: int64
 <=50K    24720
 >50K      7841
Name: income, dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,income_num
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0.0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0.0


In [49]:
prof = df.profile_report()
prof.to_file(output_file="profile.html")

In [7]:
# to numeric features

input_features = [
    'age'
    ,'workclass'
    ,'fnlwgt'
    ,'education'
    ,'education-num'
    ,'marital-status'
    ,'occupation'
    ,'relationship'
    ,'race'
    ,'sex'
    ,'capital-gain'
    ,'capital-loss'
    ,'hours-per-week'
    ,'native-country'
]


categorical_features = [
    "workclass"
    ,"education"
    ,"marital-status"
    ,"occupation"
    ,"relationship"
    ,"race"
    ,"sex"
    ,"native-country"
]

X1 = df[categorical_features]

onehot = OneHotEncoder()
X1_trans = onehot.fit_transform(X1)


numeric_features = [i for i in input_features if i not in categorical_features]
X2 = df[numeric_features].values

X = np.concatenate((X1_trans.todense(), X2), axis=1)


output_variable = "income"
Y = df[output_variable].values

print(X.shape, Y.shape)

(32561, 108) (32561,)


In [8]:
seed = 87
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.1, random_state=seed)
print(X_tr.shape, Y_tr.shape, X_te.shape, Y_te.shape)

(29304, 108) (29304,) (3257, 108) (3257,)


In [9]:
X_tr

matrix([[ 0.,  0.,  0., ...,  0.,  0., 48.],
        [ 1.,  0.,  0., ...,  0.,  0., 40.],
        [ 0.,  0.,  0., ...,  0.,  0., 80.],
        ...,
        [ 0.,  0.,  0., ...,  0.,  0., 40.],
        [ 0.,  0.,  0., ...,  0.,  0., 60.],
        [ 0.,  0.,  0., ...,  0.,  0., 40.]])

In [10]:
# logistic regression
model_logit = LogisticRegression(penalty="l2", C=1.0, solver="lbfgs")
params_logit = {
    "penalty": ("l2", "l1")
    ,"C": (0.0001, 0.001, 0.01, 0.1, 1.0, 10.0)
}

gs_logit = GridSearchCV(model_logit, params_logit, cv=3, scoring="accuracy")
gs_logit.fit(X_tr, Y_tr)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (0.0001, 0.001, 0.01, 0.1, 1.0, 10.0),
                         'penalty': ('l2', 'l1')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [11]:
gs_logit.best_score_, gs_logit.best_params_

(0.851931476931477, {'C': 1.0, 'penalty': 'l1'})

In [12]:
accuracy_score(Y_te, gs_logit.predict(X_te))

0.8526251151366288

In [13]:
# support vector machine, ommitted due to time

# model_svm = SVC()

# params_svm = {
#     "C": (0.0001, 0.001, 0.01, 0.1, 1.0, 10.0)
#     ,"gamma": ("scale",)
# }

# gs_svm = GridSearchCV(model_svm, params_svm, cv=3, scoring="accuracy", verbose=2, njobs=3)
# gs_svm.fit(X_tr, Y_tr)

In [14]:
# model_svm.fit(X_tr, Y_tr)

In [31]:
# make an API to predict from data frame

def predict_from_df(df, model):
    X1 = df[categorical_features]
    X1_trans = onehot.transform(X1)
    numeric_features = [i for i in input_features if i not in categorical_features]
    X2 = df[numeric_features].values

    X = np.concatenate((X1_trans.todense(), X2), axis=1)
    pred = model.predict(X)
    prob = model.predict_proba(X)
    return pred, prob

In [32]:
pred, prob = predict_from_df(df, gs_logit.best_estimator_)

In [33]:
pd.crosstab(pred, df["income"])

income,<=50K,>50K
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,23021,3080
>50K,1699,4761


In [39]:
# simple error analysis
flag = [r for r, (i, p) in enumerate(zip(df["income"], pred)) if i != p]

df["prediction"] = pred

In [40]:
df.iloc[flag]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,income_num,prediction
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0.0,>50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,0.0,>50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,1.0,<=50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K,1.0,<=50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K,1.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32536,34,Private,160216,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,55,United-States,>50K,1.0,<=50K
32545,39,Local-gov,111499,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K,1.0,<=50K
32548,65,Self-emp-not-inc,99359,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,1086,0,60,United-States,<=50K,0.0,>50K
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,0.0,>50K
