# Model Development

In [10]:
#set up needed library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

#model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

#read in the csv data created in 00_LoadData.py
# set the ? as NA values
df = pd.read_csv("fullrecords.csv", na_values = "?")

In [2]:
# drop the id columns that were used to join with the other tables
df.drop(['workclass_id','education_level_id','marital_status_id','occupation_id',
         'relationship_id','race_id','sex_id','country_id'], axis = 1, inplace = True)

df.columns

Index(['id', 'age', 'education_num', 'capital_gain', 'capital_loss',
       'hours_week', 'over_50k', 'workclass', 'educlevel', 'maritalstatus',
       'occup', 'race', 'sex', 'country'],
      dtype='object')

### Recodes/Imputation

In [3]:
#based on the results of EDA we will need to impute the missing values and will go with the most frequent strategy
imp = SimpleImputer(missing_values = np.nan, strategy='most_frequent')

imp_df = pd.DataFrame(imp.fit_transform(df), columns = df.columns, index = df.index)

# check missing values after imputation
imp_df.isnull().sum()

id               0
age              0
education_num    0
capital_gain     0
capital_loss     0
hours_week       0
over_50k         0
workclass        0
educlevel        0
maritalstatus    0
occup            0
race             0
sex              0
country          0
dtype: int64

In [4]:
#implement some collapsing of educlevel, marital status, and workclass
#set up the new mappings
work = {
    'State-gov' : 'gov', 'Federal-gov' : 'gov', 'Local-gov' : 'gov',
    'Self-emp-not-inc' : 'self', 'Self-emp-inc' : 'self',
    'Private' : 'priv',
    'Without-pay' : 'nopay',
    'Never-worked' : 'never'
}
educ = {
    'Bachelors' : 'bach',
    'HS-grad': 'hs',
    '11th' : 'lths', '9th' : 'lths', '7th-8th': 'lths','5th-6th':'lths','10th':'lths','1st-4th':'lths','Preschool':'lths','12th':'lths',
    'Masters' : 'grad', 'Doctorate':'grad','Prof-school':'grad',
    'Some-college':'somecoll', 'Assoc-acdm':'somecoll','Assoc-voc':'somecoll'
    
}
mar = {
    'Never-married': 'single',
    'Married-civ-spouse': 'married', 'Married-spouse-absent':'married','Married-AF-spouse':'married',
    'Divorced': 'Div_sep_wid', 'Separated':'Div_sep_wid','Widowed':'Div_sep_wid'
}

#apply the mappings
imp_df['col_work'] = imp_df['workclass'].replace(work)
imp_df['col_educ'] = imp_df['educlevel'].replace(educ)
imp_df['col_mar'] = imp_df['maritalstatus'].replace(mar)

#check collapsings
collist = ['col_work','col_educ','col_mar']
for c in collist:
    display(imp_df[c].value_counts())

priv     36705
gov       6549
self      5557
nopay       21
never       10
Name: col_work, dtype: int64

hs          15784
somecoll    14540
bach         8025
lths         6408
grad         4085
Name: col_educ, dtype: int64

married        23044
single         16117
Div_sep_wid     9681
Name: col_mar, dtype: int64

In [5]:
#Now create dummy variables for all of our categorical version
fin_df = pd.get_dummies(imp_df, columns = ['occup','race','sex','col_work','col_mar'])

fin_df.columns

Index(['id', 'age', 'education_num', 'capital_gain', 'capital_loss',
       'hours_week', 'over_50k', 'workclass', 'educlevel', 'maritalstatus',
       'country', 'col_educ', 'occup_Adm-clerical', 'occup_Armed-Forces',
       'occup_Craft-repair', 'occup_Exec-managerial', 'occup_Farming-fishing',
       'occup_Handlers-cleaners', 'occup_Machine-op-inspct',
       'occup_Other-service', 'occup_Priv-house-serv', 'occup_Prof-specialty',
       'occup_Protective-serv', 'occup_Sales', 'occup_Tech-support',
       'occup_Transport-moving', 'race_Amer-Indian-Eskimo',
       'race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White',
       'sex_Female', 'sex_Male', 'col_work_gov', 'col_work_never',
       'col_work_nopay', 'col_work_priv', 'col_work_self',
       'col_mar_Div_sep_wid', 'col_mar_married', 'col_mar_single'],
      dtype='object')

### Create train/validation/test split (70/20/10) 

In [6]:
#split the target and predictor variables into X and Y
# drop 'capital_gain','capital_loss', and 'country' based on EDA work
# keep `education_num` and drop other education variables given how closely they are related
X = fin_df[['age', 'education_num', 
       'hours_week', 'occup_Adm-clerical', 'occup_Armed-Forces',
       'occup_Craft-repair', 'occup_Exec-managerial', 'occup_Farming-fishing',
       'occup_Handlers-cleaners', 'occup_Machine-op-inspct',
       'occup_Other-service', 'occup_Priv-house-serv', 'occup_Prof-specialty',
       'occup_Protective-serv', 'occup_Sales', 'occup_Tech-support',
       'occup_Transport-moving', 'race_Amer-Indian-Eskimo',
       'race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White',
       'sex_Female', 'sex_Male', 'col_work_gov', 'col_work_never',
       'col_work_nopay', 'col_work_priv', 'col_work_self',
       'col_mar_Div_sep_wid', 'col_mar_married', 'col_mar_single']]
y = fin_df[['over_50k']]

# First split off 10 percent for the train data and 90 for test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

#further split the test data to be 80 train and 20 validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=(.2/.9), random_state=1) # 0.25 x 0.8 = 0.2

In [7]:
print('Train size: ', X_train.shape)
print('Val. size: ', X_val.shape)
print('Test size: ', X_test.shape)

Train size:  (34188, 32)
Val. size:  (9769, 32)
Test size:  (4885, 32)


### Model Development

In [8]:
# We will try the following model types: Logistic Regression, KNeighbors, LinearSVC, DecisionTreeClassifier, 
# AdaBoost, and RandomForest

# set up models with defaults
models = []
models.append(('lr', LogisticRegression()))
models.append(('knn', KNeighborsClassifier()))
models.append(('svm', LinearSVC()))
models.append(('clf',  DecisionTreeClassifier()))
models.append(('abd', AdaBoostClassifier()))
models.append(('rf', RandomForestClassifier()))

#convert to arrays and make sure target is the right data type
X_train = X_train.values
y_train = y_train.values
y_train = y_train.astype('int')

X_val = X_val.values
y_val = y_val.values
y_val = y_val.astype('int')

# set up way to save the results
results_train = []
results_val = []
names = []
#iterate through model types fitting the models and collecting scores
for name, model in models:
    model.fit(X_train, y_train)
    names.append(name)
    results_train.append(model.score(X_train, y_train))
    results_val.append(model.score(X_val, y_val))
    msg = "%s: Train - %f; Validation - %f" % (name, model.score(X_train, y_train), model.score(X_val, y_val))
    print(msg)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return self._fit(X, y)


lr: Train - 0.826460; Validation - 0.834476
knn: Train - 0.862145; Validation - 0.806531


  y = column_or_1d(y, warn=True)


svm: Train - 0.759506; Validation - 0.763026
clf: Train - 0.957236; Validation - 0.788822


  y = column_or_1d(y, warn=True)


abd: Train - 0.831929; Validation - 0.838980




rf: Train - 0.957236; Validation - 0.810318


The Decision tree and random forest classifier both overfit the train set, which can be see from the difference between the train and test scores.  The knn classifier also suffers from overfitting. Between the remaining three models both the AdaBoost classifier and logistic regression outperformed the LinearSVC. The results across the two was close but we'll go forward with the logistic regression since it is a simpler model and the coefficients could provide important insights.

## Score on the test data

In [17]:
# convert to arrays
X_test = X_test.values
y_test = y_test.values
y_test = y_test.astype('int')

lr = LogisticRegression()
lr.fit(X_train, y_train)

print("Test score: ", lr.score(X_test,y_test))

  y = column_or_1d(y, warn=True)


Test score:  0.8294779938587513


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
# also print the coefficients for write-up
coef_table = pd.DataFrame(list(X.columns)).copy()
coef_table.insert(len(coef_table.columns),"Coefs",lr.coef_.transpose())
coef_table

Unnamed: 0,0,Coefs
0,age,0.026768
1,education_num,0.321589
2,hours_week,0.031274
3,occup_Adm-clerical,-0.140689
4,occup_Armed-Forces,0.004667
5,occup_Craft-repair,-0.142995
6,occup_Exec-managerial,0.577172
7,occup_Farming-fishing,-1.119693
8,occup_Handlers-cleaners,-0.84915
9,occup_Machine-op-inspct,-0.611708
