# Model Development

In [9]:
#set up needed library imports
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Load the CSV that 00_LoadData.py produced.
# A "?" entry in the file denotes a missing value, so parse it as NaN.
df = pd.read_csv(
    "fullrecords.csv",
    na_values=["?"],
)

In [10]:
# The *_id columns were only used to join with the other tables; they are not
# needed from here on.
join_id_cols = ['workclass_id', 'education_level_id', 'marital_status_id', 'occupation_id',
                'relationship_id', 'race_id', 'sex_id', 'country_id']

# Rebind df instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=join_id_cols, errors='ignore')

df.columns

Index(['id', 'age', 'education_num', 'capital_gain', 'capital_loss',
       'hours_week', 'over_50k', 'workclass', 'educlevel', 'maritalstatus',
       'occup', 'race', 'sex', 'country'],
      dtype='object')

### Recodes/Imputation

In [11]:
# Based on the EDA results the missing values need to be imputed; use the
# most-frequent-value strategy for every column.
# NOTE(review): this also runs over 'id' and the target 'over_50k' -- confirm
# that neither has missing values, otherwise they would be imputed as well.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# fit_transform returns a plain array. Because df mixes numeric and string
# columns that array is object-typed, so a DataFrame built from it would hold
# every column (including age, hours_week, ...) as object dtype. Cast each
# column back to the dtype it had before imputation.
imp_df = pd.DataFrame(
    imp.fit_transform(df),
    columns=df.columns,
    index=df.index,
).astype(df.dtypes.to_dict())

# check missing values after imputation
imp_df.isnull().sum()

id               0
age              0
education_num    0
capital_gain     0
capital_loss     0
hours_week       0
over_50k         0
workclass        0
educlevel        0
maritalstatus    0
occup            0
race             0
sex              0
country          0
dtype: int64

In [12]:
# Collapse the levels of workclass, educlevel and maritalstatus into broader groups.

def _invert_groups(groups):
    """Turn {collapsed label: [original labels]} into {original label: collapsed label}."""
    return {
        original: collapsed
        for collapsed, originals in groups.items()
        for original in originals
    }


# Mapping tables: original category -> collapsed category.
work = _invert_groups({
    'gov': ['State-gov', 'Federal-gov', 'Local-gov'],
    'self': ['Self-emp-not-inc', 'Self-emp-inc'],
    'priv': ['Private'],
    'nopay': ['Without-pay'],
    'never': ['Never-worked'],
})
educ = _invert_groups({
    'bach': ['Bachelors'],
    'hs': ['HS-grad'],
    'lths': ['11th', '9th', '7th-8th', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'],
    'grad': ['Masters', 'Doctorate', 'Prof-school'],
    'somecoll': ['Some-college', 'Assoc-acdm', 'Assoc-voc'],
})
mar = _invert_groups({
    'single': ['Never-married'],
    'married': ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
    'Div_sep_wid': ['Divorced', 'Separated', 'Widowed'],
})

# Write each collapsed variable to a new column; values that have no entry in
# the mapping are left unchanged by replace().
recodes = [
    ('col_work', 'workclass', work),
    ('col_educ', 'educlevel', educ),
    ('col_mar', 'maritalstatus', mar),
]
for target_col, source_col, mapping in recodes:
    imp_df[target_col] = imp_df[source_col].replace(mapping)

# Show the level counts of every collapsed column as a check.
collist = ['col_work', 'col_educ', 'col_mar']
for col in collist:
    display(imp_df[col].value_counts())

priv     36705
gov       6549
self      5557
nopay       21
never       10
Name: col_work, dtype: int64

hs          15784
somecoll    14540
bach         8025
lths         6408
grad         4085
Name: col_educ, dtype: int64

married        23044
single         16117
Div_sep_wid     9681
Name: col_mar, dtype: int64

### Create train/validation/test split (70/20/10) 

In [21]:
# Separate predictors (X) and target (y); 'id' is carried along in both frames.
# 'capital_gain', 'capital_loss' and 'country' are left out based on the EDA work.
predictor_cols = [
    'id', 'age', 'education_num', 'hours_week',
    'workclass', 'educlevel', 'maritalstatus', 'occup', 'race', 'sex',
    'col_work', 'col_educ', 'col_mar',
]
X = imp_df[predictor_cols]
y = imp_df[['id', 'over_50k']]

# Step 1: hold out 10% of all rows as the test set; the other 90% remain.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)

# Step 2: split the remaining 90% into train and validation. Taking 0.2/0.9 of
# it for validation equals 20% of all rows, which leaves 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=(.2/.9), random_state=1
)

In [22]:
# Report the (rows, columns) shape of each partition.
for label, part in (('Train size: ', X_train),
                    ('Val. size: ', X_val),
                    ('Test size: ', X_test)):
    print(label, part.shape)

Train size:  (34188, 13)
Val. size:  (9769, 13)
Test size:  (4885, 13)


### Model Development