# Adult Income - Classification Modeling
---
__Date Created:__ 2020-06-20  
__Author:__ Josh Mischung  
__email:__ josh@knoasis.io  

__Dataset:__ [Adult Income, UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Adult)

__Comments on Dataset & Objectives:__  
The objective of this notebook is to build a classification model that can power a user-facing frontend where inputs are provided by the user and the probability of an outcome is predicted.  

The Adult dataset from the UCI Machine Learning Repository was chosen for this task because a layperson will intuitively have enough understanding of the inputs to change them and mentally form their own hypothesis about how the change will impact the probability of the outcome (*adult income >$50K*).

<br>

# Imports & Functions
---

__Imports__

In [1]:
"""
Suppress the FutureWarnings generated by imblearn using
sklearn method locations that will be deprecated.
"""
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [43]:
from collections import Counter
from numpy import mean
from numpy import std
from pandas import read_pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline as skl_pipe
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from category_encoders.woe import WOEEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

set_config(display='diagram')

__Functions__

In [86]:
# Cross-validated model evaluation
def evaluate_model(X, y, model):
    """
    Cross-validate `model` on (X, y) and return the raw results.

    The evaluation uses repeated stratified k-fold (10 splits, 3 repeats,
    fixed random_state), i.e. 30 fits per call, and scores each fit on
    accuracy, precision, recall, f1 and roc_auc.

    Returns the dictionary produced by `cross_validate`: 'fit_time',
    'score_time' and one 'test_<scorer>' array per scorer listed below.

    *Reference sklearn.metrics.SCORERS.keys() for list of available scorers*
    (newer scikit-learn releases expose this as
    sklearn.metrics.get_scorer_names()).
    """
    # Metrics reported for every fold
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    # Fixed seed keeps the fold assignment reproducible between runs
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=143)
    return cross_validate(model, X, y, scoring=metrics, cv=folds, n_jobs=-1)

def split_encode(feature_df, target):
    """
    Split the data into train/test sets and Weight-of-Evidence encode the
    categorical feature columns.

    The split holds out 10% of the rows, stratified on `target`. The encoder
    is fit on the training split only and then applied to both splits.

    Parameters
    ----------
    feature_df : pandas.DataFrame
        Feature columns. Columns with 'category' dtype are WOE-encoded;
        all other columns are passed through unchanged.
    target : array-like
        Target labels aligned with `feature_df`. Used for stratifying the
        split and for fitting the encoder.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The split features (categoricals replaced by their WOE values)
        and the corresponding targets.
    """
    # Split data. Stratify on the `target` argument rather than on a
    # notebook-global `y`, so the function works for any caller.
    X_train, X_test, y_train, y_test = train_test_split(
        feature_df, target, test_size=0.1, stratify=target, random_state=142)

    # Categorical columns to encode (determined once, before encoding)
    cat_cols = X_train.select_dtypes('category').columns

    # Weight of Evidence, fit on the training split only
    woe = WOEEncoder(regularization=1e-8, random_state=143)
    woe.fit(X_train[cat_cols], y_train)

    # Encode X_train categories. Copy first so the caller's frame is not
    # modified; assign by column label so the encoded columns replace the
    # categorical ones instead of being written into them.
    X_train = X_train.copy()
    X_train[cat_cols] = woe.transform(X_train[cat_cols])

    # Encode X_test categories with the encoder fit on the training split
    X_test = X_test.copy()
    X_test[cat_cols] = woe.transform(X_test[cat_cols])

    return X_train, X_test, y_train, y_test

<br>  
# Load Data
---

In [4]:
# Load data
# presumably the pickle written at the end of the EDA notebook — confirm
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
data = 'adult_df_postEDA.pkl'
adult_df = read_pickle(data)
# Preview the first rows
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,workclass_null,occupation_null,native_country_null,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0,0,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0,0,0,0


In [5]:
# Metadata: per-column non-null counts and dtypes, plus memory usage
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 18 columns):
age                    32561 non-null int64
workclass              32561 non-null category
fnlwgt                 32561 non-null int64
education              32561 non-null category
education_num          32561 non-null int64
marital_status         32561 non-null category
occupation             32561 non-null category
relationship           32561 non-null category
race                   32561 non-null category
sex                    32561 non-null category
capital_gain           32561 non-null int64
capital_loss           32561 non-null int64
hours_per_week         32561 non-null int64
native_country         32561 non-null category
workclass_null         32561 non-null int64
occupation_null        32561 non-null int64
native_country_null    32561 non-null int64
income                 32561 non-null category
dtypes: category(9), int64(9)
memory usage: 2.5 MB


<br>  
# Generate Baseline & Process Data
---

__Generate Baseline__

In [89]:
# Features are every column but the last; the last column is the target
X = adult_df.iloc[:, :-1]
y = adult_df.iloc[:, -1]

# Baseline: always predict the most frequent class
model_dummy = DummyClassifier(strategy="most_frequent")

# Cross-validate the baseline and report mean (std) across folds
scores = evaluate_model(X, y, model_dummy)
model = "Baseline Model (Most Frequent)"
print(model, '-' * len(model), sep='\n')
print('Mean Accuracy: %.3f (%.3f)' % (scores['test_accuracy'].mean(), scores['test_accuracy'].std()))
print('Mean ROC-AUC: %.3f (%.3f)' % (scores['test_roc_auc'].mean(), scores['test_roc_auc'].std()))

Baseline Model (Most Frequent)
------------------------------
Mean Accuracy: 0.759 (0.000)
Mean ROC-AUC: 0.500 (0.000)


__Process Data__

In [83]:
# Stratified train/test split (10% held out) with the categorical
# features WOE-encoded by split_encode
X_train, X_test, y_train, y_test = split_encode(X, y)

__Build Pipeline__

In [84]:
# Assemble classifier pipeline: SMOTE resampling, robust scaling,
# then logistic regression
clf = Pipeline(
    steps=[
        ('smote', SMOTE(random_state=143)),
        ('scaler', RobustScaler()),
        ('classifier', LogisticRegression()),
    ]
)

In [85]:
# Cross-validate the pipeline on the training split; the returned dict of
# per-fold timings and scores is displayed as the cell output.
# NOTE(review): split_encode fit the WOE encoder on all of X_train/y_train
# before this cross-validation, so each fold's validation rows contributed
# to the encoding and these scores may be optimistic. Consider moving the
# encoder into the pipeline.
evaluate_model(X_train, y_train, clf)

{'fit_time': array([1.50220299, 1.4539659 , 1.46056604, 1.51839614, 1.56296301,
        1.55356121, 1.52408719, 1.54536986, 1.49326897, 1.41714811,
        1.45127702, 1.457021  , 1.40659118, 1.42462397, 1.40748477,
        1.42048907, 1.48476219, 1.47210598, 1.43554115, 1.50587583,
        1.47035694, 1.4998107 , 1.41197896, 1.41333079, 1.16214466,
        1.08829284, 1.13413668, 1.10118914, 1.07729292, 1.06328511]),
 'score_time': array([0.06064892, 0.040169  , 0.0446887 , 0.05021191, 0.03471208,
        0.03524399, 0.0395329 , 0.03686523, 0.03385091, 0.03364086,
        0.03266597, 0.03118992, 0.03356695, 0.05537605, 0.03763819,
        0.02682424, 0.03733206, 0.03670502, 0.04057384, 0.05016518,
        0.04972315, 0.02951908, 0.03780603, 0.04627514, 0.02554226,
        0.02647924, 0.02437115, 0.01935482, 0.02042627, 0.01817989]),
 'test_accuracy': array([0.82429205, 0.8041624 , 0.81132719, 0.80143296, 0.8109215 ,
        0.80750853, 0.79488055, 0.8       , 0.80136519, 0.81535836,
 

### *Resume at "Evaluate Models"...*