In [47]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
#allows all columns to be scrolled
pd.set_option('display.max_columns', None)

# Source:https://stackoverflow.com/questions/47022070/display-all-dataframe-columns-in-a-jupyter-python-notebook/47022213 

## Read in data


In [3]:
income = pd.read_csv('./data/train_cleaned.csv')
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,wage
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,Cuba,0


#### Columns to run
`Age`, `workclass`, `occupation`, `edu_num`, `hours per week`

# Feature Engineering

### Target Encoding

In [4]:
# target encode the following columns
encode_columns = ['workclass', 'education', 'marital-status', 'occupation', 'native-country']

for col in encode_columns:
    encoder = TargetEncoder()
    income[f'{col}_encode'] = encoder.fit_transform(income[col], income['wage'])

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


### Binarize `sex`

In [5]:
# Binarize the sex column
income['sex'] = np.where(income['sex']=='Male', 1, 0)

## Polynomial Features

### Sample Weight column

In [6]:
# Creating a sample weight column
income['smpl_wgt'] = income['fnlwgt'].apply(lambda x: x / income['fnlwgt'].sum())

In [7]:
features = ['age', 'hours-per-week', 'education-num']

X = income[features]                              
poly = PolynomialFeatures(include_bias=False)                      
X_poly = poly.fit_transform(X)       

poly_features = pd.DataFrame(X_poly, columns=poly.get_feature_names(features))
poly_features.head(3)

Unnamed: 0,age,hours-per-week,education-num,age^2,age hours-per-week,age education-num,hours-per-week^2,hours-per-week education-num,education-num^2
0,39.0,40.0,13.0,1521.0,1560.0,507.0,1600.0,520.0,169.0
1,50.0,13.0,13.0,2500.0,650.0,650.0,169.0,169.0,169.0
2,38.0,40.0,9.0,1444.0,1520.0,342.0,1600.0,360.0,81.0


In [8]:
#join
income = income.join(poly_features.drop(columns = ['age', 'hours-per-week', 'education-num'], axis = 1), how = 'left')

In [12]:
income_final = income._get_numeric_data()

In [13]:
# Log age
income_final['log_age'] = np.log(income_final['age'])

In [14]:
income_final.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,wage,workclass_encode,education_encode,marital-status_encode,occupation_encode,native-country_encode,smpl_wgt,age^2,age hours-per-week,age education-num,hours-per-week^2,hours-per-week education-num,education-num^2,log_age
0,39,77516,13,1,2174,0,40,0,0.271957,0.414753,0.045961,0.134483,0.245835,1.3e-05,1521.0,1560.0,507.0,1600.0,520.0,169.0,3.663562
1,50,83311,13,1,0,0,13,0,0.284927,0.414753,0.446848,0.484014,0.245835,1.3e-05,2500.0,650.0,650.0,169.0,169.0,169.0,3.912023
2,38,215646,9,1,0,0,40,0,0.218673,0.159509,0.104209,0.062774,0.245835,3.5e-05,1444.0,1520.0,342.0,1600.0,360.0,81.0,3.637586
3,53,234721,7,1,0,0,40,0,0.218673,0.051064,0.446848,0.062774,0.245835,3.8e-05,2809.0,2120.0,371.0,1600.0,280.0,49.0,3.970292
4,28,338409,13,0,0,0,40,0,0.218673,0.414753,0.446848,0.449034,0.263158,5.5e-05,784.0,1120.0,364.0,1600.0,520.0,169.0,3.332205


In [22]:
X = income_final.drop(columns = ['wage'])
y= income_final['wage']

In [24]:
def model_score_classification(X, y, models: list):
    # Split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
    # Creating empty df to add to later
    models_df = pd.DataFrame(columns=['model', 
                                      'parameters', 
                                      'train_accuracy',
                                      'train_f1',
                                      'train_spec',
                                      'train_sens',
                                      'test_accuracy',
                                      'test_f1',
                                      'test_spec',
                                      'test_sens'])
    for model in models:
        # Create a pipeline to scale data and pass through model
        pipe = Pipeline([
            ('sc', StandardScaler()),
            ('model', model) # Thanks Lisa Tagliaferri from Digitalocean.com https://www.digitalocean.com/community/tutorials/how-to-use-args-and-kwargs-in-python-3
        ])
        # Fitting the model
        pipe.fit(X_train, y_train)
        # Predictions
        y_train_preds = pipe.predict(X_train)
        y_test_preds = pipe.predict(X_test)
        # Scoring the models
        train_score = pipe.score(X_train, y_train)
        train_f1 = f1_score(y_train, y_train_preds)
        test_score = pipe.score(X_test, y_test)
        test_f1 = f1_score(y_test, y_test_preds)
        # Calculate train specificity and sensitivity
        tn, fn, fp, tp = confusion_matrix(y_train, pipe.predict(X_train)).ravel()
        train_spec = tn / (tn + fp)
        train_sens = tp / (tp + fn)
        # Calculate test specificity and sensitivity
        tn, fn, fp, tp = confusion_matrix(y_test, pipe.predict(X_test)).ravel()
        test_spec = tn / (tn + fp)
        test_sens = tp / (tp + fn)
        # Returning a dictionary of the information
        model_row = {'model' : type(model).__name__, # Thanks Jonathan from Stack Overflow! https://stackoverflow.com/questions/52763325/how-to-obtain-only-the-name-of-a-models-object-in-scikitlearn
                     'parameters' : model.get_params(),
                     'train_accuracy' : train_score,
                     'train_f1' : train_f1,
                     'train_spec' : train_spec,
                     'train_sens' : train_sens,
                     'test_accuracy': test_score,
                     'test_f1': test_f1,
                     'test_spec' : test_spec,
                     'test_sens' : test_sens}
        # Add new row to models_df
        models_df = models_df.append(model_row, ignore_index=True)
    return models_df

In [25]:
classification_models = [LogisticRegression(n_jobs=12),
                        KNeighborsClassifier(n_neighbors=round(np.sqrt(income_final.shape[0])), n_jobs=12),
                        DecisionTreeClassifier(max_depth=6),
                        BaggingClassifier(base_estimator = DecisionTreeClassifier(max_depth=6), n_estimators=100, n_jobs=12),
                        RandomForestClassifier(max_depth=6, n_estimators=250, n_jobs=12, random_state=42),
                        AdaBoostClassifier(n_estimators=250, random_state=42),
                        SVC(C=20, random_state=42)]

In [58]:
model_score_classification(X, y, models=[DecisionTreeClassifier(random_state=42)])

Unnamed: 0,model,parameters,train_accuracy,train_f1,train_spec,train_sens,test_accuracy,test_f1,test_spec,test_sens
0,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.999918,0.99983,0.999892,1.0,0.809974,0.611599,0.878719,0.602076


In [59]:
model_score_classification(X, y, models=[RandomForestClassifier(random_state=42)])

Unnamed: 0,model,parameters,train_accuracy,train_f1,train_spec,train_sens,test_accuracy,test_f1,test_spec,test_sens
0,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999918,0.99983,0.999892,1.0,0.854686,0.677744,0.888647,0.72706


In [95]:
lr_pipe = Pipeline([ 
    ('sc', StandardScaler()),
    ('lr' , LogisticRegression(solver = 'liblinear')),  
])
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.816953316953317
0.8196781722147156


In [88]:
lr_pipe = Pipeline([ 
    ('sc', StandardScaler()),
    ('lr', LogisticRegression(solver = 'liblinear')),
    'n_jobs'])

params = {
    'lr__alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    ''
}
# gs_nb = GridSearchCV(pipe_nb, 
#                   param_grid=params_nb, 
#                   cv = 5) # 5-fold cross-validation.

gridcv = GridSearchCV(lr_pipe,
                      param_grid = params,
                    cv = 3,
                    verbose = 1)

In [90]:
gridcv = GridSearchCV(lr_pipe,
                      param_grid = params,
                    cv = 3,
                    verbose = 1)

In [91]:
lr = LogisticRegression(solver = 'liblinear', n_jobs = 2)

In [71]:
# lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
# lr.score(X_train, y_train)

0.816953316953317

In [46]:
# lr.score(X_test, y_test)

0.8196781722147156

In [49]:
print(f'Logistic Regression predicted values: {lr.predict(X_train.head())}')

Logistic Regression predicted values: [0 0 0 0 0]


## grid search

In [52]:
pipe = Pipeline([
    ('lr', LogisticRegression(solver = 'liblinear'))
    ])

In [54]:
pipe_params = {
    'cv__max_features': [20],
    'cv__min_df': [1, 2],
    'cv__max_df': [.9, .95],
    'cv__ngram_range': [(1,1), (1,2)]
}

In [55]:
gridsearch= GridSearchCV(pipe,
                        param_grid = pipe_params, cv = 5)

In [None]:
pipe = Pipeline([
    ('lr', LogisticRegression(solver = 'liblinear'))
    ])

In [64]:
dt = DecisionTreeClassifier(max_depth = 5,
                            min_samples_split = 5,
                            min_samples_leaf = 3,
                            random_state = 42)

In [65]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [66]:
print(f'Score on training set: {dt.score(X_train, y_train)}')
print(f'Score on testing set: {dt.score(X_test, y_test)}')

Score on training set: 0.8527436527436527
Score on testing set: 0.8525979609384596


In [78]:
gridcv = GridSearchCV(estimator = DecisionTreeClassifier(),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1)

In [72]:
gridcv.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [3, 5, 7, 10],
                         

In [73]:
gridcv.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=7, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [74]:
gridcv.best_params_

{'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 20}

In [75]:
# Instantiate model with best parameters.
dt = DecisionTreeClassifier(max_depth = 7,
                            min_samples_split = 20,
                            min_samples_leaf = 4,
                            random_state = 42)

# Fit model.
dt.fit(X_train, y_train)

# Evaluate model.
print(f'Score on training set: {dt.score(X_train, y_train)}')
print(f'Score on testing set: {dt.score(X_test, y_test)}')

Score on training set: 0.8613022113022113
Score on testing set: 0.8593538877287803


In [79]:
# lc = lasso_coef.iloc[lasso_coef.coef.abs().argsort()][::-1]
# plt.figure(figsize = (10, 10))
# sns.barplot(x = 'coef', y = 'column', data = lc, orient = 'h')
# plt.xlabel('Lasso Coefficients', size = 13)
# plt.ylabel('Columns', size = 13);