# Modeling

Time to model our data. We're going to try a number of different classifiers and find out which ones perform best.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, f1_score

## Reading in the data

In [None]:
# Reading in data with dummies and Lasso CV
# Path is relative to the notebook's working directory.
dummy_income = pd.read_csv('./data/dummy_income.csv')

In [None]:
# Selecting Features
# NOTE(review): `income` was referenced here but never defined anywhere in this
# notebook (NameError on Restart & Run All). The only frame loaded above is
# `dummy_income`, so it is bound to `income` here — confirm that this file holds
# the `wage` target and the columns dropped below.
income = dummy_income

# Every numeric column except the target and the weight / derived-age columns.
features = income._get_numeric_data().columns.drop(['wage', 'smpl_wgt', 'fnlwgt', 'log_age'])
X = income[features]
y = income['wage']

## Our baseline model

In [None]:
# Relative frequency of each target class; the largest share is the accuracy
# of always predicting the majority class (our baseline).
y.value_counts(normalize=True)

In [None]:
def _binary_split_metrics(pipe, X_split, y_split, prefix):
    """Score a fitted pipeline on one data split.

    Returns a dict with accuracy, F1, specificity and sensitivity whose keys
    are prefixed with `prefix` (e.g. 'train' -> 'train_accuracy').
    """
    # Predict once and reuse the predictions for every metric.
    preds = pipe.predict(X_split)

    # For a binary target, sklearn's confusion_matrix has actual classes on the
    # rows and predicted classes on the columns, so ravel() yields
    # tn, fp, fn, tp — in that order.
    tn, fp, fn, tp = confusion_matrix(y_split, preds).ravel()

    return {
        f'{prefix}_accuracy': pipe.score(X_split, y_split),
        f'{prefix}_f1': f1_score(y_split, preds),
        f'{prefix}_spec': tn / (tn + fp),   # true negative rate
        f'{prefix}_sens': tp / (tp + fn),   # true positive rate (recall)
    }


def model_score_classification(X, y, models: list):
    """Fit each classifier inside a scaling pipeline and tabulate its metrics.

    The data is split once (stratified on `y`, random_state=42); every model is
    wrapped in a `StandardScaler -> model` pipeline, fitted on the training
    split and scored on both splits.

    Parameters
    ----------
    X : feature matrix accepted by `train_test_split` / sklearn estimators.
    y : target labels. The metrics assume a binary target
        (NOTE(review): `f1_score` defaults expect the positive class to be 1 — confirm).
    models : list of unfitted sklearn classifiers.

    Returns
    -------
    pandas.DataFrame
        One row per model with its class name, parameters, and
        accuracy / F1 / specificity / sensitivity for train and test.
    """
    columns = ['model',
               'parameters',
               'train_accuracy',
               'train_f1',
               'train_spec',
               'train_sens',
               'test_accuracy',
               'test_f1',
               'test_spec',
               'test_sens']

    # Split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

    # Collect plain dicts and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    rows = []

    for model in models:
        # Create a pipeline to scale data and pass through model
        pipe = Pipeline([
            ('sc', StandardScaler()),
            ('model', model) # Thanks Lisa Tagliaferri from Digitalocean.com https://www.digitalocean.com/community/tutorials/how-to-use-args-and-kwargs-in-python-3
        ])

        # Fitting the model
        pipe.fit(X_train, y_train)

        row = {'model' : type(model).__name__, # Thanks Jonathan from Stack Overflow! https://stackoverflow.com/questions/52763325/how-to-obtain-only-the-name-of-a-models-object-in-scikitlearn
               'parameters' : model.get_params()}
        row.update(_binary_split_metrics(pipe, X_train, y_train, 'train'))
        row.update(_binary_split_metrics(pipe, X_test, y_test, 'test'))
        rows.append(row)

        print(f'Done with {model}, moving on')

    # `columns=` fixes the column order and keeps the schema when `models` is empty.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Candidate classifiers scored by model_score_classification.
# The bagging base estimator is passed positionally because the keyword was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases;
# the positional form works with both.
classification_models = [LogisticRegression(n_jobs=12),
                        DecisionTreeClassifier(max_depth=6),
                        BaggingClassifier(DecisionTreeClassifier(max_depth=6),
                                          n_estimators=500,
                                          n_jobs=12),
                        RandomForestClassifier(max_depth=6,
                                               n_estimators=1000,
                                               n_jobs=12,
                                               random_state=42),
                        AdaBoostClassifier(n_estimators=350,
                                           random_state=42),
                        VotingClassifier([
                                        ('logreg', LogisticRegression(n_jobs=12)),
                                        ('dt', DecisionTreeClassifier(max_depth=6)),
                                        ('bc', BaggingClassifier(DecisionTreeClassifier(max_depth=6),
                                                                 n_estimators=500,
                                                                 n_jobs=12)),
                                        ('rfc', RandomForestClassifier(max_depth=6,
                                                                       n_estimators=1000,
                                                                       n_jobs=12,
                                                                       random_state=42)),
                                        ('ab', AdaBoostClassifier(n_estimators=500,
                                                                  random_state=42))]),
                        # C must be strictly positive; C=0 makes SVC.fit raise.
                        # 1.0 is scikit-learn's default regularization strength.
                        SVC(C=1.0, random_state=42)
                        ]

The accuracy of our baseline model is ~76%

In [None]:
# Fit and score every candidate model; returns one summary row per model.
gen_model_df = model_score_classification(X, y, classification_models)

In [None]:
# Display the per-model train/test metrics table.
gen_model_df

Looks like all these models perform fairly well before GridSearching, so let's break these up and use `GridSearchCV` to find the best ones!

In [None]:
# Persist the summary table.
# NOTE(review): unlike the other to_csv calls in this notebook, this one does not
# pass index=False, so the row index is written as an extra first column.
gen_model_df.to_csv('./data/general_models_dummy_summary.csv')

## Gridsearching through `AdaBoostClassifier`, `SVC`, and `GaussianNB`

Thanks Eric Heidbreder!

In [None]:
# Notebook-level train/test split reused by the grid searches below.
# NOTE(review): this split is not stratified, whereas the split inside
# model_score_classification passes stratify=y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# AdaBoost pipeline: standardize the features, then fit the booster.
ab_steps = [('sc', StandardScaler()), ('ab', AdaBoostClassifier())]
pipe_ab = Pipeline(steps=ab_steps)

In [None]:
# SVC pipeline: standardize the features, then fit the support vector classifier.
svc_steps = [('sc', StandardScaler()), ('svc', SVC())]
pipe_svc = Pipeline(steps=svc_steps)

In [None]:
# Gaussian Naive Bayes pipeline: standardize the features, then fit GaussianNB.
gb_steps = [('sc', StandardScaler()), ('gb', GaussianNB())]
pipe_gb = Pipeline(steps=gb_steps)

In [None]:
# Creating GaussianNB params
# Empty grid: nothing is tuned, so the search only cross-validates the
# pipeline with its default settings.
params_gb = {}

In [None]:
# Creating AdaBoost Params
# Keys use the `<step name>__<parameter>` form, targeting the 'ab' step of pipe_ab.
params_ab = {
    'ab__n_estimators' : [2500, 3000],  # two candidate ensemble sizes
    'ab__random_state' : [42]           # single value: fixes the seed rather than tuning it
}

In [None]:
# Creating SVC Params
# Keys target the 'svc' step of pipe_svc; each list has one value, so the
# search evaluates a single configuration.
# NOTE(review): per the scikit-learn docs `degree` only affects kernel='poly',
# and pipe_svc builds SVC() without choosing a kernel — confirm this entry has any effect.
params_svc = {
    'svc__C': [10],
    'svc__degree': [2]
}

In [None]:
# 5-fold grid search over the AdaBoost pipeline, using all available cores.
grid_ab = GridSearchCV(
    estimator=pipe_ab,
    param_grid=params_ab,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

In [None]:
# 5-fold grid search over the SVC pipeline, using all available cores.
grid_svc = GridSearchCV(
    estimator=pipe_svc,
    param_grid=params_svc,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

In [None]:
# 5-fold grid search over the GaussianNB pipeline, scored on accuracy.
gb_search_options = {
    'param_grid': params_gb,
    'cv': 5,
    'verbose': 1,
    'scoring': 'accuracy',
}
grid_gb = GridSearchCV(pipe_gb, **gb_search_options)

In [None]:
# # Commented out so the .csv doesn't get overwritten
# model_params = {}
# count = 0

In [None]:
# Fit the three grid searches and summarize them in one table.
# This is slow (AdaBoost with thousands of estimators, SVC), so expect a long run.

# Initialize the accumulators here so the cell works on a fresh kernel and
# re-running it does not pile up duplicate rows.
model_params = {}
count = 0

for grid in (grid_ab, grid_svc, grid_gb):
    grid.fit(X_train, y_train)

    # Build a fresh summary dict instead of writing into grid.best_params_:
    # mutating it corrupts the fitted search object, and assigning the dict to
    # one of its own keys makes it self-referential.
    summary = dict(grid.best_params_)
    summary['best_params'] = dict(grid.best_params_)   # best params used in the pipeline
    summary['model'] = grid.estimator[1]                # model step of the searched pipeline
    summary['cv_score'] = grid.best_score_              # mean cross-validated score
    summary['train_score'] = grid.score(X_train, y_train)
    summary['test_score'] = grid.score(X_test, y_test)

    count += 1
    model_params[f'model_{count}'] = summary

# Create a DataFrame from the dictionary we created above (one row per search)
model_df = pd.DataFrame.from_dict(model_params, orient='index')

In [None]:
# Display the grid-search summary table.
model_df

## GridSearching through `DecisionTreeClassifier` 

Thanks Irene Anibogwu!

In [None]:
# Grid search for decision trees to find the best estimator and params.
# The tree is seeded so the search is reproducible and consistent with the
# refit below, which hard-codes the winning params with random_state=42.
gridcv = GridSearchCV(estimator = DecisionTreeClassifier(random_state = 42),
                    param_grid = {'max_depth': [3, 5, 7, 10],
                                  'min_samples_split': [5, 10, 15, 20],
                                  'min_samples_leaf': [2, 3, 4, 5, 6, 7]},
                    cv = 5,
                    verbose = 1,
                    n_jobs=-1)
gridcv.fit(X_train, y_train)

# Only the last expression of a cell is displayed; the refit model itself
# remains available as gridcv.best_estimator_.
gridcv.best_params_

In [None]:
# Decision tree built with the best parameters (hard-coded here).
dt_settings = {
    'max_depth': 7,
    'min_samples_split': 20,
    'min_samples_leaf': 4,
    'random_state': 42,
}
dt = DecisionTreeClassifier(**dt_settings)

# Train on the training split.
dt.fit(X_train, y_train)

# Report accuracy on both splits.
dt_train_score = dt.score(X_train, y_train)
dt_test_score = dt.score(X_test, y_test)
print(f'Score on training set: {dt_train_score}')
print(f'Score on testing set: {dt_test_score}')

## GridSearching through `RandomForestClassifier`

Thanks Josh Mizraji!

In [None]:
# Reset the summary accumulators used by the random-forest search below:
# `model_params` maps 'model_<n>' to a result dict, `count` numbers the entries.
model_params = {}
count = 0

In [None]:
# Scaffolding: hyperparameter grid for the random forest
params_rf = {
    'n_estimators' : [55,60,70], # number of trees
    'max_features' : [None],
    'max_depth' : [7,8,9]
}

# Instantiate the grid search; the forest is seeded so the search is reproducible.
gs_rf = GridSearchCV(RandomForestClassifier(random_state=42),
                     param_grid=params_rf,
                     cv=5,
                     n_jobs=-1)

# Fit
gs_rf.fit(X_train, y_train)

# Copy the best params and add the CV score. Writing 'score' into
# gs_rf.best_params_ itself would leave a non-hyperparameter key in it and
# break any later RandomForestClassifier(**gs_rf.best_params_).
rf_summary = {**gs_rf.best_params_, 'score': gs_rf.best_score_}

# Register the result under the next model number
count += 1
model_params[f'model_{count}'] = rf_summary

# One row per model (orient sideways)
model_df = pd.DataFrame.from_dict(model_params, orient='index')
model_df
# adapted from DSI lesson

In [None]:
# Voting ensemble over four estimators whose hyperparameters are hard-coded
# (the AdaBoost, random-forest and decision-tree values fall inside the grids searched above).
# NOTE(review): the bagging and random-forest members are not seeded, so repeated
# runs can give different scores.
vc = VotingClassifier([
    ('ab', AdaBoostClassifier(n_estimators=2500, random_state=42)),
#     ('SVC', SVC(C=10, degree=2)),
    ('bag', BaggingClassifier(n_estimators=2000,
                             max_samples=300,
                             max_features=len(features))),  # integer count equal to the number of selected features
    ('rf', RandomForestClassifier(max_depth=9,
                                 n_estimators=70)),
    ('dt', DecisionTreeClassifier(max_depth = 7,
                                  min_samples_split = 20,
                                  min_samples_leaf = 4,
                                  random_state = 42)),
], n_jobs=12)

## Running a Voting Classifier on our best models as determined by GridSearchCV

In [None]:
# Fit the voting ensemble on the training split.
vc.fit(X_train, y_train)

In [None]:
# Score of the ensemble on the training split.
vc.score(X_train, y_train)

In [None]:
# Score of the ensemble on the held-out split.
vc.score(X_test, y_test)

## Prepping test data

In [None]:
# Reading in test data
income_test = pd.read_csv('./data/test_dummied.csv')

In [None]:
# Clean the raw test data and save it as ./data/test_cleaned.csv
test_data = pd.read_csv('./data/test_data.csv')

# Replace the ' ?' placeholder with 'unknown' (returns a new frame; no in-place mutation)
test_data = test_data.replace(' ?', 'unknown')

# Every column that is not one of the numeric columns is treated as categorical.
numeric_columns_test = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_columns_test = test_data.drop(columns=numeric_columns_test).columns

# Strip surrounding whitespace from each categorical value.
# The vectorized .str accessor leaves missing values as NaN, whereas
# .apply(lambda x: x.strip()) raises on them.
for column in cat_columns_test:
    test_data[column] = test_data[column].str.strip()

# Save as a new csv
test_data.to_csv('./data/test_cleaned.csv', index=False)

## Getting predictions for Testing Data

In [None]:
# Selecting Features
# Keep only the columns that also exist in income_test, and never the target
# itself, so the model can be applied to the test frame without leakage.
features = [column for column in dummy_income.columns
            if column in income_test.columns and column != 'wage']
X = dummy_income[features]
# NOTE(review): the original read the target from `income`, a name this notebook
# never defines; `dummy_income` is the training frame loaded above — confirm it
# contains the `wage` column.
y = dummy_income['wage']

In [None]:
# Re-split with the new feature set.
# NOTE(review): the cells below fit on the full X, y and predict on income_test,
# so these splits are not used afterwards in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Final model: AdaBoost with 2500 estimators and a fixed seed.
ab = AdaBoostClassifier(n_estimators=2500, random_state=42)

In [None]:
ab.fit(X, y) # Train on the full dataset (no hold-out) before predicting the test file

In [None]:
# Predict on the test frame, restricted to the same feature columns used for training.
preds = ab.predict(income_test[features])

In [None]:
# Sanity check: number of predictions produced.
preds.shape

In [None]:
# Wrap the predictions in a single-column frame named 'wage'.
preds_df = pd.DataFrame(preds, columns=['wage'])

In [None]:
# Write the submission file without the row index.
preds_df.to_csv('./data/submission.csv', index=False)