<h1 style="font-size:42px; text-align:center; margin-bottom:30px;"><span style="color:SteelBlue"></span> Model Training</h1>
<hr>




### Let's import libraries and model classes, and load the analytical base table (ABT).



In [1]:
# NumPy for numerical computing
import numpy as np
# Pandas for DataFrames
import pandas as pd
# Show up to 100 columns and render floats with three decimal places
pd.options.display.max_columns = 100
pd.options.display.float_format = '{:.3f}'.format

# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline

# Seaborn for easier visualization
import seaborn as sns
# Scikit-Learn for Modeling
import sklearn
# Pickle for saving model files
import pickle 

In [2]:
# Import Logistic Regression
from sklearn.linear_model import LogisticRegression
# Import RandomForestClassifier and GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [3]:
# Function for splitting training and test set
from sklearn.model_selection import train_test_split
# Function for creating model pipelines
from sklearn.pipeline import make_pipeline
# For standardization
from sklearn.preprocessing import StandardScaler
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Classification metrics 
from sklearn.metrics import auc, confusion_matrix, roc_curve, cohen_kappa_score

In [4]:
# Load analytical base table from Module 2
# (the path is relative to the notebook's working directory)
df = pd.read_csv('analytical-base-table.csv')

<span id="split"></span>
# Split dataset



<br>


In [5]:
# Target vector: the 'status' column
y = df['status']

# Feature matrix: every column except the target
X = df.drop('status', axis = 'columns')

In [6]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class balance the same in both subsets; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 1234,
    stratify = y,
)

# Report the shape of each resulting subset
for subset in (X_train, X_test, y_train, y_test):
    print(subset.shape)

(11254, 25)
(2814, 25)
(11254,)
(2814,)


<span id="pipelines"></span>
# Build model pipelines

Next, let's set up preprocessing pipelines for each of our algorithms.



In [7]:
# Pipeline dictionary: one (scaler -> classifier) pipeline per candidate model.
#
# Both logistic regressions name their solver explicitly. scikit-learn changed
# the default solver to 'lbfgs' in 0.22, and 'lbfgs' rejects penalty='l1'.
# 'liblinear' supports both penalties and was the default before that change,
# so the pipelines behave the same on old and new releases.
pipelines = {
    'l1': make_pipeline(StandardScaler(),
                        LogisticRegression(penalty = 'l1', solver = 'liblinear', random_state = 123)),
    'l2': make_pipeline(StandardScaler(),
                        LogisticRegression(penalty = 'l2', solver = 'liblinear', random_state = 123)),
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state = 123)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state = 123)),
}

<span id="hyperparameters"></span>
# Declare hyperparameters to tune



In [8]:
# List tuneable hyperparameters of our gradient boosting pipeline.
# Keys follow the '<step name>__<parameter>' pattern used in the grids below.
pipelines['gb'].get_params()

{'gradientboostingclassifier': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.1, loss='deviance', max_depth=3,
               max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100,
               presort='auto', random_state=123, subsample=1.0, verbose=0,
               warm_start=False),
 'gradientboostingclassifier__criterion': 'friedman_mse',
 'gradientboostingclassifier__init': None,
 'gradientboostingclassifier__learning_rate': 0.1,
 'gradientboostingclassifier__loss': 'deviance',
 'gradientboostingclassifier__max_depth': 3,
 'gradientboostingclassifier__max_features': None,
 'gradientboostingclassifier__max_leaf_nodes': None,
 'gradientboostingclassifier__min_impurity_decrease': 0.0,
 'gradientboostingclassifier__min_impurity_split': None,
 'gradientboostingclassif

Let's declare the **hyperparameter grids** to tune.

In [9]:
# Logistic regression search spaces: 10 evenly spaced values of the
# regularization parameter C between 0.001 and 1000, for each penalty
l1_hyperparameters = dict(logisticregression__C = np.linspace(0.001, 1000.0, num = 10))

l2_hyperparameters = dict(logisticregression__C = np.linspace(0.001, 1000.0, num = 10))

**Declare the hyperparameter grid for the random forest.**

In [10]:
# Random Forest hyperparameters
#
# 'auto' is deliberately left out of max_features. For classifiers it was an
# alias of 'sqrt', so listing both made the grid search fit the same model
# twice, and newer scikit-learn releases no longer accept 'auto'.
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_features': ['sqrt', 0.33],
}

**Declare the hyperparameter grid for the boosted tree.**

In [11]:
# Gradient boosting search space: number of trees, learning rate, and tree depth
gb_hyperparameters = dict(
    gradientboostingclassifier__n_estimators = [100, 200],
    gradientboostingclassifier__learning_rate = [0.05, 0.1, 0.2],
    gradientboostingclassifier__max_depth = [1, 3, 5],
)

In [12]:
# Map each pipeline key to the grid it will be tuned over
hyperparameters = dict(
    l1 = l1_hyperparameters,
    l2 = l2_hyperparameters,
    rf = rf_hyperparameters,
    gb = gb_hyperparameters,
)

In [None]:
# Sanity check: every model key should be present and map to a dict-typed grid
for key in ('l1', 'l2', 'rf', 'gb'):
    if key not in hyperparameters:
        print( key, 'was not found in hyperparameters')
        continue

    if type(hyperparameters[key]) is dict:
        print( key, 'was found in hyperparameters, and it is a grid.' )
    else:
        print( key, 'was found in hyperparameters, but it is not a grid.' )

l1 was found in hyperparameters, and it is a grid.
l2 was found in hyperparameters, and it is a grid.
rf was found in hyperparameters, and it is a grid.
gb was found in hyperparameters, and it is a grid.


<span id="fit-tune"></span>
# Fit and tune models with cross-validation


In [None]:
# Tuned grid searches, keyed by pipeline name
fitted_models = {}

# Tune every pipeline over its own grid and keep the fitted search object
for name, pipeline in pipelines.items():
    # 10-fold cross-validated grid search using all available cores
    model = GridSearchCV(estimator = pipeline,
                         param_grid = hyperparameters[name],
                         cv = 10,
                         n_jobs = -1)

    # fit() returns the search object itself, so it can be stored directly
    fitted_models[name] = model.fit(X_train, y_train)

    # Progress message
    print("{} has been fitted".format(name))

l1 has been fitted
l2 has been fitted
rf has been fitted


Let's check that the models are of the correct type.


In [None]:
# Confirm that every entry in fitted_models is a cross-validation search object
# (one per pipeline)
for model_name, search in fitted_models.items():
    print( model_name, type(search) )

Let's also make sure that the models have been fitted correctly.

In [None]:
from sklearn.exceptions import NotFittedError

# predict() raises NotFittedError on an unfitted estimator, so a successful
# call on the test set confirms that the search has been fitted
for name, model in fitted_models.items():
    try:
        pred = model.predict(X_test)
    except NotFittedError as e:
        print(repr(e))
    else:
        print(name, 'has been fitted.')
       


<span id="evaluate"></span>
# Evaluate metrics




In [None]:
# Best cross-validated score reached by each grid search
for name in fitted_models:
    print(name, fitted_models[name].best_score_)

In [None]:
# Predict classes on the test set using the tuned L1-regularized logistic regression
l1_pred = fitted_models['l1'].predict(X_test)

# Display first 5 predictions
l1_pred[:5]

In [None]:
# Display confusion matrix for y_test and l1_pred
# (scikit-learn convention: rows are true classes, columns are predicted classes)
print(confusion_matrix(y_test, l1_pred))

In [None]:
# Predict class PROBABILITIES on the test set using the tuned
# L1-regularized logistic regression (one column per class)
pred = fitted_models['l1'].predict_proba(X_test)

# Keep only the second column, i.e. the probability of the positive class (1).
# Slicing the array avoids a Python-level loop over every row.
pred = pred[:, 1]

# Display first 10 predictions
pred[:10]

In [None]:
# Calculate ROC curve from y_test and pred (the positive-class probabilities):
# false positive rates, true positive rates, and the thresholds that produce them
fpr, tpr, thresholds = roc_curve(y_test, pred)

In [None]:
# Tabulate the ROC points, one row per threshold, and show the last 10 rows
roc_columns = ['False Positive Rate', 'True Positive Rate', 'Thresholds']
roc_df = pd.DataFrame(dict(zip(roc_columns, [fpr, tpr, thresholds])))
roc_df.tail(10)




We can plot the entire curve.

In [None]:
# Square 8x8 inch figure on seaborn's dark grid
sns.set_style('darkgrid')
fig = plt.figure(figsize = (8, 8))

# ROC curve of the L1 model, plus the dashed 45-degree reference diagonal
plt.plot(fpr, tpr, label = 'l1')
plt.plot([0, 1], [0, 1], 'k--')
plt.legend(loc = 'lower right')

# Title, padded axis limits, and axis labels
plt.title('Receiver Operating Characteristic for logistic regression (L1)', fontsize = 18)
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('False Positive Rate', fontsize = 16)
plt.ylabel('True Positive Rate', fontsize = 16)
plt.show()



In [None]:
# Calculate ROC curve for Logistic Regression (l1)
# (recomputed here from y_test and pred so this cell does not rely on stale fpr/tpr)
fpr, tpr, thresholds = roc_curve(y_test, pred)

# Calculate AUROC (area under the ROC curve) for Logistic Regression (l1)
print(auc(fpr, tpr))


In [None]:
# Per-model containers for ROC points (fpr, tpr), AUROC, and Cohen's kappa
fpr_dic, tpr_dic, auc_dic, kappa_dic = {}, {}, {}, {}

# Score every tuned model on the test set and print its AUROC
for name, model in fitted_models.items():
    # Second-column probability for each test row, then the hard class labels
    pred_probs = [row[1] for row in model.predict_proba(X_test)]
    pred_labels = model.predict(X_test)

    # ROC points and the area under them
    fpr_dic[name], tpr_dic[name], thresholds = roc_curve(y_test, pred_probs)
    auc_dic[name] = auc(fpr_dic[name], tpr_dic[name])

    # Agreement between predicted and true labels
    kappa_dic[name] = cohen_kappa_score(y_test, pred_labels)

    print(name, auc_dic[name])
    

In [None]:
# Overlay the test-set ROC curves of all four tuned models
sns.set_style('darkgrid')
plt.figure(figsize = (8, 8))

# (model key, legend label template, line color) for each curve, in drawing order
curve_specs = [
    ('l1', 'Logistic Regression (L1) (auc = %0.3f)', 'darkorange'),
    ('l2', 'Logistic Regression (L2) (auc = %0.3f)', 'darkgreen'),
    ('rf', 'Random Forest (auc = %0.3f)', 'darkred'),
    ('gb', 'Gradient Booster (auc = %0.3f)', 'blue'),
]
for key, template, color in curve_specs:
    plt.plot(fpr_dic[key], tpr_dic[key],
             label = template % auc_dic[key], color = color)

# Dashed 45-degree reference diagonal
plt.plot([0, 1], [0, 1], 'k--')

# Padded axis limits, labels, title, and legend
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.xlabel('False Positive Rate', fontsize = 16)
plt.ylabel('True Positive Rate', fontsize = 16)
plt.title('ROC Curve', fontsize = 18)
plt.legend(loc = 'lower right')
plt.show()

In [None]:
# Kappa and AUC summary table, one row per scored model.
#
# Each row is built by looking the model up by its key, so every display
# label is paired with its own scores regardless of dictionary ordering.
# A model with no recorded scores is left out instead of making the column
# lengths disagree.
model_labels = {
    'l1': 'Logistic Regression (L1)',
    'l2': 'Logistic Regression (L2)',
    'rf': 'Random Forest',
    'gb': 'Gradient Boosting',
}
scored_names = [name for name in ('l1', 'l2', 'rf', 'gb')
                if name in kappa_dic and name in auc_dic]

# Format the scores to three decimal places
kappa_list = ['%.3f' % kappa_dic[name] for name in scored_names]
auc_list = ['%.3f' % auc_dic[name] for name in scored_names]

# Display the scores next to the model names
print(pd.DataFrame({'Kappa' : kappa_list,
                    'AUC' : auc_list,
                    'Model': [model_labels[name] for name in scored_names]}))
    

**The best model seems to be Random Forest with an AUC of 0.991 and Kappa score of 0.943.**

In [None]:
# Let's get our winning model's parameters
# (the hyperparameter combination the random forest grid search selected)
fitted_models['rf'].best_params_

In [None]:
# Rank features by importance using a random forest.
#
# Only the non-default settings are passed (presumably the winning values
# reported by best_params_ above -- confirm if the grid changes). The other
# arguments that used to be spelled out were all defaults, and one of them,
# min_impurity_split, was removed in scikit-learn 1.0 and would raise there.
rf_model = RandomForestClassifier(n_estimators = 200,
                                  max_features = 0.33,
                                  random_state = 123)

# Fit the model on training data.
rf_model.fit(X_train, y_train)

# Score it on testing data; print the result so it is not silently discarded.
print('Test accuracy:', rf_model.score(X_test, y_test))

# Store the importances in a DataFrame, most important feature first
imp_features = pd.DataFrame({'importance' : rf_model.feature_importances_ ,
                             'feature' : X_train.columns.values}).sort_values('importance', ascending = False)

# Display the full ranking
imp_features

In [None]:
# Plot the feature importances of the forest
importances = rf_model.feature_importances_

# Order features from least to most important (np.argsort sorts ascending).
# barh draws upward from the bottom, so the most important feature ends up on top.
indices = np.argsort(importances)

# Feature names in the same order. They are read positionally from the training
# columns, which line up with feature_importances_ because rf_model was fitted
# on X_train; this avoids relying on the index labels of a re-sorted DataFrame.
names = list(X_train.columns[indices])

# Create plot
plt.figure(figsize = (12,8))

# Create plot title
plt.title("Feature Importance", fontsize = 18)

# Add bars
plt.barh(range(X_train.shape[1]), importances[indices])

# Add feature names as y-axis labels
plt.yticks(range(X_train.shape[1]), names, fontsize = 12)

# Show plot
plt.show()


In [None]:
# Persist the winning model: the best pipeline found by the random forest
# grid search, written to final_model.pkl
with open('final_model.pkl', mode = 'wb') as model_file:
    pickle.dump(fitted_models['rf'].best_estimator_, model_file)