# Notebook to create a simple ML model and save as a file using the pickle package

### Import needed packages

In [21]:
import pandas as pd
import numpy as np

# import logistic regression classifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

# import grid search and cross-validation
from sklearn.model_selection import GridSearchCV

# import auc metric
from sklearn.metrics import roc_auc_score



### Read in the data

In [22]:
lend_df = pd.read_csv('LendingClub_DataSet.csv')
lend_df.head(10)

Unnamed: 0,home_ownership,income,dti,fico,loan_status
0,1,44304.0,18.47,690,0
1,0,50000.0,29.62,735,1
2,0,64400.0,16.68,675,1
3,0,38500.0,33.73,660,0
4,1,118000.0,26.66,665,1
5,1,43000.0,20.68,725,1
6,0,76000.0,17.31,685,1
7,0,75000.0,22.34,700,1
8,1,156000.0,13.28,735,1
9,1,54000.0,19.0,660,0


In [23]:
# Separate into predictors (X) and a SINGLE TARGET outcome (Y)
X = lend_df.drop('loan_status', axis=1)
y = lend_df.loan_status

In [24]:
# Create a train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=.8)

### Use GridSearchCV to find a decent Logistic Regression Classifier for parameter search and with 5 folds

In [25]:
# Use GridSearchCV to find the best parameters for logistidc regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# Use GridSearchCV to find the best parameters for Logistic regression
gridSearch_auc = GridSearchCV(estimator=LogisticRegression(random_state=1), param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=1, verbose=1)
gridSearch_auc.fit(X_train, y_train)
print('Initial score: {:.2f}'.format(gridSearch_auc.best_score_))
print('Best parameters from param_grid: {}'.format(gridSearch_auc.best_params_))

# Fit the model with the best parameters
BestLogReg = gridSearch_auc.best_estimator_

Fitting 5 folds for each of 14 candidates, totalling 70 fits
Initial score: 0.65
Best parameters from param_grid: {'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}


### Model isn't great, but we're just building a simple model to illustrate the principle. We'll still check its performance on hold out test data

In [26]:
# Check the auc using the test data
test_pred_prob = BestLogReg.predict_proba(X_test)[:,1]
test_auc = roc_auc_score(y_test, test_pred_prob)
print('Test AUC: {:.2f}'.format(test_auc))

Test AUC: 0.62


### Here's the cool stuff. Save the model you created as a file. Now you can apply it elsewhere without having to rebuild it or share the code and training data. 
### `lend_logistic_model.pkl` file is now in the folder where this file exists is now your model that you can share and open elsewhere

In [None]:
# Write a file of binary model ('wb') (=dump) to a file using pickle
import pickle
filename = 'lend_logistic_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(BestLogReg, file)

### This code could be copied and (with the proper imports) to open your model file and apply it to any other data!

In [37]:
# Open the file and load the model
file_to_load = 'lend_logistic_model.pkl'
with open(file_to_load, 'rb') as file:
    loaded_model = pickle.load(file)

# Create a new customer
ownhome = 1  # 1 for own home, 0 for rent
income = 80000  # annual income
dti = 0.2  # debt-to-income ratio
fico = 700  # FICO score

# Create a dataframe for the new customer
# Make sure the ORDER of the columns is the same as in the training data
# Note: The column names should match the training data columns
new_customer = pd.DataFrame({
    'home_ownership': [ownhome],
    'income': [income],
    'dti': [dti],
    'fico': [fico]
})

# Print the predicted probability of default
predicted_prob = loaded_model.predict_proba(new_customer)[:, 0]
print('Predicted probability of default: {:.2f}'.format(predicted_prob[0]))
# Print the predicted class
predicted_class = loaded_model.predict(new_customer)
# print 'Predicted to Default' if predicted_class[0] == 1 else 'Predicted to not Default'
if predicted_class[0] == 0:
    print('Predicted class: Default')
else:
    print('Predicted class: Not Default')

Predicted probability of default: 0.08
Predicted class: Not Default
