# Classification model - Saving Pickle File

In [9]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

### Load the data

In [10]:
data = pd.read_csv('ppp data.csv')
print(f"Shape of the data is: {data.shape}")
data.head()

Shape of the data is: (417266, 15)


Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,RaceEthnicity,Gender,Veteran,NonProfit,JobsReported,DateApproved,Lender,CD,LoanRange
0,133500,Pasadena,TX,77502.0,111110.0,Subchapter S Corporation,Unanswered,Unanswered,Unanswered,,0.0,4/29/2020,"Capital One, National Association",TX-29,6
1,125300,Houston,TX,77041.0,111110.0,Corporation,Unanswered,Unanswered,Unanswered,,0.0,4/29/2020,Comerica Bank,TX-07,6
2,124938,PLANO,TX,75024.0,111110.0,Corporation,Unanswered,Unanswered,Unanswered,,9.0,4/14/2020,Veritex Community Bank,TX-03,5
3,118922,Sugar Land,TX,77478.0,111110.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,15.0,4/27/2020,Allegiance Bank,TX-22,5
4,109952,Houston,TX,77007.0,111110.0,Corporation,Unanswered,Unanswered,Unanswered,,17.0,4/28/2020,Allegiance Bank,TX-18,5


### Check how many `null/Nan` values are in each column

In [17]:
data.isnull().sum()

LoanAmount            0
City                 24
State                 0
Zip                  13
NAICSCode          9166
BusinessType        258
RaceEthnicity         0
Gender                0
Veteran               0
NonProfit        405255
JobsReported      19291
DateApproved          0
Lender                0
CD                   31
LoanRange             0
dtype: int64

### View the possible labels for the columns that have some null values

In [14]:
# we are intentionally narrowing the scope of the loan data details for those with strong completion entries, minimal nulls
contains_null = ['Zip', 'BusinessType', 'DateApproved', 'Lender','CD', 'LoanRange']

for col in contains_null:
    print(f"List of unique labels for {col}:::{set(data[col])}")

List of unique labels for Zip:::{nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 73960.0, 74024.0, 74494.0, 75001.0, 75002.0, 75006.0, 75007.0, 75009.0, 75010.0, 75011.0, 75013.0, 75014.0, 75015.0, 75016.0, 75017.0, 75019.0, 75020.0, 75021.0, 75022.0, 75023.0, 75024.0, 75025.0, 75026.0, 75027.0, 75028.0, 75029.0, 75030.0, 75032.0, 75033.0, 75034.0, 75035.0, 75036.0, 75038.0, 75039.0, 75040.0, 75041.0, 75042.0, 75043.0, 75044.0, 75045.0, 75046.0, 75048.0, 75049.0, 75050.0, 75051.0, 75052.0, 75053.0, 75054.0, 75056.0, 75057.0, 75058.0, 75059.0, 75060.0, 75061.0, 75062.0, 75063.0, 75065.0, 75067.0, 75068.0, 75069.0, 75070.0, 75071.0, 75072.0, 34113.0, 75074.0, 75075.0, 75076.0, 75077.0, 75078.0, 75080.0, 75081.0, 75082.0, 75083.0, 75085.0, 75086.0, 75087.0, 75088.0, 75089.0, 75090.0, 75091.0, 75092.0, 75093.0, 75094.0, 75097.0, 75098.0, 75099.0, 75101.0, 75102.0, 75103.0, 75104.0, 75106.0, 75109.0, 75110.0, 75114.0, 75115.0, 75116.0, 75117.0, 75118.0, 75119.0, 75120.0, 75

### Clean up null values

For the sake of simplicity, I'm going to make a few assumptions:

- `Dependents`: Assumption that there are no dependents (0: 345 | 1: 102 | 2: 101 | 3+: 51)
- `Self_Employed`: Assumption that the applicant is not self-employed (No: 500 | Yes: 82)
- `Credit_History`: Assumption that the person has a credit history (True: 475 | False: 89)
- `Married`: If nothing specified, applicant is not married
- `Gender`: Assuming the gender is Male for the missing values (Male: 489 | Female: 112)

In [None]:
data['Dependents'] = data['Dependents'].fillna('0')
data['Self_Employed'] = data['Self_Employed'].fillna('No')
data['Credit_History'] = data['Credit_History'].fillna(1)
data['Married'] = data['Married'].fillna('No')
data['Gender'] = data['Gender'].fillna('Male')

### View cleaned up values

In [None]:
label_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Education', 'Property_Area', 'Loan_Status']

for col in label_cols:
    print(f"List of unique labels for {col} ::: {set(data[col])}")

### Encode categorical fields
We have a lot of `string` labels that we encounter in `Gender`, `Married`, `Education`, `Self_Employed` & `Property_Area` columns.

In [None]:
# create dictionaries to map fields to numeric values
gender_values = {'Female' : 0, 'Male' : 1} 
married_values = {'No' : 0, 'Yes' : 1}
education_values = {'Graduate' : 0, 'Not Graduate' : 1}
employed_values = {'No' : 0, 'Yes' : 1}
dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
target_values = {'Y':1, 'N':0}

# replace values in each column according to the dictionaries above
data.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \
                'Self_Employed': employed_values, 'Dependents': dependent_values, 'Loan_Status': target_values}, inplace=True)

# Get dummy variables for nominal property column
clean_data = pd.get_dummies(data, columns=["Property_Area"])

clean_data.head()

### Train/Test Split

In [None]:
# store the target variable in y and everything else goes in X
y = clean_data['Loan_Status']

# we're also dropping Load_ID because it adds no value to the prediction
X = clean_data.drop(['Loan_ID', 'Loan_Status'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

X_train.shape

### Fill the nulls for the continuous variables with the mean values of that column

In [None]:
X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].mean())
X_train['LoanAmount'] = X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean())
X_test['Loan_Amount_Term'] = X_test['Loan_Amount_Term'].fillna(X_test['Loan_Amount_Term'].mean())
X_test['LoanAmount'] = X_test['LoanAmount'].fillna(X_test['LoanAmount'].mean())


# View the datatypes of all columns
X_train.dtypes

### Confirm that we no longer have any nulls

In [None]:
X_train.isnull().sum()

### Create the classifier model and the parameter grid for GridSearch

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

param_grid = {"n_estimators" : [10, 20, 50, 100],
             "max_depth" : [None, 6, 8, 10],
             "max_leaf_nodes": [None, 5, 10, 20], 
             "min_impurity_split": [0.1, 0.2, 0.3, 0.4]}

### Initialize the GridSearch to tune my hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, param_grid=param_grid, cv=3, verbose=2)

### Execute the tuning

In [None]:
grid.fit(X_train, y_train)

### View the results

In [None]:
print(f"Best parameters: {grid.best_params_}")
print(f"Test set score: {grid.score(X_test, y_test)}")

# Saving the model

In [None]:
import pickle

with open('RF_model.pkl','wb') as f:
    pickle.dump(grid, f)

### Loading the saved model

In [None]:
with open('RF_model.pkl', 'rb') as f:
    model = pickle.load(f)

### Test the saved model

In [None]:
print(f"Test set score: {model.score(X_test, y_test)}")

In [None]:
model