# Classification model - Saving Pickle File

In [1]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

### Load the data

In [2]:
data = pd.read_csv('ppp data.csv')
print(f"Shape of the data is: {data.shape}")
data.head()

Shape of the data is: (417253, 10)


Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,DateApproved,Lender,CD,LoanRange
0,133500,Pasadena,TX,77502,111110.0,Subchapter S Corporation,4/29/2020,"Capital One, National Association",TX-29,6
1,125300,Houston,TX,77041,111110.0,Corporation,4/29/2020,Comerica Bank,TX-07,6
2,124938,PLANO,TX,75024,111110.0,Corporation,4/14/2020,Veritex Community Bank,TX-03,5
3,118922,Sugar Land,TX,77478,111110.0,Limited Liability Company(LLC),4/27/2020,Allegiance Bank,TX-22,5
4,109952,Houston,TX,77007,111110.0,Corporation,4/28/2020,Allegiance Bank,TX-18,5


### Check how many `null/Nan` values are in each column

In [3]:
data.isnull().sum()

LoanAmount         0
City              14
State              0
Zip                0
NAICSCode       9162
BusinessType     258
DateApproved       0
Lender             0
CD                18
LoanRange          0
dtype: int64

### View the possible labels for the columns that have some null values

In [4]:
# # we are intentionally narrowing the scope of the loan data details for those with strong completion entries, minimal nulls
# contains_null = ['Zip', 'BusinessType', 'DateApproved', 'Lender','CD', 'LoanRange']

# for col in contains_null:
#     print(f"List of unique labels for {col}:::{set(data[col])}")

### Clean up null values

For the sake of simplicity, I'm going to make a few assumptions:

- `Dependents`: Assumption that there are no dependents (0: 345 | 1: 102 | 2: 101 | 3+: 51)
- `Self_Employed`: Assumption that the applicant is not self-employed (No: 500 | Yes: 82)
- `Credit_History`: Assumption that the person has a credit history (True: 475 | False: 89)
- `Married`: If nothing specified, applicant is not married
- `Gender`: Assuming the gender is Male for the missing values (Male: 489 | Female: 112)

In [5]:
# data['Dependents'] = data['Dependents'].fillna('0')
# data['Self_Employed'] = data['Self_Employed'].fillna('No')
# data['Credit_History'] = data['Credit_History'].fillna(1)
# data['Married'] = data['Married'].fillna('No')
# data['Gender'] = data['Gender'].fillna('Male')

In [6]:
# investigate rows with null values
data[data.isnull().any(axis=1)]

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,DateApproved,Lender,CD,LoanRange
2863,37053,WALLIS,TX,77485,111998.0,,5/3/2020,"Wells Fargo Bank, National Association",TX-10,2
5759,21396,FLORESVILLE,TX,78114,112990.0,,5/3/2020,"Wells Fargo Bank, National Association",TX-28,1
16293,15102,ROSENBERG,TX,77471,236115.0,,5/3/2020,"Wells Fargo Bank, National Association",TX-22,1
28793,45001,DALLAS,TX,75228,238160.0,,5/1/2020,"JPMorgan Chase Bank, National Association",TX-05,2
30297,136475,KENNEDALE,TX,76060,238210.0,,5/1/2020,"JPMorgan Chase Bank, National Association",TX-06,6
...,...,...,...,...,...,...,...,...,...,...
416999,"e $150,000-350,000",Berkeley,TX,75081,,Limited Liability Company(LLC),5/1/2020,"JPMorgan Chase Bank, National Association",TX-32,7
417101,"e $150,000-350,000",RIVERSIDE,TX,75247,,Corporation,5/1/2020,"JPMorgan Chase Bank, National Association",TX-30,7
417135,"e $150,000-350,000",REEDLEY,TX,78753,,Corporation,5/1/2020,"JPMorgan Chase Bank, National Association",TX-10,7
417154,"e $150,000-350,000",Yucaipa,TX,77598,,Corporation,5/1/2020,"JPMorgan Chase Bank, National Association",TX-22,7


In [7]:
# only take rows of businesses that CD is not null

data = data[data['CD'].notna()]
data.head()

#drop rows with NA cities
data = data[data['City'].notna()]

# drop rows with NA NAICS code
data = data[data['NAICSCode'].notna()]

#drop rows with NA business type
data = data[data['BusinessType'].notna()]

data.isnull().sum()

data.shape

data.head()

data['NAICSCode'] = data['NAICSCode'].astype(int)

data

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,DateApproved,Lender,CD,LoanRange
0,133500,Pasadena,TX,77502,111110,Subchapter S Corporation,4/29/2020,"Capital One, National Association",TX-29,6
1,125300,Houston,TX,77041,111110,Corporation,4/29/2020,Comerica Bank,TX-07,6
2,124938,PLANO,TX,75024,111110,Corporation,4/14/2020,Veritex Community Bank,TX-03,5
3,118922,Sugar Land,TX,77478,111110,Limited Liability Company(LLC),4/27/2020,Allegiance Bank,TX-22,5
4,109952,Houston,TX,77007,111110,Corporation,4/28/2020,Allegiance Bank,TX-18,5
...,...,...,...,...,...,...,...,...,...,...
417248,"e $150,000-350,000",Lafayette,TX,77064,541611,Limited Liability Company(LLC),5/1/2020,"JPMorgan Chase Bank, National Association",TX-07,7
417249,"e $150,000-350,000",SANTA ROSA,TX,78205,531312,Corporation,4/14/2020,Vantage Bank Texas,TX-35,7
417250,"e $150,000-350,000",RANCHO CORDOVA,TX,77803,442210,Limited Liability Company(LLC),4/27/2020,"Spirit of Texas Bank, SSB",TX-17,7
417251,"e $150,000-350,000",LOS ALTOS,TX,78230,492210,Corporation,4/30/2020,Randolph-Brooks FCU,TX-21,7


### View cleaned up values

In [8]:
# label_cols = ['LoanAmount', 'City', 'State', 'Zip', 'NAICSCode', 'BusinessType', 'DateApproved', 'Lender', 'CD', 'LoanRange']

# for col in label_cols:
#     print(f"List of unique labels for {col} ::: {set(data[col])}")

In [9]:
## read in data for NAICS code

naics = pd.read_csv('US-Business-Profiles-By-Sales-and-Employees (1).csv')

naics.head()


Unnamed: 0,NAICS 1 Code,NAICS 1 Description
0,111110,Soybean Farming
1,111120,Oilseed (except Soybean) Farming
2,111130,Dry Pea and Bean Farming
3,111140,Wheat Farming
4,111150,Corn Farming


In [10]:
naics.dtypes
naics[naics.isnull().any(axis=1)]

data2 = naics.drop([1045])
data2['NAICS 1 Code'] = data2['NAICS 1 Code'].astype(int)

data2

Unnamed: 0,NAICS 1 Code,NAICS 1 Description
0,111110,Soybean Farming
1,111120,Oilseed (except Soybean) Farming
2,111130,Dry Pea and Bean Farming
3,111140,Wheat Farming
4,111150,Corn Farming
...,...,...
1040,926140,Regulation of Agricultural Marketing and Commo...
1041,926150,"Regulation, Licensing, and Inspection of Misce..."
1042,927110,Space Research and Technology
1043,928110,National Security


In [11]:
## merge tables 

data3 = pd.merge(data, data2, left_on='NAICSCode', right_on='NAICS 1 Code') 

data3

Unnamed: 0,LoanAmount,City,State,Zip,NAICSCode,BusinessType,DateApproved,Lender,CD,LoanRange,NAICS 1 Code,NAICS 1 Description
0,133500,Pasadena,TX,77502,111110,Subchapter S Corporation,4/29/2020,"Capital One, National Association",TX-29,6,111110,Soybean Farming
1,125300,Houston,TX,77041,111110,Corporation,4/29/2020,Comerica Bank,TX-07,6,111110,Soybean Farming
2,124938,PLANO,TX,75024,111110,Corporation,4/14/2020,Veritex Community Bank,TX-03,5,111110,Soybean Farming
3,118922,Sugar Land,TX,77478,111110,Limited Liability Company(LLC),4/27/2020,Allegiance Bank,TX-22,5,111110,Soybean Farming
4,109952,Houston,TX,77007,111110,Corporation,4/28/2020,Allegiance Bank,TX-18,5,111110,Soybean Farming
...,...,...,...,...,...,...,...,...,...,...,...,...
397943,"d $350,000-1 million",PHOENIX,TX,79072,325193,Limited Liability Company(LLC),4/15/2020,"JPMorgan Chase Bank, National Association",TX-19,8,325193,Ethyl Alcohol Manufacturing
397944,"d $350,000-1 million",TUCSON,TX,79045,325193,Limited Liability Company(LLC),4/15/2020,"JPMorgan Chase Bank, National Association",TX-19,8,325193,Ethyl Alcohol Manufacturing
397945,"d $350,000-1 million",Scottsdale,TX,75034,325193,Limited Liability Company(LLC),4/10/2020,"JPMorgan Chase Bank, National Association",TX-26,8,325193,Ethyl Alcohol Manufacturing
397946,"d $350,000-1 million",YUMA,TX,77002,212113,Limited Liability Company(LLC),4/9/2020,Gulf Capital Bank,TX-18,8,212113,Anthracite Mining


### Encode categorical fields
We have a lot of `string` labels that we encounter in `Gender`, `Married`, `Education`, `Self_Employed` & `Property_Area` columns.

In [12]:
# create dictionaries to map fields to numeric values
# business_type = {'Limited  Liability Company(LLC)': 0, 'Limited Liability Partnership' : 1, 'Corporation' : 2, 
#                  'Subchapter S Corporation': 3, 'Cooperative': 4, 'Employee Stock Ownership Plan(ESOP)': 5, 
#                  'Independent Contractors': 6, 'Joint Venture': 7, 'Non-Profit Childcare Center': 8, 
#                  'Non-Profit Organization': 9, 'Partnership': 10, 'Professional Association': 11, 
#                  'Rollover as Business Space Start-Ups (ROB': 12, 'Self-Employed Individuals': 13, 
#                  'Sole Proprietorship': 14, 'Tenant in Common': 15, 'Trust': 16 } 


# replace values in each column according to the dictionaries above
# data2.replace({'BusinessType': business_type}).astype(int)
# # Get dummy variables for nominal property column
clean_data = pd.get_dummies(data3, columns=["BusinessType"])

clean_data.head()
clean_data.keys()

Index(['LoanAmount', 'City', 'State', 'Zip', 'NAICSCode', 'DateApproved',
       'Lender', 'CD', 'LoanRange', 'NAICS 1 Code', 'NAICS 1 Description',
       'BusinessType_Cooperative', 'BusinessType_Corporation',
       'BusinessType_Employee Stock Ownership Plan(ESOP)',
       'BusinessType_Independent Contractors', 'BusinessType_Joint Venture',
       'BusinessType_Limited  Liability Company(LLC)',
       'BusinessType_Limited Liability Partnership',
       'BusinessType_Non-Profit Childcare Center',
       'BusinessType_Non-Profit Organization', 'BusinessType_Partnership',
       'BusinessType_Professional Association',
       'BusinessType_Rollover as Business Start-Ups (ROB',
       'BusinessType_Self-Employed Individuals',
       'BusinessType_Sole Proprietorship',
       'BusinessType_Subchapter S Corporation',
       'BusinessType_Tenant in Common', 'BusinessType_Trust'],
      dtype='object')

### Train/Test Split

In [13]:
# store the target variable in y and everything else goes in X
y = clean_data['Lender']

# we're also dropping Load_ID because it adds no value to the prediction
X = clean_data.drop(['LoanAmount', 'City', 'DateApproved', 'CD', 'State', 'Lender', 'NAICS 1 Description'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

X_train.shape


(298461, 21)

### Fill the nulls for the continuous variables with the mean values of that column

In [14]:
# X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].mean())
# X_train['LoanAmount'] = X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean())
# X_test['Loan_Amount_Term'] = X_test['Loan_Amount_Term'].fillna(X_test['Loan_Amount_Term'].mean())
# X_test['LoanAmount'] = X_test['LoanAmount'].fillna(X_test['LoanAmount'].mean())


# # View the datatypes of all columns
# X_train.dtypes

### Confirm that we no longer have any nulls

In [15]:
X_train.isnull().sum()

Zip                                                 0
NAICSCode                                           0
LoanRange                                           0
NAICS 1 Code                                        0
BusinessType_Cooperative                            0
BusinessType_Corporation                            0
BusinessType_Employee Stock Ownership Plan(ESOP)    0
BusinessType_Independent Contractors                0
BusinessType_Joint Venture                          0
BusinessType_Limited  Liability Company(LLC)        0
BusinessType_Limited Liability Partnership          0
BusinessType_Non-Profit Childcare Center            0
BusinessType_Non-Profit Organization                0
BusinessType_Partnership                            0
BusinessType_Professional Association               0
BusinessType_Rollover as Business Start-Ups (ROB    0
BusinessType_Self-Employed Individuals              0
BusinessType_Sole Proprietorship                    0
BusinessType_Subchapter S Co

### Create the classifier model and the parameter grid for GridSearch

In [16]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

param_grid = {"n_estimators" : [10, 20, 50, 100],
             "max_depth" : [None, 6, 8, 10],
             "max_leaf_nodes": [None, 5, 10, 20], 
             "min_impurity_split": [0.1, 0.2, 0.3, 0.4]}

### Initialize the GridSearch to tune my hyperparameters

In [17]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, param_grid=param_grid, cv=3, verbose=2)

### Execute the tuning

In [18]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 256 candidates, totalling 768 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=  45.4s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   45.4s remaining:    0.0s


[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=  45.4s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=  46.6s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total= 1.6min
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total= 1.6min
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total= 1.6min
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50, total= 4.5m

[Parallel(n_jobs=1)]: Done 768 out of 768 | elapsed: 683.0min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

### View the results

In [19]:
print(f"Best parameters: {grid.best_params_}")
print(f"Test set score: {grid.score(X_test, y_test)}")

Best parameters: {'max_depth': 10, 'max_leaf_nodes': None, 'min_impurity_split': 0.4, 'n_estimators': 100}
Test set score: 0.1632776141606441


# Saving the model

In [20]:
import pickle

with open('RF_model.pkl','wb') as f:
    pickle.dump(grid, f)

### Loading the saved model

In [21]:
with open('RF_model.pkl', 'rb') as f:
    model = pickle.load(f)

### Test the saved model

In [22]:
print(f"Test set score: {model.score(X_test, y_test)}")

Test set score: 0.1632776141606441


In [23]:
model

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando