In [1]:
# import necessary Libraries
import numpy as np 
import pandas as pd 
from sklearn.feature_selection import RFE, VarianceThreshold,RFECV
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,auc,roc_curve,cohen_kappa_score
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer 
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [2]:
get_ipython().run_line_magic('matplotlib', 'inline')
pd.set_option('display.max_columns',30)

In [3]:
# Read in the training dataset
train_data = pd.read_csv("prudential_train.csv")
print(train_data.shape)
print(train_data.info())

(59381, 128)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


In [4]:
# Read in the test dataset
test_data = pd.read_csv("prudential_test.csv")
test_data.shape

(19765, 127)

In [5]:
# List of training features with missing data 
train_null_feat = train_data.columns[train_data.isnull().sum()> 0]
print(train_null_feat, "\n" "There are " + str(len(train_null_feat)) + " training features with missing data")

Index(['Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6',
       'Insurance_History_5', 'Family_Hist_2', 'Family_Hist_3',
       'Family_Hist_4', 'Family_Hist_5', 'Medical_History_1',
       'Medical_History_10', 'Medical_History_15', 'Medical_History_24',
       'Medical_History_32'],
      dtype='object') 
There are 13 training features with missing data


In [6]:
# Training Features with >10% missing values from total observation
highnuldata = train_data.columns[train_data.isnull().sum()> 0.10*train_data.shape[0]]
print(highnuldata, "\n" + str(len(highnuldata)) + " training features have greater than 10% missing data")

Index(['Employment_Info_4', 'Employment_Info_6', 'Insurance_History_5',
       'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5',
       'Medical_History_1', 'Medical_History_10', 'Medical_History_15',
       'Medical_History_24', 'Medical_History_32'],
      dtype='object') 
12 training features have greater than 10% missing data


In [7]:
# List of test data features with missing data 
test_null_feat = test_data.columns[test_data.isnull().sum()> 0]
print(test_null_feat, "\n" "There are " + str(len(test_null_feat)) + " test features with missing data")

Index(['Employment_Info_1', 'Employment_Info_4', 'Employment_Info_6',
       'Insurance_History_5', 'Family_Hist_2', 'Family_Hist_3',
       'Family_Hist_4', 'Family_Hist_5', 'Medical_History_1',
       'Medical_History_10', 'Medical_History_15', 'Medical_History_24',
       'Medical_History_32'],
      dtype='object') 
There are 13 test features with missing data


In [8]:
# Test Features with >10% missing values from total observation
highnulldata = test_data.columns[test_data.isnull().sum()> 0.10*test_data.shape[0]]
print(highnulldata,  "\n" + str(len(highnulldata)) + " test dataset features have greater than 10% missing data")

Index(['Employment_Info_4', 'Employment_Info_6', 'Insurance_History_5',
       'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5',
       'Medical_History_1', 'Medical_History_10', 'Medical_History_15',
       'Medical_History_24', 'Medical_History_32'],
      dtype='object') 
12 test dataset features have greater than 10% missing data


In [9]:
# drop features with missign values greater than 10% of total observations 
new_train_data= train_data.drop(train_data[highnuldata], axis=1)
new_test_data= test_data.drop(test_data[highnulldata], axis=1)

print(new_train_data.shape)
print(new_test_data.shape)

(59381, 116)
(19765, 115)


In [10]:
# we impute data into features with missing value <= 10%
new_train_data = new_train_data.fillna(new_train_data.mean())

In [11]:
new_test_data = new_test_data.fillna(new_test_data.mean())

In [12]:
# # Feature Engineering

In [13]:
# a function to add encoded features together
def add_cols(df):
    cols = df.columns
    for i in range(len(cols)):
        cols_total= df[cols].sum(axis=1)
    
    return cols_total

In [14]:
# creating a new medical keyword feature which is a sum of the total medical keyword for each instance
medicalkw = new_train_data.loc[ :, new_train_data.columns.str.startswith ("Medical_Keyword_")]
new_train_data['Medical_Keyword'] = add_cols(medicalkw)

In [15]:
medicalkw = new_test_data.loc[ :, new_test_data.columns.str.startswith ("Medical_Keyword_")]
new_test_data['Medical_Keyword'] = add_cols(medicalkw)

In [16]:
# creating a new medical history feature which is a sum of the total medical keyword for each instance
medicalHist= new_train_data.loc[ :, new_train_data.columns.str.startswith ("Medical_History_")]
new_train_data['Medical_History'] = add_cols(medicalHist)

In [17]:
medicalHist= new_test_data.loc[ :, new_test_data.columns.str.startswith ("Medical_History_")]
new_test_data['Medical_History'] = add_cols(medicalHist)

In [18]:
# A product of BMI and Ins_Age
new_train_data['BMI_InsAge']=new_train_data['BMI']*new_train_data['Ins_Age']

In [19]:
new_test_data['BMI_InsAge']=new_test_data['BMI']*new_test_data['Ins_Age']

In [20]:
# Drop some features to prevent redundancy 
new_train_data=new_train_data.drop(['BMI','Ins_Age',"Id",'Ht','Wt'],axis=1)

In [21]:
new_train_data=new_train_data.drop(medicalkw,axis=1)
new_train_data=new_train_data.drop(medicalHist,axis=1)

In [22]:
new_test_data=new_test_data.drop(['BMI','Ins_Age','Ht','Wt'],axis=1)

In [23]:
new_test_data=new_test_data.drop(medicalkw,axis=1)

In [24]:
new_test_data=new_test_data.drop(medicalHist,axis=1)

In [25]:
# Check for correlation in the training dataset
def get_correlated(data, threshold):
    cols = set()
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if corrmat.iloc[i,j] > threshold:
                colname = corrmat.columns[i]
                cols.add(colname)
    return cols

In [26]:
# correlating features to drop 
to_drop = get_correlated(new_train_data, 0.90)
to_drop

{'Insurance_History_7', 'Insurance_History_9'}

In [27]:
new_train_data= new_train_data.drop(to_drop, axis=1)
new_train_data.shape

(59381, 28)

In [28]:
test_to_drop = get_correlated(new_test_data, 0.90)
test_to_drop

{'Insurance_History_7', 'Insurance_History_9'}

In [29]:
new_test_data= new_test_data.drop(test_to_drop, axis=1)

In [30]:
# make a copy of applicant id from test data
Id = new_test_data['Id'].copy()

In [31]:
# drop the Id as dropped in the training dataset
new_test_data  = new_test_data.drop('Id', axis = 1)

In [32]:
new_train_data.head()

Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_5,InsuredInfo_1,InsuredInfo_2,InsuredInfo_3,InsuredInfo_4,InsuredInfo_5,InsuredInfo_6,InsuredInfo_7,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_8,Family_Hist_1,Response,Medical_Keyword,Medical_History,BMI_InsAge
0,1,D3,10,0.076923,2,1,1,0.028,12,1,3,1,2,6,3,1,2,1,1,1,3,1,1,2,8,0,186,0.207304
1,1,A1,26,0.076923,2,3,1,0.0,1,3,2,1,2,6,3,1,2,1,2,1,3,1,3,2,4,0,482,0.016256
2,1,E1,26,0.076923,2,3,1,0.03,9,1,2,1,2,8,3,1,1,1,2,1,1,3,2,3,8,0,78,0.012799
3,1,D4,10,0.487179,2,3,1,0.042,9,1,3,2,2,8,3,1,2,1,2,1,1,3,2,3,8,1,425,0.057863
4,1,D2,26,0.230769,2,3,1,0.027,9,1,2,1,2,6,3,1,2,1,2,1,1,3,2,2,8,0,237,0.177213


In [33]:
X = new_train_data.drop(['Response'], axis=1)
y = new_train_data['Response']

In [34]:
# save column names from dataset 
X_cols=list(X.columns.copy())

# get the unique values in prod_info2 to serve as column name when one hot encoding is applied later
prod_info_cols = list(X['Product_Info_2'].unique())

# merge old column (cols) with new column name from the unique value of prod_info_2

pos = 1 # bcos we have Product_Info_1 in the 0 index.
# we insert prod_info_cols into X_cols 
for i in range(len(prod_info_cols)): 
    X_cols.insert(i + pos, prod_info_cols[i]) #insert method takes in the index position and the value to insert

# since we have new columns from one hot encodin of prod_info_2, we delete product_info_2 from the old column list
item = 'Product_Info_2'
index = X_cols.index(item)
del X_cols[index]

# lenght of the remaining column 
len(X_cols)

45

In [35]:
# Our Model won't understand alphanumeric input so We label encode product info 2 feature
# column_trans handles different preprocessing steps for different features,
# and pass through the remaining features that does not require any preprocessiong step

column_trans = make_column_transformer(
    (OneHotEncoder(), ['Product_Info_2']),
    remainder = StandardScaler())

In [36]:
X = column_trans.fit_transform(X)
print(X.shape)

(59381, 45)


In [37]:
new_test_data = column_trans.transform(new_test_data)

In [None]:
# Checking for target class distribution in the dataset
y['Response'].value_counts()

In [39]:
# Balancing our training dataset using resampling by over-sampling
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=27)
X, y = sm.fit_sample(X, y)
X.shape, y.shape                                                                 

((155912, 45), (155912,))

In [42]:
# count to ensure the class target is balanced 
from collections import Counter
print(sorted(Counter(y).items()))

[(1, 19489), (2, 19489), (3, 19489), (4, 19489), (5, 19489), (6, 19489), (7, 19489), (8, 19489)]


In [44]:
# make X a dataframe again and use column names saved earlier X_cols
X = pd.DataFrame(X,columns=X_cols)
X.head()

Unnamed: 0,Product_Info_1,D3,A1,E1,D4,D2,A8,A2,D1,A7,A6,A3,A5,C4,C1,...,InsuredInfo_2,InsuredInfo_3,InsuredInfo_4,InsuredInfo_5,InsuredInfo_6,InsuredInfo_7,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_8,Family_Hist_1,Medical_Keyword,Medical_History,BMI_InsAge
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0865,0.061379,0.362834,-0.117377,1.201609,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,-1.388458,-1.420309,-0.854442,-0.789057,0.121993
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0865,0.061379,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,0.862391,-1.013721,1.260049,-1.420309,-0.854442,0.868225,-1.591454
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0865,0.809179,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,0.64942,-0.854442,-1.393741,-1.622455
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0865,0.809179,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,0.64942,-0.178868,0.549086,-1.218294
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0865,0.061379,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,-1.420309,-0.854442,-0.503512,-0.14788


### Feature Selection

In [45]:
# Check for zero variance or degenerate variables and drop them
near_zero = VarianceThreshold(threshold = 0.10)
near_zero.fit_transform(X)
highvar = near_zero.get_support(indices=True)

print(X.columns[highvar], len(X.columns[highvar]))

Index(['C3', 'C2', 'B1', 'Product_Info_3', 'Product_Info_4', 'Product_Info_5',
       'Product_Info_6', 'Product_Info_7', 'Employment_Info_1',
       'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5',
       'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4',
       'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7',
       'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3',
       'Insurance_History_4', 'Insurance_History_8', 'Family_Hist_1',
       'Medical_Keyword', 'Medical_History', 'BMI_InsAge'],
      dtype='object') 28


In [46]:
X = X[X.columns[highvar]]
X

Unnamed: 0,C3,C2,B1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_5,InsuredInfo_1,InsuredInfo_2,InsuredInfo_3,InsuredInfo_4,InsuredInfo_5,InsuredInfo_6,InsuredInfo_7,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_8,Family_Hist_1,Medical_Keyword,Medical_History,BMI_InsAge
0,1.0,0.000000,-0.164525,-2.841731,-0.891949,-0.083689,-2.264385,-0.149284,-0.602215,0.794450,-0.420829,2.448480,-0.500858,-0.0865,0.061379,0.362834,-0.117377,1.201609,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,-1.388458,-1.420309,-0.854442,-0.789057,0.121993
1,0.0,0.000000,-0.164525,0.312319,-0.891949,-0.083689,0.441621,-0.149284,-0.942298,-1.807840,2.376264,-0.408417,-0.500858,-0.0865,0.061379,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,0.862391,-1.013721,1.260049,-1.420309,-0.854442,0.868225,-1.591454
2,0.0,0.000000,-0.164525,0.312319,-0.891949,-0.083689,0.441621,-0.149284,-0.577924,0.084735,-0.420829,-0.408417,-0.500858,-0.0865,0.809179,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,0.649420,-0.854442,-1.393741,-1.622455
3,0.0,1.000000,-0.164525,-2.841731,0.559979,-0.083689,0.441621,-0.149284,-0.432174,0.084735,-0.420829,2.448480,1.891857,-0.0865,0.809179,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,0.649420,-0.178868,0.549086,-1.218294
4,0.0,0.000000,-0.164525,0.312319,-0.347476,-0.083689,0.441621,-0.149284,-0.614361,0.084735,-0.420829,-0.408417,-0.500858,-0.0865,0.061379,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,-1.420309,-0.854442,-0.503512,-0.147880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155907,0.0,0.739177,-0.164525,0.312319,0.323294,-0.083689,0.441621,6.701286,-0.251564,-0.964476,-0.420829,-0.408417,-0.500858,-0.0865,-1.060320,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,0.862391,-0.737932,1.260049,0.109587,-0.355073,-0.586266,-0.727042
155908,1.0,0.000000,6.078086,0.312319,-0.052050,-0.083689,0.441621,-0.149284,-0.073029,0.270376,-0.420829,-0.408417,-0.125338,-0.0865,0.178741,0.362834,-0.117377,-0.832218,-0.140157,-1.634368,-0.169414,0.862391,-0.847772,-1.388458,0.649420,0.496706,-0.394529,-0.552375
155909,0.0,0.000000,6.078086,0.312319,-0.382675,-0.083689,-2.264385,-0.149284,0.021766,-1.334696,-0.420829,-0.408417,-0.500858,-0.0865,-0.502300,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,0.862391,-1.013721,1.260049,0.031634,-0.380518,-0.580896,0.860191
155910,1.0,0.000000,-0.164525,0.312319,2.374890,-0.083689,-2.264385,-0.149284,0.332777,0.741369,-0.420829,-0.408417,-0.500858,-0.0865,1.431862,0.362834,-0.117377,-0.832218,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,-1.388458,0.649420,-0.553896,1.300555,0.169337


In [47]:
# make new_test_data dataframe again and use column names saved earlier X_cols
new_test_data = pd.DataFrame(new_test_data,columns=X_cols)
new_test_data= new_test_data[new_test_data.columns[highvar]]
new_test_data

Unnamed: 0,C3,C2,B1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_5,InsuredInfo_1,InsuredInfo_2,InsuredInfo_3,InsuredInfo_4,InsuredInfo_5,InsuredInfo_6,InsuredInfo_7,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_8,Family_Hist_1,Medical_Keyword,Medical_History,BMI_InsAge
0,1.0,0.0,-0.164525,0.312319,0.559979,-0.083689,0.441621,-0.149284,0.879575,-1.334696,-0.420829,-0.408417,1.891857,-0.0865,1.930878,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,0.649420,1.172279,-1.354549,0.854665
1,0.0,0.0,-0.164525,0.312319,-0.891949,-0.083689,0.441621,-0.149284,-0.942298,-1.807840,2.376264,-0.408417,-0.500858,-0.0865,0.809179,0.362834,-0.117377,-0.832218,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,-1.388458,-1.420309,-0.854442,0.056380,0.989410
2,1.0,0.0,-0.164525,0.312319,-0.652199,-0.083689,0.441621,-0.149284,0.794554,0.084735,-0.420829,-0.408417,-0.500858,-0.0865,-1.060320,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,0.649420,1.172279,-0.688277,0.972770
3,0.0,0.0,-0.164525,0.312319,-0.627277,-0.083689,-2.264385,-0.149284,1.608324,0.084735,-0.420829,-0.408417,1.891857,-0.0865,-1.060320,0.362834,-0.117377,-0.832218,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,-1.388458,-1.420309,1.172279,-0.497913,0.544234
4,0.0,0.0,-0.164525,0.312319,-0.891949,-0.083689,0.441621,-0.149284,0.090096,0.084735,-0.420829,-0.408417,-0.500858,-0.0865,0.809179,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,-1.159587,1.101046,-0.064204,-1.420309,-0.178868,-0.413929,-0.589476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19760,1.0,0.0,-0.164525,0.312319,-0.710458,-0.083689,0.441621,-0.149284,-0.152820,0.084735,-0.420829,-0.408417,1.891857,-0.0865,0.809179,0.362834,-0.117377,-0.832218,-0.140157,0.611857,-0.169414,0.862391,-1.013721,1.260049,0.649420,-0.854442,-1.326554,-0.364178
19761,0.0,1.0,-0.164525,-2.841731,-0.347476,-0.083689,0.441621,-0.149284,-0.760111,1.267594,-0.420829,2.448480,-0.500858,-0.0865,0.809179,0.362834,-0.117377,1.201609,-0.140157,0.611857,-0.169414,0.862391,0.043662,1.260049,-1.420309,-0.854442,-1.315356,-1.199495
19762,0.0,0.0,-0.164525,0.312319,-0.891949,-0.083689,0.441621,-0.149284,-0.322861,-1.807840,2.376264,-0.408417,-0.500858,-0.0865,0.809179,-2.756080,-0.117377,-0.832218,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,-1.388458,-1.420309,1.172279,-1.332153,3.267877
19763,0.0,0.0,-0.164525,0.312319,0.559979,-0.083689,0.441621,-0.149284,-0.152820,1.267594,-0.420829,-0.408417,-0.500858,-0.0865,-0.686420,0.362834,-0.117377,-0.832218,-0.140157,-1.634368,-0.169414,0.862391,-1.013721,1.260049,0.649420,-0.854442,-0.509111,2.098461


### Cross Validation

In [48]:
# Cross Validation using Random Forest Classifier
rfc = RandomForestClassifier()
rfc_scores=cross_val_score(rfc, X,y, cv=10, scoring='accuracy')
print(rfc_scores)
print(rfc_scores.mean())

[0.59831965 0.64539507 0.66621769 0.71663139 0.77365147 0.79577962
 0.80783786 0.81412353 0.80431018 0.81258418]
0.7434850637244212


In [49]:
# Cross Validation using K Neighbors CLassifier
knn = KNeighborsClassifier()
knn_scores = cross_val_score(knn, X,y, cv=10, scoring='accuracy')
print(knn_scores)
print(knn_scores.mean())

[0.63282453 0.63545408 0.63472516 0.67032262 0.71066641 0.72477712
 0.74472452 0.74299275 0.74645629 0.74934257]
0.6992286044602372


In [50]:
# Cross Validation using XGBoost CLassifier
xgb = XGBClassifier()
xgb_scores=cross_val_score(xgb, X, y, cv=10, scoring='accuracy')
print(xgb_scores)
print(xgb_scores.mean())

[0.29117496 0.34049513 0.3532166  0.36052851 0.37053428 0.37893657
 0.36373549 0.36957219 0.36630107 0.36803284]
0.3562527633244358


In [52]:
# Cross Validation using Logistic Regression CLassifier
lr = LogisticRegression()
lr_scores=cross_val_score(lr, X, y, cv=10, scoring='accuracy')
print(lr_scores)
print(lr_scores.mean())

[0.26686762 0.26282709 0.27868642 0.27766019 0.27547944 0.28048233
 0.27676223 0.27977679 0.27445321 0.28214996]
0.2755145295701671


## Hyperparameter Tuning

In [59]:
param_dist = dict(n_estimators = [100,200,300,400])
rfc_random = RandomizedSearchCV(rfc, param_dist,cv=10, scoring='accuracy',n_iter=10,random_state=5, n_jobs=-1,verbose=1) 
rfc_random.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 43.0min finished


RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'n_estimators': [100, 200, 300, 400]},
                   random_state=5, scoring='accuracy', verbose=1)

In [61]:
print('Random Forest Classifier Best Accuracy: ',rfc_random.best_score_)
print('Random Forest Classifier Best tuned parameter: ',rfc_random.best_params_)
print('Random Forest Classifier Best tuned parameters: ',rfc_random.best_estimator_)

Random Forest Classifier Best Accuracy:  0.753061024668591
Random Forest Classifier Best tuned parameter:  {'n_estimators': 400}
Random Forest Classifier Best tuned parameters:  RandomForestClassifier(n_estimators=400)


In [70]:
# We split the training dataset to get some sample for evaluating model performance
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.20, random_state=42)

In [72]:
# Predicitng some samples from dataset it was trained with
eval_pred = rfc_random.predict(X_eval)

In [73]:
# The result from predicted samples and how they compare to the actual
eval_result = pd.DataFrame({'y_train_actual': y_eval,'y_train_predicted':eval_pred})
eval_result.head()

Unnamed: 0,y_train_actual,y_train_predicted
0,3,3
1,4,4
2,5,5
3,6,6
4,2,2


In [74]:
# Predicting Response for the test dataset
test_pred = rfc_random.predict(new_test_data)

In [75]:
# Applicant Id that was copied earlier is now used to track response 
test_data_result = pd.DataFrame({'Applicant Id':Id ,'Predicted Response':test_pred})
test_data_result.head()

Unnamed: 0,Applicant Id,Predicted Response
0,1,1
1,3,5
2,4,1
3,9,2
4,12,8


In [76]:
# Count of test_data instances to Predicted Response
test_data_result['Predicted Response'].value_counts()

8    8499
6    3857
7    1989
1    1936
2    1777
5    1229
4     298
3     180
Name: Predicted Response, dtype: int64

In [77]:
# save the trained model into disk using pickle
pickle.dump(rfc_random, open('plimodel.pkl','wb'))


In [78]:
# Loading model to compare the results
model = pickle.load(open('plimodel.pkl','rb'))

In [80]:
#make a prediction
print('Predicted Class:', model.predict([[0,0,1,26,0.23,2,3,1,0.027,9,1,2,1,2,6,3,1,2,1,2,1,1,3,2,2,12,156,0.34]])[0])

Predicted Class: 1
