In [1]:
#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedKFold
import pickle

In [2]:
import os

# Directory that holds UniversalBank.csv. Leave empty to use the current
# working directory; os.chdir('') raises FileNotFoundError, so the directory
# change is only attempted when a path has actually been supplied.
DATA_DIR = r""
if DATA_DIR:
    os.chdir(DATA_DIR)

df = pd.read_csv('UniversalBank.csv')

In [3]:
# Rows whose Experience is -1, -2 or -3 are treated as invalid entries.
# Each group is overwritten with the mean of the positive Experience values
# found among rows that share an Age with that group.

def _impute_negative_experience(frame, bad_value):
    """Overwrite rows where Experience == bad_value with a pooled mean.

    The replacement is the mean of Experience over every row whose Age occurs
    among the affected rows and whose Experience is strictly positive. The
    frame is modified in place. Nothing happens when no row carries
    ``bad_value``, so re-running the cell is harmless.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'Age' and 'Experience' columns.
    bad_value : int
        The invalid Experience code to replace.
    """
    is_bad = frame['Experience'] == bad_value
    if not is_bad.any():
        return
    affected_ages = frame.loc[is_bad, 'Age'].unique()
    donors = frame['Age'].isin(affected_ages) & (frame['Experience'] > 0)
    # Computed once per code instead of once per affected row.
    frame.loc[is_bad, 'Experience'] = frame.loc[donors, 'Experience'].mean()


# The replacement means are fractional, so make the column float up front
# rather than relying on an implicit upcast during assignment.
df['Experience'] = df['Experience'].astype(float)

# Order matters: rows fixed for one code count as positive "donor" rows
# when the next code is processed.
for bad_value in (-1, -2, -3):
    _impute_negative_experience(df, bad_value)

In [4]:
# Remove the columns that are not used for modelling. Reassigning (instead of
# inplace=True) and ignoring already-missing columns lets this cell be re-run
# without raising KeyError.
df = df.drop(columns=['ZIP Code', 'ID'], errors='ignore')


In [5]:
# Put the columns into a fixed order, with the 'Personal Loan' column last

column_order = [
    'Age', 'Experience', 'Income', 'Education', 'Family', 'CreditCard',
    'CCAvg', 'Online', 'Mortgage', 'Securities Account', 'CD Account',
    'Personal Loan',
]
df = df[column_order]
df.head()

Unnamed: 0,Age,Experience,Income,Education,Family,CreditCard,CCAvg,Online,Mortgage,Securities Account,CD Account,Personal Loan
0,25,1.0,49,1,4,0,1.6,0,0,1,0,0
1,45,19.0,34,1,3,0,1.5,0,0,1,0,0
2,39,15.0,11,1,1,0,1.0,0,0,0,0,0
3,35,9.0,100,2,1,0,2.7,0,0,0,0,0
4,35,8.0,45,2,4,1,1.0,0,0,0,0,0


In [6]:
# Experience is kept and Age is left out because the two are highly correlated.
# Columns are chosen by name rather than by position so that a change in
# column order cannot silently alter the feature set.

feature_columns = [
    'Experience', 'Income', 'Education', 'Family', 'CreditCard',
    'CCAvg', 'Online', 'Mortgage', 'Securities Account', 'CD Account',
]
x = df[feature_columns]
y = df['Personal Loan']

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
# Candidate hyper-parameter values; every combination is tried by the search
param_grid = dict(
    learning_rate=[0.3, 0.6, 0.9],
    subsample=[0.3, 0.6, 0.9],
    max_depth=[3, 6, 9],
    max_features=[3, 6, 9],
    min_samples_leaf=range(1, 5),
    min_samples_split=[3, 6, 9],
    random_state=[5],  # single value: pins the seed for every candidate
)


# Base estimator handed to the grid search. The max_depth given here is
# replaced for every candidate, because param_grid also lists max_depth;
# n_estimators is not in the grid and therefore stays at 200.
GBC = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=2,
)

In [8]:
# Configure an exhaustive search over param_grid, scored by cross-validated recall
folds = 10
grid_search_GBC = GridSearchCV(
    estimator=GBC,
    param_grid=param_grid,
    scoring='recall',
    cv=folds,
    return_train_score=True,
    n_jobs=-1,   # use every available core
    verbose=1,
)



In [9]:
# Run the search. The recorded output shows 9720 fits taking about 53 minutes
# on 8 workers, so expect this cell to be slow.
grid_search_GBC.fit(X=x, y=y)

Fitting 10 folds for each of 972 candidates, totalling 9720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 19.2min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed: 23.3min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed: 30.7min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 36.7min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed: 42.6min
[Parallel(n_jobs=-1)]: Done 9720 out of 9720 | elapsed: 53.0min finished


GridSearchCV(cv=10,
             estimator=GradientBoostingClassifier(max_depth=2,
                                                  n_estimators=200),
             n_jobs=-1,
             param_grid={'learning_rate': [0.3, 0.6, 0.9],
                         'max_depth': [3, 6, 9], 'max_features': [3, 6, 9],
                         'min_samples_leaf': range(1, 5),
                         'min_samples_split': [3, 6, 9], 'random_state': [5],
                         'subsample': [0.3, 0.6, 0.9]},
             return_train_score=True, scoring='recall', verbose=1)

In [10]:
# Rebind GBC to the estimator built from the best-scoring parameter
# combination found by the grid search.
GBC = grid_search_GBC.best_estimator_

In [11]:
# No extra fit is needed: GridSearchCV was created with its default
# refit=True, so best_estimator_ has already been trained on the full x, y
# that were passed to the search. Just display the fitted estimator.
GBC

GradientBoostingClassifier(learning_rate=0.3, max_features=6,
                           min_samples_leaf=4, min_samples_split=9,
                           n_estimators=200, random_state=5, subsample=0.9)

In [12]:
# Persist the tuned model. The context manager guarantees the file is flushed
# and closed even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(GBC, model_file)


In [16]:
# Reload the saved model to confirm that it round-trips and still predicts.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# A single example row, keyed by the feature names the model was trained on so
# that each value is unambiguous and lines up with the training columns.
sample = pd.DataFrame([{
    'Experience': 10,
    'Income': 120,
    'Education': 2,
    'Family': 2,
    'CreditCard': 1,
    'CCAvg': 3,
    'Online': 1,
    'Mortgage': 0,
    'Securities Account': 0,
    'CD Account': 1,
}])
print(model.predict(sample))

[1]
