In [3]:
#Packages related to general operating system & warnings
import os 
import warnings
warnings.filterwarnings('ignore')

#Packages related to data importing, manipulation, exploratory data analysis, data understanding
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pandas_profiling
import scipy.stats as stats

#Packages related to data visualizaiton
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Setting plot sizes and type of plot
plt.rc("font", size=14)
plt.rcParams['axes.grid'] = True
plt.figure(figsize=(6,3))
plt.gray()

from matplotlib.backends.backend_pdf import PdfPages

#Modules related to split the data & gridsearch
from sklearn.model_selection import train_test_split, GridSearchCV

#Module related to calculation of metrics
from sklearn import metrics

#Module related to VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Modules related to preprocessing (Imputation of missings, standardiszation, new features creation, converting categorical to numerical)
from sklearn.impute import MissingIndicator, SimpleImputer
#from sklearn.preprocessing import Imputer, PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder

#Moudles related to feature selection
from sklearn.feature_selection import RFE, RFECV, SelectKBest, chi2, SelectPercentile, f_classif, mutual_info_classif, f_regression, VarianceThreshold, SelectFromModel, mutual_info_classif, mutual_info_regression, SelectFpr, SelectFdr, SelectFwe


#Modules related to pipe line creation for faster processing
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
#from sklearn.features.transformers import DataFrameSelector

#Modules related to saving python objects permanantly
from sklearn.externals import joblib

#Dumping model into current directory: joblib.dump(model_xg,"my_model.pkl") 
#Loading model: my_model_loaded=joblib.load("my_model.pkl")

#Modules related key techniques of supervised learning 
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa

from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, export
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor 
from xgboost import XGBClassifier, XGBRegressor
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor

XGBoostLibraryNotFound: Cannot find XGBoost Library in the candidate path, did you install compilers and run build.sh in root path?
List of candidates:
C:\Users\om\AppData\Roaming\Python\Python37\site-packages\xgboost\xgboost.dll
C:\Users\om\AppData\Roaming\Python\Python37\site-packages\xgboost\../../lib/xgboost.dll
C:\Users\om\AppData\Roaming\Python\Python37\site-packages\xgboost\./lib/xgboost.dll
C:\Users\om\Anaconda3\xgboost\xgboost.dll
C:\Users\om\AppData\Roaming\Python\Python37\site-packages\xgboost\../../windows/x64/Release/xgboost.dll
C:\Users\om\AppData\Roaming\Python\Python37\site-packages\xgboost\./windows/x64/Release/xgboost.dll

<Figure size 432x216 with 0 Axes>

In [2]:
data = pd.read_csv('bank-additional.csv', sep=";")

In [3]:
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [4]:
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [5]:
profile_report = pandas_profiling.ProfileReport(data)

In [6]:
profile_report.to_file("profile_report.html")

In [7]:
data.drop(['default', 'euribor3m', 'nr.employed', 'cons.price.idx', 'pdays'], axis=1, inplace=True)

In [8]:
data.columns

Index(['age', 'job', 'marital', 'education', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'duration', 'campaign', 'previous', 'poutcome',
       'emp.var.rate', 'cons.conf.idx', 'y'],
      dtype='object')

In [9]:
cat_var = data[['job', 'marital', 'education', 'housing', 'loan', 'month', 'day_of_week','contact', 'poutcome', 'y']]
num_var = data[['age',  'duration', 'emp.var.rate', 'cons.conf.idx',  'previous', 'campaign']]

In [10]:
def missing_cat(x):
    t = x.value_counts()
    x[x=='unknown'] = t.index[0]
    return x

def missing_num(x):
    x[x==999] = np.nan
    x = x.fillna(x.median())
    return x

def outlier_capping(x):
    x = x.clip_upper(x.quantile(0.99))
    x = x.clip_lower(x.quantile(0.01))
    return x

In [11]:
num_var = num_var.apply(missing_num)
num_var = num_var.apply(outlier_capping)
cat_var = cat_var.apply(missing_cat)

In [12]:
data_final = pd.concat([num_var, cat_var],axis=1)

In [13]:
data_final = pd.get_dummies(data_final, columns = cat_var.columns,  drop_first=True)

In [14]:
data_final.columns = [i.replace('.', '_') for i in data_final.columns]

In [15]:
data_final.columns = [i.replace('-', '_') for i in data_final.columns]

In [16]:
data_final.apply(lambda x: np.std(x)/np.mean(x))

age                               0.249547
duration                          0.911392
emp_var_rate                     18.393391
cons_conf_idx                    -0.113200
previous                          2.715155
campaign                          0.897168
job_blue_collar                   1.912983
job_entrepreneur                  5.179873
job_housemaid                     6.037007
job_management                    3.422421
job_retired                       4.879882
job_self_employed                 4.990557
job_services                      3.079110
job_student                       7.016531
job_technician                    2.227314
job_unemployed                    6.009002
marital_married                   0.796570
marital_single                    1.603876
education_basic_6y                4.131076
education_basic_9y                2.485148
education_high_school             1.863414
education_illiterate             64.171645
education_professional_course     2.588255
education_u

In [17]:
data_final.head()

Unnamed: 0,age,duration,emp_var_rate,cons_conf_idx,previous,campaign,job_blue_collar,job_entrepreneur,job_housemaid,job_management,...,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,contact_telephone,poutcome_nonexistent,poutcome_success,y_yes
0,30,487.0,-1.8,-46.2,0,2,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,39,346.0,1.1,-36.4,0,4,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,25,227.0,1.4,-41.8,0,1,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0
3,38,17.0,1.4,-41.8,0,3,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,47,58.0,-0.1,-42.0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [18]:
data_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 43 columns):
age                              4119 non-null int64
duration                         4119 non-null float64
emp_var_rate                     4119 non-null float64
cons_conf_idx                    4119 non-null float64
previous                         4119 non-null int64
campaign                         4119 non-null int64
job_blue_collar                  4119 non-null uint8
job_entrepreneur                 4119 non-null uint8
job_housemaid                    4119 non-null uint8
job_management                   4119 non-null uint8
job_retired                      4119 non-null uint8
job_self_employed                4119 non-null uint8
job_services                     4119 non-null uint8
job_student                      4119 non-null uint8
job_technician                   4119 non-null uint8
job_unemployed                   4119 non-null uint8
marital_married                  4119 n

In [19]:
data_final.columns

Index(['age', 'duration', 'emp_var_rate', 'cons_conf_idx', 'previous',
       'campaign', 'job_blue_collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self_employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'marital_married',
       'marital_single', 'education_basic_6y', 'education_basic_9y',
       'education_high_school', 'education_illiterate',
       'education_professional_course', 'education_university_degree',
       'housing_yes', 'loan_yes', 'month_aug', 'month_dec', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed', 'contact_telephone', 'poutcome_nonexistent',
       'poutcome_success', 'y_yes'],
      dtype='object')

In [20]:
somerced_val = pd.DataFrame()
for i_var in data_final.columns.difference(['y_yes']):
    model = smf.logit('y_yes~'+str(i_var), data=data_final).fit()
    somerce_d = 2*metrics.roc_auc_score(data_final.y_yes, model.predict())-1
    temp = pd.DataFrame([i_var, somerce_d]).T
    somerced_val = pd.concat([somerced_val, temp], axis=0)

Optimization terminated successfully.
         Current function value: 0.344178
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.341558
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.343930
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.334952
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.345418
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.345447
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.345451
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.345421
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.277749
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.345048
  

In [21]:
somerced_val.columns = ['var', 'SomerceD']

In [22]:
somerced_val.sort_values(by = 'SomerceD', inplace=True, ascending=False)

In [23]:
somerced_val.to_csv('somerce_d.csv')

In [24]:
#RFE
X = data_final[data_final.columns.difference(['y_yes'])]
y = data_final[['y_yes']]

rfe = RFE(RandomForestClassifier(), 10)
rfe = rfe.fit(X, y)

In [25]:
X.columns

Index(['age', 'campaign', 'cons_conf_idx', 'contact_telephone',
       'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed', 'duration', 'education_basic_6y',
       'education_basic_9y', 'education_high_school', 'education_illiterate',
       'education_professional_course', 'education_university_degree',
       'emp_var_rate', 'housing_yes', 'job_blue_collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self_employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'loan_yes', 'marital_married', 'marital_single', 'month_aug',
       'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',
       'month_nov', 'month_oct', 'month_sep', 'poutcome_nonexistent',
       'poutcome_success', 'previous'],
      dtype='object')

In [26]:
X.columns[rfe.get_support()]

Index(['age', 'campaign', 'cons_conf_idx', 'duration',
       'education_university_degree', 'emp_var_rate', 'housing_yes',
       'marital_married', 'poutcome_success', 'previous'],
      dtype='object')

In [27]:
#SelectKbest
SKB = SelectKBest(f_classif, k=10).fit(X, y )

In [28]:
X.columns[SKB.get_support()]

Index(['contact_telephone', 'duration', 'emp_var_rate', 'month_dec',
       'month_mar', 'month_oct', 'month_sep', 'poutcome_nonexistent',
       'poutcome_success', 'previous'],
      dtype='object')

In [29]:
#Information value calculation
def calculate_woe_iv(dataset, feature, target):
    lst = []
    for i in range(dataset[feature].nunique()):
        val = list(dataset[feature].unique())[i]
        lst.append({
            'Value': val,
            'All': dataset[dataset[feature] == val].count()[feature],
            'Good': dataset[(dataset[feature] == val) & (dataset[target] == 0)].count()[feature],
            'Bad': dataset[(dataset[feature] == val) & (dataset[target] == 1)].count()[feature]
        })
        
    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()
    
    dset = dset.sort_values(by='WoE')
    
    return dset, iv

In [30]:
for col in data_final.columns:
    if col == 'y_yes': continue
    else:
        print('WoE and IV for column: {}'.format(col))
        df, iv = calculate_woe_iv(data_final, col, 'y_yes')
        print(df)
        print('IV score: {:.2f}'.format(iv))
        print('\n')

WoE and IV for column: age
    Value  All  Good  Bad  Distr_Good  Distr_Bad       WoE        IV
40     64    5     2    3    0.000545   0.006652 -2.501400  0.015275
42     66    6     3    3    0.000818   0.006652 -2.095934  0.012228
24     68   45    23   22    0.006270   0.048780 -2.051483  0.087209
38     61    7     4    3    0.001091   0.006652 -1.808252  0.010056
43     62    5     3    2    0.000818   0.004435 -1.690469  0.006114
41     63    4     3    1    0.000818   0.002217 -0.997322  0.001396
39     67    4     3    1    0.000818   0.002217 -0.997322  0.001396
27     60   31    25    6    0.006816   0.013304 -0.668818  0.004339
15     50   85    71   14    0.019357   0.031042 -0.472312  0.005519
33     59   49    41    8    0.011178   0.017738 -0.461804  0.003030
37     26   62    52   10    0.014177   0.022173 -0.447276  0.003577
26     58   64    54   10    0.014722   0.022173 -0.409536  0.003051
20     33  170   145   25    0.039531   0.055432 -0.338077  0.005376
7      

   Value   All  Good  Bad  Distr_Good  Distr_Bad       WoE        IV
1      0  3235  2845  390    0.775627   0.864745 -0.108763  0.009693
0      1   884   823   61    0.224373   0.135255  0.506148  0.045107
IV score: 0.05


WoE and IV for column: job_entrepreneur
   Value   All  Good  Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  3971  3528  443    0.961832   0.982262 -0.021018  0.000429
1      1   148   140    8    0.038168   0.017738  0.766266  0.015655
IV score: 0.02


WoE and IV for column: job_housemaid
   Value   All  Good  Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  4009  3569  440     0.97301    0.97561 -0.002668  0.000007
1      1   110    99   11     0.02699    0.02439  0.101290  0.000263
IV score: 0.00


WoE and IV for column: job_management
   Value   All  Good  Bad  Distr_Good  Distr_Bad       WoE        IV
0      0  3795  3374  421    0.919847   0.933481 -0.014713  0.000201
1      1   324   294   30    0.080153   0.066519  0.186448  0.002542
IV sco

In [31]:
#Final List

Final_list = [
    'duration',
'emp_var_rate',
'previous',
'poutcome_nonexistent',
'contact_telephone',
'poutcome_success',
'month_may',
#'campaign',
'job_blue_collar',
#'cons_conf_idx',
#'age',
'day_of_week_thu',
'month_dec',
'housing_yes',
#'marital_married',
'month_mar',
'month_oct',
'month_sep'

]

In [32]:
X_new = X[Final_list]
y = y

In [33]:
range(X_new.shape[1])

range(0, 14)

In [34]:
### VIF Calculation for variables
vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(X_new.values, i) for i in range(X_new.shape[1])]
vif["features"] = X_new.columns

In [35]:
vif.sort_values(by='VIF_Factor',ascending=False)

Unnamed: 0,VIF_Factor,features
3,4.417267,poutcome_nonexistent
4,2.306712,contact_telephone
10,2.204365,housing_yes
0,2.162618,duration
6,2.016512,month_may
2,2.00677,previous
1,1.75467,emp_var_rate
5,1.42976,poutcome_success
7,1.297209,job_blue_collar
8,1.260153,day_of_week_thu


In [36]:
data_final1 = pd.concat([X_new, y], axis=1)

In [37]:
#split the data into train & test (70%:30%)
train, test = train_test_split(data_final1, test_size = 0.3, random_state=123 )

In [38]:
train.shape

(2883, 15)

In [39]:
test.shape

(1236, 15)

In [40]:
eqn = 'y_yes~ '+'+'.join(X_new.columns.difference(['contact_telephone', 'month_oct', 'housing_yes', 'day_of_week_thu', 'month_dec', 'job_blue_collar' ]))

In [41]:
#Implementation Model building
#Logistic Regression

logit_model = smf.logit(formula = eqn, data=train).fit()

Optimization terminated successfully.
         Current function value: 0.198051
         Iterations 8


In [42]:
print(logit_model.summary())

                           Logit Regression Results                           
Dep. Variable:                  y_yes   No. Observations:                 2883
Model:                          Logit   Df Residuals:                     2874
Method:                           MLE   Df Model:                            8
Date:                Fri, 20 Dec 2019   Pseudo R-squ.:                  0.4160
Time:                        09:10:55   Log-Likelihood:                -570.98
converged:                       True   LL-Null:                       -977.64
Covariance Type:            nonrobust   LLR p-value:                2.773e-170
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -5.3222      0.422    -12.600      0.000      -6.150      -4.494
duration                 0.0055      0.000     17.747      0.000       0.005       0.006
emp_var_rate

#Mathematical equation:
LE =  0.0055*duration -0.6566*emp_var_rate+2.2181*month_mar-0.9119*month_may+0.9128*month_sep+0.9652*poutcome_nonexistent+2.1440*poutcome_success+0.5657*previous-5.3222
P(y_yes=1) = exp(LE)/(1+exp(LE))

In [43]:
train['pred_prob'] =logit_model.predict(train)

In [44]:
test['pred_prob'] =logit_model.predict(test)

In [45]:
train_Gini = 2*metrics.roc_auc_score(train.y_yes, train.pred_prob)-1
print(train_Gini)

0.8690821818034677


In [46]:
test_Gini = 2*metrics.roc_auc_score(test.y_yes, test.pred_prob)-1
print(test_Gini)

0.8619569088319088


In [47]:
train.y_yes.count()

2883

In [49]:
roc_df = pd.DataFrame()
for cut_off in np.linspace(0,1):
    #roc_df_temp['cut_off'] = cut_off
    train['y_pred'] = np.where(train.pred_prob>cut_off, 1, 0)
    train['TP'] = np.where(((train.y_yes ==1) & (train.y_pred==1)), 1, 0)
    train['TN'] = np.where(((train.y_yes ==0) & (train.y_pred==0)), 1, 0)
    train['FP'] = np.where(((train.y_yes ==0) & (train.y_pred==1)), 1, 0)
    train['FN'] = np.where(((train.y_yes ==1) & (train.y_pred==0)), 1, 0)
    sensitivity = train.TP.sum()/train.y_yes.sum()
    specificity = train.TN.sum()/(1-train.y_yes).sum()
    accuracy = (train.TN.sum()+train.TP.sum())/train.y_yes.count()
    roc_like_table = pd.DataFrame([cut_off, sensitivity, specificity, accuracy]).T
    roc_like_table.columns = ['cutoff', 'sensitivity', 'specificity', 'accuracy']
    roc_df = pd.concat([roc_df, roc_like_table], axis=0)
    

In [50]:
roc_df['total'] = roc_df.sensitivity + roc_df.specificity

In [51]:
roc_df[roc_df.total == roc_df.total.max()]

Unnamed: 0,cutoff,sensitivity,specificity,accuracy,total
0,0.081633,0.934853,0.811724,0.824835,1.746577


In [52]:
#train.y_yes.mean()

In [53]:
#Best Cut-off = 0.082

In [54]:
train['y_pred'] = np.where(train.pred_prob>0.082, 1, 0)
test['y_pred'] = np.where(test.pred_prob>0.082, 1, 0)

In [55]:
print(metrics.classification_report(train.y_yes, train.y_pred))

              precision    recall  f1-score   support

           0       0.99      0.81      0.89      2576
           1       0.37      0.93      0.53       307

    accuracy                           0.83      2883
   macro avg       0.68      0.87      0.71      2883
weighted avg       0.92      0.83      0.85      2883



In [56]:
print(metrics.classification_report(test.y_yes, test.y_pred))

              precision    recall  f1-score   support

           0       0.98      0.82      0.90      1092
           1       0.40      0.90      0.56       144

    accuracy                           0.83      1236
   macro avg       0.69      0.86      0.73      1236
weighted avg       0.92      0.83      0.86      1236



In [57]:
import pickle

In [58]:
with open('logit_model.pkl', 'wb') as f:
    pickle.dump(logit_model, f)

In [59]:
train_X, test_X, train_y, test_y = train_test_split(X_new, y, test_size=0.3, random_state=123)

In [60]:

#Decision Tree Classifier
dt_model = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=10)
dt_model.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [61]:
2*metrics.roc_auc_score(train_y, pd.DataFrame(dt_model.predict_proba(train_X))[1])-1

0.8414113237704091

In [62]:
2*metrics.roc_auc_score(test_y, pd.DataFrame(dt_model.predict_proba(test_X))[1])-1

0.8017780830280832

In [63]:
train_pred = np.where(pd.DataFrame(dt_model.predict_proba(train_X))[1]>0.10, 1, 0)
test_pred = np.where(pd.DataFrame(dt_model.predict_proba(test_X))[1]>0.10, 1, 0)

In [64]:
print(metrics.classification_report(train_y, train_pred))

              precision    recall  f1-score   support

           0       0.99      0.84      0.91      2576
           1       0.40      0.91      0.56       307

    accuracy                           0.85      2883
   macro avg       0.70      0.87      0.73      2883
weighted avg       0.93      0.85      0.87      2883



In [65]:
print(metrics.classification_report(test_y, test_pred))

              precision    recall  f1-score   support

           0       0.98      0.86      0.91      1092
           1       0.44      0.86      0.58       144

    accuracy                           0.86      1236
   macro avg       0.71      0.86      0.75      1236
weighted avg       0.92      0.86      0.88      1236



In [66]:
train_X.columns

Index(['duration', 'emp_var_rate', 'previous', 'poutcome_nonexistent',
       'contact_telephone', 'poutcome_success', 'month_may', 'job_blue_collar',
       'day_of_week_thu', 'month_dec', 'housing_yes', 'month_mar', 'month_oct',
       'month_sep'],
      dtype='object')

In [67]:
dt_model.feature_importances_

array([0.5518749 , 0.12035175, 0.        , 0.        , 0.        ,
       0.24791744, 0.07985591, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

In [68]:
feature_improtance = pd.concat([pd.Series(train_X.columns), pd.Series(dt_model.feature_importances_)], axis=1)

In [69]:
feature_improtance.columns = ['feature', 'importance']

In [70]:
feature_improtance.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,duration,0.551875
5,poutcome_success,0.247917
1,emp_var_rate,0.120352
6,month_may,0.079856
2,previous,0.0
3,poutcome_nonexistent,0.0
4,contact_telephone,0.0
7,job_blue_collar,0.0
8,day_of_week_thu,0.0
9,month_dec,0.0


In [71]:
#Using Hyperparameters
param_grid = {'max_depth': [3,4,5,6], 'max_leaf_nodes': [5,6,7,8,9,10]}

dt_GCV_model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=10, scoring='f1_weighted')
dt_GCV_model.fit(train_X, train_y)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5, 6],
                         'max_leaf_nodes': [5, 6, 7, 8, 9, 10]},
             pre_d

In [72]:
dt_GCV_model.best_params_

{'max_depth': 5, 'max_leaf_nodes': 9}

In [73]:
dt_GCV_model.best_score_

0.897691004062523

In [74]:
#sorted(sklearn.metrics.SCORERS.keys())

In [75]:
#Decision Tree Classifier
dt_model = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=9)
dt_model.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=9,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [76]:
# Random Forest

In [78]:
RF_model = RandomForestClassifier(n_estimators=50, max_depth=5, max_features=6).fit(train_X, train_y)

In [79]:
RF_model.feature_importances_

array([0.49008964, 0.17831794, 0.03637495, 0.01193079, 0.00715152,
       0.14908179, 0.04006762, 0.00819602, 0.00694701, 0.00652872,
       0.00811315, 0.0385486 , 0.00848582, 0.01016644])

In [80]:
train_X.columns

Index(['duration', 'emp_var_rate', 'previous', 'poutcome_nonexistent',
       'contact_telephone', 'poutcome_success', 'month_may', 'job_blue_collar',
       'day_of_week_thu', 'month_dec', 'housing_yes', 'month_mar', 'month_oct',
       'month_sep'],
      dtype='object')

In [83]:
metrics.roc_auc_score(train_y, pd.DataFrame(RF_model.predict_proba(train_X))[1])

0.9499387986323264

In [84]:
metrics.roc_auc_score(test_y, pd.DataFrame(RF_model.predict_proba(test_X))[1])

0.9309657356532357

In [85]:
#using gridsearch
param_grid = {'max_depth': [5,6,7,8,9,10], 'n_estimators':[50,100,200], 'max_features':[4,5,6,7,8]}
RF_GCV_model = GridSearchCV(RandomForestClassifier(), param_grid, cv=10, scoring='f1_weighted')
RF_GCV_model = RF_GCV_model.fit(train_X, train_y)

In [86]:
RF_GCV_model.best_params_

{'max_depth': 7, 'max_features': 7, 'n_estimators': 100}

In [87]:
RF_GCV_model.best_score_

0.9026519547922791

In [88]:
RF_GCV_model.predict_proba(train_X)

array([[0.99700643, 0.00299357],
       [0.98891132, 0.01108868],
       [0.98342552, 0.01657448],
       ...,
       [0.9961278 , 0.0038722 ],
       [0.4193995 , 0.5806005 ],
       [0.65498109, 0.34501891]])