In [1]:
# Add needed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Preprocessing
from sklearn.preprocessing import OneHotEncoder

# VIF for multi-collinearity detection
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Models and modeling tools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm

# Models and modeling tools
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Change inline plots default size
plt.rcParams['figure.figsize'] = [14, 10]

In [11]:
# Read in the rawData
rawData = pd.read_csv('term-deposit-marketing-2020.csv')


# Separate data in independent (features) and dependent (target) datasets
features = rawData.copy()
target = features.pop('y')


# Replace binary columns 1/0
for col in ['default', 'housing', 'loan']:
    features[col].replace({'yes':1,'no':0}, inplace=True)

    
# Encode small categorical columns
cols = ['marital', 'education', 'contact']

enc = OneHotEncoder(sparse=False, dtype=int)
encoded = enc.fit_transform(features[cols])

# There has got to be a better way to do this...
encNames = enc.get_feature_names()
for index, item in enumerate(encNames):
    for i in range(len(cols)):
        encNames[index] = encNames[index].replace('x' + str(i), cols[i])

encDF = pd.DataFrame(encoded)
encDF.columns = encNames

features = features.drop(columns=cols)
features = pd.concat([features, encDF], axis=1)


# Encode large categorical columns
cols = ['month', 'job']

enc = OneHotEncoder(sparse=False, dtype=int)
encoded = enc.fit_transform(features[cols])

# There has got to be a better way to do this...
encNames = enc.get_feature_names()
for index, item in enumerate(encNames):
    for i in range(len(cols)):
        encNames[index] = encNames[index].replace('x' + str(i), cols[i])

encDF = pd.DataFrame(encoded)
encDF.columns = encNames

features = features.drop(columns=cols)
features = pd.concat([features, encDF], axis=1)


# Remove duration (this is not a reasonable thing to have in the model as it should tell success/failure well?)
#features = features.drop(columns=['duration'])

In [14]:
oversample = SMOTE()
ovFeatures, ovTarget = oversample.fit_resample(features, target)

In [17]:
## Look at Mutual Information metric to assess weak relationships between features and target
from sklearn.feature_selection import mutual_info_classif

def make_mi_scores(X, y):
    mi_scores = mutual_info_classif(X, y, discrete_features=True)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

mi_scores = make_mi_scores(ovFeatures, np.ravel(ovTarget))
display (mi_scores)

duration               0.265438
balance                0.107446
contact_unknown        0.071876
marital_married        0.057423
housing                0.048736
month_may              0.045916
job_blue-collar        0.038855
education_secondary    0.036857
loan                   0.031643
education_primary      0.031251
day                    0.025764
job_technician         0.025348
month_aug              0.025161
month_jul              0.024150
month_jun              0.022122
job_management         0.019505
job_services           0.019485
month_nov              0.016413
campaign               0.016222
job_admin              0.016129
marital_divorced       0.015896
education_tertiary     0.011221
age                    0.011060
contact_telephone      0.010104
marital_single         0.009640
month_jan              0.008653
education_unknown      0.008025
job_entrepreneur       0.007848
job_housemaid          0.006855
job_self-employed      0.006651
job_unemployed         0.005458
month_fe

In [18]:
# Look at the skewness of independent variables
features.skew()

age                     0.436080
default                 6.816736
balance                 8.259236
housing                -0.411561
loan                    1.726785
day                     0.067930
duration                3.165307
campaign                4.730901
marital_divorced        2.366427
marital_married        -0.449560
marital_single          1.023505
education_primary       1.888317
education_secondary    -0.099426
education_tertiary      0.979166
education_unknown       4.813341
contact_cellular       -0.506959
contact_telephone       3.781087
contact_unknown         0.776088
month_apr               3.433729
month_aug               2.195556
month_dec              55.445054
month_feb               3.805730
month_jan               5.571914
month_jul               1.860005
month_jun               2.363085
month_mar              12.331125
month_may               0.683557
month_nov               2.866486
month_oct              22.294378
job_admin               2.459527
job_blue-c

In [19]:
# Look at the correlations between independent variables
features.corr()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,marital_divorced,marital_married,...,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
age,1.0,-0.014857,0.081517,-0.179606,4.1e-05,-0.011689,-0.03627,0.016204,0.161989,0.26563,...,0.027956,0.08492,-0.005302,0.333164,0.001712,-0.057227,-0.155146,-0.058308,0.008052,0.043904
default,-0.014857,1.0,-0.070069,-0.019229,0.075006,0.006287,-0.008597,0.012135,0.016748,-0.016458,...,0.025645,1.7e-05,-0.000951,-0.003878,0.004233,-0.003038,-0.013429,-0.004516,0.007232,-0.006397
balance,0.081517,-0.070069,1.0,-0.049946,-0.080596,0.011662,0.014072,-0.008137,-0.02591,0.023683,...,0.011692,0.003044,0.072172,0.015917,0.015771,-0.033618,0.000575,-0.012655,0.01002,0.010672
housing,-0.179606,-0.019229,-0.049946,1.0,0.004266,-0.050622,0.022455,-0.054823,-0.004691,0.00068,...,0.003303,-0.086038,-0.067999,-0.112274,-0.032204,0.062391,-0.034034,-0.03218,-0.046197,-0.077605
loan,4.1e-05,0.075006,-0.080596,0.004266,1.0,0.005838,-0.003952,0.000582,0.015635,0.032651,...,0.038244,-0.016788,-0.037817,0.017406,-0.008934,0.031939,-0.047512,0.008396,-0.035591,-0.031734
day,-0.011689,0.006287,0.011662,-0.050622,0.005838,1.0,-0.032983,0.165429,-0.004055,0.005243,...,-0.003629,0.005355,0.026526,-0.002787,0.00508,-0.009461,-0.011183,0.031902,-0.005475,-0.009526
duration,-0.03627,-0.008597,0.014072,0.022455,-0.003952,-0.032983,1.0,-0.088387,0.007237,-0.029153,...,0.000247,-0.007239,-0.010846,0.001675,0.008821,0.003569,-0.007175,-0.008469,0.017896,-0.007971
campaign,0.016204,0.012135,-0.008137,-0.054823,0.000582,0.165429,-0.088387,1.0,-0.019951,0.027962,...,-0.002619,0.002804,0.021105,-0.013881,0.004581,-0.010404,-0.008503,0.018915,-0.017052,0.017037
marital_divorced,0.161989,0.016748,-0.02591,-0.004691,0.015635,-0.004055,0.007237,-0.019951,1.0,-0.457384,...,-0.000406,0.015053,0.01026,0.042143,-0.018049,0.018813,-0.039441,0.013693,0.008317,-0.012932
marital_married,0.26563,-0.016458,0.023683,0.00068,0.032651,0.005243,-0.029153,0.027962,-0.457384,1.0,...,0.044941,0.046431,-0.03654,0.056708,0.015806,-0.015312,-0.125955,-0.073221,-0.011904,0.014573


In [20]:
# Generate VIF factors for each feature
vif_scores = pd.DataFrame() 
vif_scores["Attribute"] = features.columns 
  
# calculating VIF for each feature 
vif_scores["VIF Scores"] = [variance_inflation_factor(features.values, i) for i in range(len(features.columns))] 
  
display(vif_scores)

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Attribute,VIF Scores
0,age,1.494822
1,default,1.014831
2,balance,1.054258
3,housing,1.419921
4,loan,1.059531
5,day,1.445083
6,duration,1.017713
7,campaign,1.131206
8,marital_divorced,inf
9,marital_married,inf


In [70]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

target.replace({'yes':1,'no':0}, inplace=True)
summaryTab = []

for col in features:
    logReg = sm.Logit(target, features[col])
    result = logReg.fit(disp=False)
    summaryTab.append([result.pvalues.index[0], result.pvalues.values[0], result.llf])
    
tab = pd.DataFrame(summaryTab, columns=['name','pvalue','llf'])
tab.sort_values(by='llf', ascending=False)

Unnamed: 0,name,pvalue,llf
0,age,0.0,-10741.096914
5,day,0.0,-12907.388431
7,campaign,0.0,-13314.523946
9,marital_married,0.0,-16398.383057
3,housing,0.0,-16583.86288
15,contact_cellular,0.0,-17972.800393
12,education_secondary,0.0,-18365.0691
17,contact_unknown,0.0,-20978.258682
26,month_may,0.0,-21359.692919
2,balance,0.0,-22779.045265


In [49]:
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,40000.0
Model:,Logit,Df Residuals:,39999.0
Method:,MLE,Df Model:,0.0
Date:,"Fri, 05 Aug 2022",Pseudo R-squ.:,-1.658
Time:,14:45:34,Log-Likelihood:,-27624.0
converged:,True,LL-Null:,-10392.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
job_unknown,-2.5513,0.252,-10.132,0.000,-3.045,-2.058


In [71]:
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,40000.0
Model:,Logit,Df Residuals:,39999.0
Method:,MLE,Df Model:,0.0
Date:,"Fri, 05 Aug 2022",Pseudo R-squ.:,-1.658
Time:,15:01:00,Log-Likelihood:,-27624.0
converged:,True,LL-Null:,-10392.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
job_unknown,-2.5513,0.252,-10.132,0.000,-3.045,-2.058


AttributeError: 'function' object has no attribute 'xname'