In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import statsmodels.api as sm
from scipy import stats
# Expose a `chisqprob` attribute on scipy.stats that forwards to the
# chi-squared survival function (chi2.sf).
# NOTE(review): presumably a shim for statsmodels versions that still look up
# stats.chisqprob — confirm it is still needed with the installed versions.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [4]:
# Load the bank marketing CSV; raw_data stays untouched and all later
# transformations are applied to the copy.
raw_data = pd.read_csv('../datasets/Bank-data.csv')
data = raw_data.copy()

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
...,...,...,...,...,...,...,...,...
513,513,1.334,0.0,1.0,0.0,0.0,204.0,no
514,514,0.861,0.0,0.0,2.0,1.0,806.0,yes
515,515,0.879,0.0,0.0,0.0,0.0,290.0,no
516,516,0.877,0.0,0.0,5.0,1.0,473.0,yes


__Context:__\
'interest_rate' indicates the 3-month interest rate\
'duration' indicates the time since the last contact was made with a given customer\
'previous' shows whether the last marketing campaign was successful with a given customer\
'march' and 'may' are Boolean variables that account for when the call was made to the specific customer\
'credit' shows whether the customer has enough credit to avoid defaulting

We want to know whether the bank marketing strategy was successful.

In [11]:
# Remove the leftover CSV index column. The frame is rebuilt from raw_data
# (not from data itself) so that re-running this cell does not raise KeyError
# once the column has already been dropped.
data = raw_data.drop(['Unnamed: 0'], axis=1)

In [13]:
# Encode the target as 1/0. The mapping reads from the untouched raw_data
# column: mapping data['y'] onto itself would silently turn every value into
# NaN if this cell were executed a second time.
data['y'] = raw_data['y'].map({'yes': 1, 'no': 0})

In [15]:
# Summary statistics for every column; include='all' also covers non-numeric
# columns, if any remain.
data.describe(include='all')

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,2.838681,0.034816,0.26499,0.388781,0.12766,382.690522,0.500967
std,1.877555,0.183492,0.441755,0.815137,0.334034,344.431293,0.500483
min,0.635,0.0,0.0,0.0,0.0,9.0,0.0
25%,1.041,0.0,0.0,0.0,0.0,155.0,0.0
50%,1.479,0.0,0.0,0.0,0.0,267.0,1.0
75%,4.957,0.0,1.0,0.0,0.0,483.0,1.0
max,4.97,1.0,1.0,5.0,1.0,2653.0,1.0


In [29]:
# Separate predictors from the target: every column except 'y' is a feature.
feature_names = data.columns.tolist()
feature_names.remove('y')
x = data[feature_names]
y = data['y']

In [33]:
# Prepend a constant column so the model estimates an intercept ('const').
x = sm.add_constant(x)
reg_logit = sm.Logit(y,x)
# Fit the model; the optimizer's termination message is printed as a side effect.
results_logit = reg_logit.fit()
print(results_logit.summary())

Optimization terminated successfully.
         Current function value: 0.336381
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  517
Model:                          Logit   Df Residuals:                      510
Method:                           MLE   Df Model:                            6
Date:                Tue, 17 Jan 2023   Pseudo R-squ.:                  0.5147
Time:                        22:16:12   Log-Likelihood:                -173.91
converged:                       True   LL-Null:                       -358.36
Covariance Type:            nonrobust   LLR p-value:                 1.352e-76
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.1335      0.340     -0.393      0.694      -0.799       0.532
interest_rate    -0.

In [34]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like
        Inputs passed unchanged to ``model.predict``.
    actual_values : array-like
        Observed classes, coded as 0 and 1.
    model : object
        Any object exposing ``predict(data)`` that returns one value in
        [0, 1] per observation.
    threshold : float, default 0.5
        Cut-off applied to the predictions: values >= ``threshold`` count as
        class 1, values below it as class 0. Must lie strictly between 0 and 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        Share of observations on the main diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_values = model.predict(data)

    # Actual classes are always split at 0.5 (0 -> first bin, 1 -> second bin);
    # only the predicted axis uses the caller's threshold. With the default
    # threshold both axes use the edges [0, 0.5, 1].
    actual_edges = np.array([0, 0.5, 1])
    predicted_edges = np.array([0, threshold, 1])

    # histogram2d counts (actual, predicted) pairs per cell; pairs falling
    # outside the [0, 1] edges are not counted at all.
    cm = np.histogram2d(actual_values, pred_values,
                        bins=[actual_edges, predicted_edges])[0]
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [35]:
# In-sample check: confusion matrix and accuracy on the data the model was fitted on.
confusion_matrix(x,y,results_logit)

(array([[219.,  39.],
        [ 31., 228.]]),
 0.8646034816247582)

### Testing

In [50]:
# NOTE(review): this is the same path the training data was loaded from, so
# the "test" evaluation below is in-sample and does not measure how the model
# generalizes — confirm whether a separate hold-out file was intended here.
raw_data2 = pd.read_csv('../datasets/Bank-data.csv')
data_test = raw_data2.copy()

In [51]:
# Remove the leftover CSV index column. The frame is rebuilt from raw_data2
# (not from data_test itself) so that re-running this cell does not raise
# KeyError once the column has already been dropped.
data_test = raw_data2.drop(['Unnamed: 0'], axis=1)

In [52]:
# Encode the target as 1/0. The mapping reads from the untouched raw_data2
# column: mapping data_test['y'] onto itself would silently turn every value
# into NaN if this cell were executed a second time.
data_test['y'] = raw_data2['y'].map({'yes': 1, 'no': 0})

In [53]:
# Inspect the prepared test frame.
data_test

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,0
1,0.767,0.0,0.0,2.0,1.0,274.0,1
2,4.858,0.0,1.0,0.0,0.0,167.0,0
3,4.120,0.0,0.0,0.0,0.0,686.0,1
4,4.856,0.0,1.0,0.0,0.0,157.0,0
...,...,...,...,...,...,...,...
513,1.334,0.0,1.0,0.0,0.0,204.0,0
514,0.861,0.0,0.0,2.0,1.0,806.0,1
515,0.879,0.0,0.0,0.0,0.0,290.0,0
516,0.877,0.0,0.0,5.0,1.0,473.0,1


In [54]:
# Predictors first (same columns, in the same order, as used for training),
# then the target.
x_test = data_test[feature_names]
y_test = data_test['y']

In [55]:
# Add the constant column so x_test has the same layout as the design matrix
# the model was fitted on.
x_test = sm.add_constant(x_test)

In [56]:
# Confusion matrix and accuracy of the fitted model on the test frame.
confusion_matrix(x_test,y_test,results_logit)

(array([[220.,  39.],
        [ 31., 228.]]),
 0.8648648648648649)