# Bank Dataset

In [18]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib to seaborn's default theme (affects every figure drawn afterwards).
sns.set()

## Loading the dataset

In [11]:
# Read the bank CSV from the notebook's working directory and preview the first ten rows.
data = pd.read_csv("15451939-Bank-data.csv")
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
5,5,0.899,0.0,0.0,1.0,0.0,126.0,no
6,6,4.962,0.0,0.0,0.0,0.0,84.0,no
7,7,4.858,0.0,1.0,0.0,0.0,17.0,no
8,8,4.962,0.0,0.0,0.0,0.0,704.0,yes
9,9,4.865,0.0,0.0,0.0,0.0,185.0,no


In [12]:
# "Unnamed: 0" is the row index that was written into the CSV as a regular column;
# it carries no information, so drop it.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4.856,0.0,1.0,0.0,0.0,157.0,no


In [13]:
# Encode the target as 0/1.
# The identity entries (1 -> 1, 0 -> 0) make the cell safe to re-run: without them a
# second execution would map the already-encoded integers to NaN and silently
# destroy the target column.
data["y"] = data["y"].map({"yes": 1, "no": 0, 1: 1, 0: 0})
# Any label outside the mapping becomes NaN; fail loudly instead of training on it.
assert data["y"].isin([0, 1]).all(), "unexpected label in target column 'y'"

In [14]:
# Summary statistics for every column (include="all" keeps non-numeric columns too).
data.describe(include="all")

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,2.835776,0.034749,0.266409,0.388031,0.127413,382.177606,0.5
std,1.876903,0.183321,0.442508,0.814527,0.333758,344.29599,0.500483
min,0.635,0.0,0.0,0.0,0.0,9.0,0.0
25%,1.04275,0.0,0.0,0.0,0.0,155.0,0.0
50%,1.466,0.0,0.0,0.0,0.0,266.5,0.5
75%,4.9565,0.0,1.0,0.0,0.0,482.75,1.0
max,4.97,1.0,1.0,5.0,1.0,2653.0,1.0


In [15]:
# Count missing values per column.
data.isna().sum()

interest_rate    0
credit           0
march            0
may              0
previous         0
duration         0
y                0
dtype: int64

## Regression
### Split the data into features and target

In [16]:
# The target vector is column "y"; every remaining column is a predictor.
y = data["y"]
x = data.drop(columns=["y"])
x

Unnamed: 0,interest_rate,credit,march,may,previous,duration
0,1.334,0.0,1.0,0.0,0.0,117.0
1,0.767,0.0,0.0,2.0,1.0,274.0
2,4.858,0.0,1.0,0.0,0.0,167.0
3,4.120,0.0,0.0,0.0,0.0,686.0
4,4.856,0.0,1.0,0.0,0.0,157.0
...,...,...,...,...,...,...
513,1.334,0.0,1.0,0.0,0.0,204.0
514,0.861,0.0,0.0,2.0,1.0,806.0
515,0.879,0.0,0.0,0.0,0.0,290.0
516,0.877,0.0,0.0,5.0,1.0,473.0


### Scale the data

In [17]:
# Standardise every feature column (StandardScaler: subtract the mean, divide by
# the standard deviation).
# NOTE(review): the scaler is fitted on all rows of x, so x_scaled carries
# statistics from every sample, including any rows later held out for testing.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)
x_scaled

array([[-0.80090846, -0.18973666,  1.65940447, -0.47684853, -0.38212262,
        -0.77094694],
       [-1.10329382, -0.18973666, -0.60262583,  1.98093794,  2.61696099,
        -0.31450316],
       [ 1.07846723, -0.18973666,  1.65940447, -0.47684853, -0.38212262,
        -0.62558268],
       ...,
       [-1.04356338, -0.18973666, -0.60262583, -0.47684853, -0.38212262,
        -0.26798659],
       [-1.04463   , -0.18973666, -0.60262583,  5.66761764,  2.61696099,
         0.26404661],
       [ 1.13553113, -0.18973666, -0.60262583, -0.47684853, -0.38212262,
        -0.69826481]])

### Train/test split

In [19]:
# Split the *raw* features first, then fit the scaler on the training rows only.
# Fitting the scaler on the full data set before splitting lets the test rows
# influence the mean/std used for preprocessing (data leakage) and makes the
# reported test metrics optimistic.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Rebind `scaler` to the training-only fit so any later use of it is leak-free.
scaler = StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

In [25]:
# Fit a logistic-regression classifier with scikit-learn's default settings,
# then display the hyper-parameters that were actually used.
reg = LogisticRegression().fit(x_train, y_train)
reg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [22]:
# Predict class labels for the held-out test rows.
y_pred=reg.predict(x_test)
y_pred

array([1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=int64)

### Confusion Matrix

In [26]:
from sklearn.metrics import confusion_matrix

# scikit-learn's convention: rows are the true classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

[[54  5]
 [ 9 36]]


In [34]:
# Column totals of the confusion matrix, i.e. how many test rows were
# predicted as each class.
print(cm.sum(axis=0))

63


In [35]:
# Accuracy as a percentage (0-100): correct predictions sit on the diagonal of
# the confusion matrix, and the sum of all entries is the number of test rows.
# trace / sum works for any number of classes, not only the 2x2 case, and avoids
# relying on the builtin sum() returning per-column totals for a 2-D array.
accuracy = np.trace(cm) / cm.sum() * 100
accuracy

0.8653846153846154

This is so much better than statsmodels.