In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_auc_score, roc_curve, recall_score

In [2]:
data = pd.read_csv('datasets/logistic-regression/bank_selected_features.csv')
data.head()

Unnamed: 0,duration,poutcome_success,month_oct,contact_unknown,month_mar,month_jan,day,month_nov,month_jul,loan_yes,marital_married,poutcome_other,month_sep,job_retired,y_yes
0,79,0,1,0,0,0,19,0,0,0,1,0,0,0,0
1,220,0,0,0,0,0,11,0,0,1,1,0,0,0,0
2,185,0,0,0,0,0,16,0,0,0,0,0,0,0,0
3,199,0,0,1,0,0,3,0,0,1,1,0,0,0,0
4,226,0,0,1,0,0,5,0,0,0,1,0,0,0,0


In [3]:
data['y_yes'].value_counts() / len(data)

0    0.88476
1    0.11524
Name: y_yes, dtype: float64

# Up-Sampling

In [4]:
data['y_yes'].value_counts() / len(data)

0    0.88476
1    0.11524
Name: y_yes, dtype: float64

In [5]:
# split the data into two parts
# sample = 0 and sample = 1

data_no = data[data.y_yes == 0]
data_yes = data[data.y_yes == 1]


In [6]:
from sklearn.utils import resample

In [7]:
data_yes_resample = resample(data_yes, replace = True, n_samples = 2000)
data_yes_resample

Unnamed: 0,duration,poutcome_success,month_oct,contact_unknown,month_mar,month_jan,day,month_nov,month_jul,loan_yes,marital_married,poutcome_other,month_sep,job_retired,y_yes
4494,576,0,0,1,0,0,28,0,0,0,0,0,0,0,1
1212,207,0,0,0,0,0,28,0,0,0,0,0,0,0,1
1928,332,0,1,0,0,0,20,0,0,0,0,0,0,0,1
2077,325,0,0,0,1,0,17,0,0,0,0,0,0,0,1
99,736,0,0,0,0,0,4,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3321,630,0,0,0,0,0,31,0,1,0,0,0,0,0,1
2980,1007,0,0,1,0,0,19,0,0,1,0,0,0,1,1
1994,712,0,0,0,0,0,22,0,1,0,1,0,0,0,1
53,441,0,0,0,0,0,15,0,0,0,1,1,0,0,1


In [8]:
data_resample = pd.concat([data_no, data_yes_resample], axis = 0)
data_resample

Unnamed: 0,duration,poutcome_success,month_oct,contact_unknown,month_mar,month_jan,day,month_nov,month_jul,loan_yes,marital_married,poutcome_other,month_sep,job_retired,y_yes
0,79,0,1,0,0,0,19,0,0,0,1,0,0,0,0
1,220,0,0,0,0,0,11,0,0,1,1,0,0,0,0
2,185,0,0,0,0,0,16,0,0,0,0,0,0,0,0
3,199,0,0,1,0,0,3,0,0,1,1,0,0,0,0
4,226,0,0,1,0,0,5,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3321,630,0,0,0,0,0,31,0,1,0,0,0,0,0,1
2980,1007,0,0,1,0,0,19,0,0,1,0,0,0,1,1
1994,712,0,0,0,0,0,22,0,1,0,1,0,0,0,1
53,441,0,0,0,0,0,15,0,0,0,1,1,0,0,1


In [9]:
data_resample.y_yes.value_counts() / len(data_resample)

0    0.666667
1    0.333333
Name: y_yes, dtype: float64

In [10]:
# split the data into independent and dependent
X = data_resample.drop(['y_yes'], axis = 1) # independent features
y = data_resample.y_yes # response variable

In [11]:
# splitting the data into train and test
np.random.seed(1001)
from sklearn.model_selection import train_test_split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((4200, 14), (1800, 14), (4200,), (1800,))

## Build the Model

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [15]:
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Maximum Likelihood Estimation

In [16]:
y_train.head()

3050    0
3817    1
2860    0
1677    0
4367    0
Name: y_yes, dtype: int64

In [17]:
# training data
y_pred_train = model.predict_proba(x_train)
y_pred_train

array([[0.91710755, 0.08289245],
       [0.66210186, 0.33789814],
       [0.56118673, 0.43881327],
       ...,
       [0.95720423, 0.04279577],
       [0.13313156, 0.86686844],
       [0.97267313, 0.02732687]])

In [18]:
y_pred_train_1 = y_pred_train[:, 1]
y_pred_train_1

array([0.08289245, 0.33789814, 0.43881327, ..., 0.04279577, 0.86686844,
       0.02732687])

In [19]:
from sklearn.metrics import log_loss
def predict_threshold(y, a):
    if y >= a:
        return 1
    return 0


In [20]:
y_pred_train_prob_1 = y_pred_train_1
thresholds = np.linspace(0, 1, 100)
mle = []
for threshold in thresholds:
    loss = []
    for y in y_pred_train_1:
        y_pred_0_5 = predict_threshold(y, threshold)
        loss.append(y_pred_0_5)
        
    loss = pd.Series(loss)
    loss_act = log_loss(y_train, loss)
    mle.append(loss_act)
        

In [21]:
mle

[23.108621080589966,
 22.977041743476285,
 22.45072439502157,
 20.838877515378996,
 19.391504616748183,
 18.05926382847218,
 17.064195091549973,
 16.003336686070934,
 15.197411913587262,
 14.383263622914328,
 13.577339802332359,
 12.648060162825978,
 11.924371524136651,
 11.069104309855012,
 10.427652185720715,
 9.79442548357909,
 9.226988069233622,
 8.774682574862624,
 8.43750652552167,
 8.009869872295402,
 7.664471256666885,
 7.483543766345019,
 7.179265787748272,
 7.006565052081461,
 6.891430276401882,
 6.702283075497563,
 6.4473475386987875,
 6.3651083588189925,
 6.291091935607099,
 6.2088523749666225,
 6.134836713276092,
 6.175951067756625,
 6.151276134440999,
 6.134826623118044,
 6.175941358359259,
 6.11837425608998,
 6.1677125095204595,
 6.1019216986815765,
 6.0525779242212225,
 5.9621167491948865,
 5.896325747975664,
 5.8552058724652545,
 5.904544887417095,
 5.871648720476292,
 5.814081998967695,
 5.855198257251634,
 5.846972264117943,
 5.855193688123462,
 5.8058508655648104,
 

In [22]:
mle_nparr = np.array(mle)
min(mle_nparr)

5.8058508655648104

In [23]:
thresholds[mle_nparr.argmin()]

0.48484848484848486

In [24]:
mle_nparr[43]

5.871648720476292

In [25]:
max(mle_nparr)

23.108621080589966