In [92]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## may need to install smote: "!pip install smote-variants"
import smote_variants as sv

In [93]:
# Path of the bank marketing data set, resolved relative to the notebook's working directory.
filename = "bank-full.csv"

In [94]:
# The file is semicolon-delimited; load it and preview the first rows.
bankData = pd.read_csv(filename, delimiter=";")
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [95]:
# Summary statistics of the numeric columns, one row per column.
bankData.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,45211.0,40.93621,10.618762,18.0,33.0,39.0,48.0,95.0
balance,45211.0,1362.272058,3044.765829,-8019.0,72.0,448.0,1428.0,102127.0
day,45211.0,15.806419,8.322476,1.0,8.0,16.0,21.0,31.0
duration,45211.0,258.16308,257.527812,0.0,103.0,180.0,319.0,4918.0
campaign,45211.0,2.763841,3.098021,1.0,1.0,2.0,3.0,63.0
pdays,45211.0,40.197828,100.128746,-1.0,-1.0,-1.0,-1.0,871.0
previous,45211.0,0.580323,2.303441,0.0,0.0,0.0,0.0,275.0


### 1. Preprocessing

Would typically do some EDA before this step.

In [96]:
## Scale the numeric columns: Age, Balance, Duration

# Create the scaler in this cell so it runs on a fresh kernel; the original
# used `rob_scaler` without any visible cell defining it.
rob_scaler = RobustScaler()

# NOTE(review): the scaler is fit on the full data set, before the train/test
# split done later in the notebook, so test rows influence the scaling
# statistics. Consider fitting on the training split only.
# Each fit_transform call refits the scaler on that single column.
for raw_col, scaled_col in [('age', 'ageScaled'),
                            ('balance', 'balScaled'),
                            ('duration', 'durScaled')]:
    bankData[scaled_col] = rob_scaler.fit_transform(bankData[raw_col].values.reshape(-1,1))

In [97]:
# The raw columns are superseded by their scaled counterparts; drop them and preview.
bankData = bankData.drop(columns=['age', 'balance', 'duration'])
bankData.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,ageScaled,balScaled,durScaled
0,management,married,tertiary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,1.266667,1.25,0.375
1,technician,single,secondary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.333333,-0.308997,-0.134259
2,entrepreneur,married,secondary,no,yes,yes,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.328909,-0.481481
3,blue-collar,married,unknown,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.533333,0.780236,-0.407407
4,unknown,single,unknown,no,no,no,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.329646,0.083333


In [98]:
## Create Dummy Variables for the categorical variables

# dtype is pinned to uint8 so the indicator columns are numeric 0/1 on every
# pandas version (pandas 2.0 changed the get_dummies default dtype to bool).
# The list sits inside brackets, so no backslash continuation is needed.
dataCat = pd.get_dummies(
    bankData[['job', 'marital', 'education', 'default', 'housing',
              'loan', 'contact', 'month', 'poutcome']],
    dtype=np.uint8,
)


In [99]:
# Preview the first five rows of the dummy-encoded frame.
dataCat.head(n=5)

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [100]:
# Numeric feature columns: the three scaled ones plus the remaining numeric columns as-is.
bankNum = bankData.loc[:, ['ageScaled', 'balScaled', 'day', 'durScaled',
                           'campaign', 'pdays', 'previous']]
bankNum.shape

(45211, 7)

In [101]:
# (rows, columns) of the dummy-encoded frame.
(len(dataCat.index), len(dataCat.columns))

(45211, 44)

In [102]:
# Feature matrix: numeric columns side by side with the dummy columns.
X = pd.concat([bankNum, dataCat], axis='columns')
# Target: the 'y' column of yes/no labels.
y = bankData.loc[:, 'y']
display(X.shape)
print(y.shape)

(45211, 51)

(45211,)


### 2. Logistic Regression Model

In [104]:
# Train / Test Split
# 30% of the rows are held out for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [105]:
# Fit the model
# Baseline classifier built with the estimator's default settings.
# NOTE(review): the recorded run below reports that the solver hit its
# iteration limit; consider raising max_iter or scaling the remaining columns.
bankModel = LogisticRegression()
bankModel.fit(X=X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [106]:
# Predict on Test Set
# Keep the predicted labels for the confusion matrix, then show the test score.
pred = bankModel.predict(X=X_test)
display(bankModel.score(X=X_test, y=y_test))

0.898997345915659

In [107]:
# Score on the training split, for comparison with the test score.
display(bankModel.score(X=X_train, y=y_train))

0.9015704490157045

In [108]:
# Confusion matrix followed by the per-class precision / recall report.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(cf_matrix, classification_report(y_true=y_test, y_pred=pred), sep='\n')

[[11702   296]
 [ 1074   492]]
              precision    recall  f1-score   support

          no       0.92      0.98      0.94     11998
         yes       0.62      0.31      0.42      1566

    accuracy                           0.90     13564
   macro avg       0.77      0.64      0.68     13564
weighted avg       0.88      0.90      0.88     13564



The classifier is clearly biased towards the majority class: for the "yes" class it reaches only 62% precision and 31% recall.

In [109]:
# Percentage of training rows carrying each label.
for label in ('yes', 'no'):
    label_counts = y_train[y_train == label].value_counts()
    print((label_counts / len(y_train)) * 100)

yes    11.764148
Name: y, dtype: float64
no    88.235852
Name: y, dtype: float64


Imbalanced training data: about 88% of the training labels are "no" and only about 12% are "yes". Accuracy is a deceptive metric here, because this imbalance will always bias the model's results towards the majority class.

#### Solutions to this problem of biased training data: 
1. Collect more data
2. Resample the data
3. Generate synthetic samples

### 3. Resampling Methods

##### Method 1: Undersample Majority Class

In [110]:
# Re-attach the labels to the training features so rows can be filtered by class.
trainData = pd.concat([X_train, y_train], axis='columns')
trainData.head()

Unnamed: 0,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
19100,0.8,-0.162979,5,0.236111,1,-1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,no
37958,0.733333,-0.238938,14,0.865741,2,289,19,1,0,0,...,0,1,0,0,0,1,0,0,0,no
12451,0.0,0.385693,1,1.347222,3,-1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,no
18263,1.333333,-0.330383,31,-0.592593,8,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no
5128,-0.466667,-0.14233,21,-0.435185,2,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,no


In [111]:
## Separate Minority Class from Majority Class

### Minority Class ###
## Two selections of the same rows: a boolean mask, and .loc with the matching index labels
minClass = trainData.loc[trainData['y'] == 'yes']
ind_min = minClass.index
minClass2 = trainData.loc[ind_min]

In [112]:
# Show the first minority rows, then how many there are.
display(minClass.head(), len(minClass))

Unnamed: 0,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
10403,0.066667,-0.030973,12,1.837963,2,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,yes
31105,0.533333,-0.083333,17,0.212963,1,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,yes
28816,-0.4,-0.109145,30,3.435185,5,-1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,yes
43073,-0.066667,0.005162,19,0.347222,2,555,6,0,0,0,...,0,0,0,0,0,0,0,1,0,yes
42221,0.933333,0.867257,11,-0.064815,3,97,7,0,0,0,...,0,0,1,0,0,0,0,1,0,yes


3723

In [113]:
### Majority Class ###
## Two selections of the same rows: a boolean mask, and .loc with the matching index labels
majClass = trainData[trainData['y']=='no']
ind_maj = trainData[trainData['y']=='no'].index
# Index with ind_maj; the original used `ind`, a name no visible cell defines,
# so the cell only worked through leftover kernel state.
majClass2 = trainData.loc[ind_maj]

In [114]:
# Show the first majority rows, then how many there are.
display(majClass.head(), len(majClass))

Unnamed: 0,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
19100,0.8,-0.162979,5,0.236111,1,-1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,no
37958,0.733333,-0.238938,14,0.865741,2,289,19,1,0,0,...,0,1,0,0,0,1,0,0,0,no
12451,0.0,0.385693,1,1.347222,3,-1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,no
18263,1.333333,-0.330383,31,-0.592593,8,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no
5128,-0.466667,-0.14233,21,-0.435185,2,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,no


27924

In [115]:
## Sample the Majority Class to have the same length as the minority class
# Draw as many majority rows as there are minority index labels; seeded for repeatability.
majSample = majClass2.sample(
    n=ind_min.size,
    random_state=123,
)

In [116]:
# (rows, columns) of the down-sampled majority frame.
(len(majSample.index), len(majSample.columns))

(3723, 52)

In [117]:
# Preview the first five sampled majority rows.
majSample.iloc[:5]

Unnamed: 0,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
17387,0.666667,0.752212,28,-0.425926,3,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no
34679,0.8,0.086283,5,-0.106481,7,250,3,0,1,0,...,0,1,0,0,0,1,0,0,0,no
26572,0.466667,1.785398,20,-0.134259,2,-1,0,1,0,0,...,0,0,1,0,0,0,0,0,1,no
3280,1.2,1.972714,15,-0.009259,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,no
4434,-0.133333,2.011062,20,-0.055556,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,no


In [118]:
# Stack the minority rows on top of the sampled majority rows to form the balanced set.
balData = pd.concat([minClass2, majSample], axis='index')
balData.head()

Unnamed: 0,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
10403,0.066667,-0.030973,12,1.837963,2,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,yes
31105,0.533333,-0.083333,17,0.212963,1,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,yes
28816,-0.4,-0.109145,30,3.435185,5,-1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,yes
43073,-0.066667,0.005162,19,0.347222,2,555,6,0,0,0,...,0,0,0,0,0,0,0,1,0,yes
42221,0.933333,0.867257,11,-0.064815,3,97,7,0,0,0,...,0,0,1,0,0,0,0,1,0,yes


In [119]:
## Need to shuffle before fitting a new model to the dataset.
# The frame was built by stacking the two classes, so its rows are ordered by class.
# Seeded so the shuffled order (and the preview below) is the same on every run.
balData = shuffle(balData, random_state=123)
balData.head()

Unnamed: 0,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
2799,0.6,-0.312684,14,3.018519,1,-1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,yes
22481,0.466667,1.085546,22,1.962963,6,-1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,yes
35178,-1.0,-0.512537,7,0.106481,1,-1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,no
18195,-0.2,-0.272861,30,11.62963,9,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,yes
12615,-0.533333,-0.306047,4,-0.069444,2,-1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,no


In [120]:
## Fit the new model.

# Select the features by dropping the label column rather than slicing a
# hard-coded 51 columns, so the split stays correct if the feature set changes.
X_trainBal = balData.drop(columns='y')
y_trainBal = balData['y']

# NOTE(review): the recorded run below reports that the solver hit its
# iteration limit with the default settings.
bankModel1 = LogisticRegression()
bankModel1.fit(X_trainBal, y_trainBal)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [121]:
# Evaluate the undersampled model on the untouched test split: score first, then the report.
preds = bankModel1.predict(X=X_test)
print(
    bankModel1.score(X=X_test, y=y_test),
    classification_report(y_true=y_test, y_pred=preds),
    sep='\n',
)

0.8299174284871719
              precision    recall  f1-score   support

          no       0.97      0.83      0.90     11998
         yes       0.39      0.82      0.53      1566

    accuracy                           0.83     13564
   macro avg       0.68      0.83      0.71     13564
weighted avg       0.91      0.83      0.85     13564



We have much better recall, which means the model now identifies far more of the actual "yes" cases (0.82 vs. 0.31). But it comes at the expense of more false positives, i.e. lower precision (0.39 vs. 0.62).

From the business POV, we will likely have a higher rate of conversions, given that we will be marketing to the group that is likely to convert, but that comes at the cost of flagging more customers who will not convert as good leads.

##### Method 2: Create "more" of the minority class (i.e. creating synthetic examples)

Synthetic Minority Over-Sampling Technique (SMOTE)

In [122]:
# Number of training rows per label before oversampling.
for label in ('yes', 'no'):
    print((y_train == label).sum())

3723
27924


In [125]:
# Seed the sampler so the synthetic minority samples are reproducible;
# the recorded log shows the default is random_state=None.
oversampler = sv.SMOTE(random_state=123)

In [126]:
# The sampler is given plain NumPy copies of the training features and labels.
X_train_os, y_train_os = oversampler.sample(
    X_train.to_numpy(copy=True),
    y_train.to_numpy(copy=True),
)

2021-11-24 15:18:07,703:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


In [127]:
# (rows, columns) of the oversampled feature array.
np.shape(X_train_os)

(55848, 51)

In [130]:
# Number of rows per label after oversampling.
for label in ('yes', 'no'):
    print(np.count_nonzero(y_train_os == label))

27924
27924


Equal parts Yes / No based on SMOTE method. 

In [131]:
# Same default classifier, now trained on the SMOTE-balanced arrays.
# NOTE(review): the recorded run below reports that the solver hit its iteration limit.
bankModel2 = LogisticRegression()
bankModel2.fit(X=X_train_os, y=y_train_os)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [136]:
# bankModel2 was fit on a plain NumPy array, so evaluate on an array as well.
# Passing the DataFrame makes scikit-learn >= 1.0 warn about feature names
# that the model was fitted without. The predictions are unchanged.
X_test_arr = np.array(X_test)
pred = bankModel2.predict(X_test_arr)
print('Accuracy on Testing Dataset: {:.2f}'.format(bankModel2.score(X_test_arr, y_test)))

Accuracy on Testing Dataset: 0.84


In [134]:
# Per-class precision / recall for the SMOTE-trained model.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

          no       0.97      0.84      0.90     11998
         yes       0.40      0.81      0.54      1566

    accuracy                           0.84     13564
   macro avg       0.69      0.82      0.72     13564
weighted avg       0.91      0.84      0.86     13564



Results are similar to the undersampling method, BUT recall is slightly lower and precision is slightly higher.

In [137]:
# Confusion matrix for the SMOTE-trained model.
confusion_matrix(y_true=y_test, y_pred=pred)

array([[10129,  1869],
       [  305,  1261]])

##### Method 3: Create "more" of the minority class (i.e. creating synthetic examples)

Modified Synthetic Minority Over-Sampling Technique (SMOTE)

In [139]:
# Seed the sampler so the synthetic minority samples are reproducible;
# the recorded log shows the default is random_state=None.
oversampler = sv.MSMOTE(random_state=123)
X_train_os, y_train_os = oversampler.sample(np.array(X_train), np.array(y_train))

bankModel4 = LogisticRegression()
bankModel4.fit(X_train_os, y_train_os)

# The model was fit on a plain NumPy array, so evaluate on an array as well.
# Passing the DataFrame makes scikit-learn >= 1.0 warn about feature names
# that the model was fitted without.
X_test_arr = np.array(X_test)
preds = bankModel4.predict(X_test_arr)
print('Accuracy Score for MSMOTE: {:.2f}'.format(bankModel4.score(X_test_arr, y_test)))
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

2021-11-24 15:29:38,147:INFO:MSMOTE: Running sampling via ('MSMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Score for MSMOTE: 0.84
              precision    recall  f1-score   support

          no       0.97      0.84      0.90     11998
         yes       0.40      0.79      0.53      1566

    accuracy                           0.84     13564
   macro avg       0.68      0.82      0.72     13564
weighted avg       0.90      0.84      0.86     13564

[[10127  1871]
 [  330  1236]]


Very similar results between SMOTE and MSMOTE. 

Overall, these techniques seem to help the model improve the recall score at the expense of precision and accuracy.