In [187]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

### **Dataset**

In [188]:
df = pd.read_csv('bank/bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### **Features**

In [189]:
features = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [190]:
df_selected_features = df[features]
df_selected_features

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


### **Data Preparation**

In [191]:
#sum of missing values per column
df_selected_features.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [192]:
#most frequent observation in the education column
df_selected_features.education.mode()

0    secondary
Name: education, dtype: object

In [193]:
df_selected_features['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [194]:
df_selected_features.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

### **Correlation**

In [195]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [196]:
#numerical columns
df_selected_features[numerical]

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [197]:
#correlation matrix
corr_matrix = df_selected_features[numerical].corr()
corr_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


###### From the above correlation matrix, it seems like the biggest correlation is between pdays & previous

### **Target Encoding**

In [198]:
df_selected_features.y.values

array(['no', 'no', 'no', ..., 'yes', 'no', 'no'],
      shape=(45211,), dtype=object)

In [199]:
#changing the yes and no values in the y column to 1s 0s
df_selected_features['y'] = df_selected_features['y'].replace({'yes': 1, 'no': 0})
df_selected_features['y']

  df_selected_features['y'] = df_selected_features['y'].replace({'yes': 1, 'no': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_features['y'] = df_selected_features['y'].replace({'yes': 1, 'no': 0})


0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

### **Split The Data**

In [200]:
from sklearn.model_selection import train_test_split

In [201]:
#splits the dataset into the training df and testing df
df_full_train, df_test = train_test_split(df_selected_features, test_size=0.2, random_state=42)

In [202]:
#splits the training dataset into the training df and validation df
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [203]:
#length of dataframes
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [204]:
df_train

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
20326,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown,0
24301,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown,0
38618,49,blue-collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown,0
18909,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown,0
23081,31,self-employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13264,27,services,single,secondary,167,no,cellular,8,jul,606,2,-1,0,unknown,0
28829,40,technician,single,tertiary,693,no,cellular,30,jan,427,1,-1,0,unknown,0
3844,54,technician,divorced,secondary,0,yes,unknown,16,may,161,1,-1,0,unknown,0
15597,25,services,single,secondary,2311,no,cellular,21,jul,1105,2,-1,0,unknown,1


In [205]:
df_val

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
11019,38,services,divorced,secondary,-10,yes,unknown,17,jun,61,2,-1,0,unknown,0
3403,42,management,single,tertiary,1146,yes,unknown,15,may,98,2,-1,0,unknown,0
12260,43,management,married,tertiary,149,yes,unknown,23,jun,662,2,-1,0,unknown,1
45110,50,management,married,tertiary,8205,yes,telephone,25,oct,293,3,508,1,other,0
39549,43,management,married,tertiary,79,no,cellular,26,may,640,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17743,47,technician,married,secondary,356,no,cellular,29,jul,44,5,-1,0,unknown,0
12796,32,admin.,married,secondary,1604,no,cellular,7,jul,625,1,-1,0,unknown,1
27107,45,admin.,married,secondary,857,yes,cellular,21,nov,169,1,92,20,other,0
2823,40,admin.,married,secondary,153,yes,unknown,14,may,159,2,-1,0,unknown,0


In [206]:
df_test

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
3776,40,blue-collar,married,secondary,580,yes,unknown,16,may,192,1,-1,0,unknown,0
9928,47,services,single,secondary,3644,no,unknown,9,jun,83,2,-1,0,unknown,0
33409,25,student,single,tertiary,538,yes,cellular,20,apr,226,1,-1,0,unknown,0
31885,42,management,married,tertiary,1773,no,cellular,9,apr,311,1,336,1,failure,0
15738,56,management,married,tertiary,217,no,cellular,21,jul,121,2,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13353,47,management,married,tertiary,1890,no,cellular,8,jul,161,1,-1,0,unknown,0
38732,32,blue-collar,single,secondary,217,yes,cellular,15,may,692,3,-1,0,unknown,1
5654,52,admin.,divorced,secondary,0,yes,unknown,26,may,206,1,-1,0,unknown,0
3779,40,admin.,divorced,secondary,783,yes,unknown,16,may,171,2,-1,0,unknown,0


In [207]:
#If you want your shuffled df to have sequential indices (ps: it doesn't affect the models)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [208]:
df_train

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown,0
1,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown,0
2,49,blue-collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown,0
3,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown,0
4,31,self-employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27121,27,services,single,secondary,167,no,cellular,8,jul,606,2,-1,0,unknown,0
27122,40,technician,single,tertiary,693,no,cellular,30,jan,427,1,-1,0,unknown,0
27123,54,technician,divorced,secondary,0,yes,unknown,16,may,161,1,-1,0,unknown,0
27124,25,services,single,secondary,2311,no,cellular,21,jul,1105,2,-1,0,unknown,1


In [209]:
df_val

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,38,services,divorced,secondary,-10,yes,unknown,17,jun,61,2,-1,0,unknown,0
1,42,management,single,tertiary,1146,yes,unknown,15,may,98,2,-1,0,unknown,0
2,43,management,married,tertiary,149,yes,unknown,23,jun,662,2,-1,0,unknown,1
3,50,management,married,tertiary,8205,yes,telephone,25,oct,293,3,508,1,other,0
4,43,management,married,tertiary,79,no,cellular,26,may,640,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9037,47,technician,married,secondary,356,no,cellular,29,jul,44,5,-1,0,unknown,0
9038,32,admin.,married,secondary,1604,no,cellular,7,jul,625,1,-1,0,unknown,1
9039,45,admin.,married,secondary,857,yes,cellular,21,nov,169,1,92,20,other,0
9040,40,admin.,married,secondary,153,yes,unknown,14,may,159,2,-1,0,unknown,0


In [210]:
df_test

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,40,blue-collar,married,secondary,580,yes,unknown,16,may,192,1,-1,0,unknown,0
1,47,services,single,secondary,3644,no,unknown,9,jun,83,2,-1,0,unknown,0
2,25,student,single,tertiary,538,yes,cellular,20,apr,226,1,-1,0,unknown,0
3,42,management,married,tertiary,1773,no,cellular,9,apr,311,1,336,1,failure,0
4,56,management,married,tertiary,217,no,cellular,21,jul,121,2,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9038,47,management,married,tertiary,1890,no,cellular,8,jul,161,1,-1,0,unknown,0
9039,32,blue-collar,single,secondary,217,yes,cellular,15,may,692,3,-1,0,unknown,1
9040,52,admin.,divorced,secondary,0,yes,unknown,26,may,206,1,-1,0,unknown,0
9041,40,admin.,divorced,secondary,783,yes,unknown,16,may,171,2,-1,0,unknown,0


In [211]:
#target variables
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [212]:
#delete the y column so you don't train with it
del df_train['y']
del df_val['y']
del df_test['y']

### **Mutual Information Score**

In [213]:
from sklearn.metrics import mutual_info_score

In [214]:
#wrap the mutual_info_score function so it can be used with all other categorical variables in the apply
#function
def mutual_info_churn_score(series):
    return mutual_info_score(series, y_train)

In [215]:
#categorical variables
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [216]:
#apply function - score is arranged in descending order
mutual_info = df_train[categorical].apply(mutual_info_churn_score)
round(mutual_info.sort_values(ascending=False), 2)

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

###### It seems the variable, poutcome (outcome of the previous marketing campaign), has the highest mutual information score, meaning we can learn a lot about a customer's decision to subscribe to a term deposit or not from it

### **One-hot Encoding**

In [217]:
from sklearn.feature_extraction import DictVectorizer

In [218]:
dv = DictVectorizer(sparse=False)

In [219]:
#turn training dataset to dictionary
train_dicts = df_train.to_dict(orient='records')

In [220]:
dv.fit(train_dicts)

In [221]:
X_train = dv.transform(train_dicts)

In [222]:
#turn val dataset to dictionary
val_dicts = df_val.to_dict(orient='records')

In [223]:
X_val = dv.transform(val_dicts)

###### Do not fit the validation dataset

In [224]:
#to get the names of the columns
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

### **Training Logistic Regression**

In [225]:
from sklearn.linear_model import LogisticRegression

In [226]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [227]:
model.fit(X_train, y_train)

In [228]:
#coefficients of the model
model.coef_

array([[-1.86055010e-03,  1.11908456e-05, -8.80159934e-02,
         2.54937547e-01,  7.09884337e-02, -1.23109262e+00,
         6.45914549e-03,  4.11392149e-03, -4.19356359e-01,
        -2.44806988e-01, -5.44543331e-02, -1.86548961e-01,
        -8.62495210e-02, -8.18917120e-01,  7.28809179e-02,
        -2.46324187e-01, -2.40413200e-01, -2.81031666e-01,
        -8.06777342e-02,  3.32635266e-01, -2.71977266e-01,
        -1.54219271e-01,  2.45104220e-01, -1.62825337e-01,
         1.65375604e-02, -1.34855943e-01, -2.97314991e-01,
        -4.52990640e-01, -1.54861010e-01,  7.32609466e-02,
        -6.69306538e-01,  3.20313874e-01, -3.12993776e-01,
        -9.49618470e-01, -9.49881798e-01,  2.62805685e-01,
         1.24916454e+00, -4.79553242e-01, -9.00533969e-01,
         7.37167041e-01,  7.14009068e-01, -8.27759887e-04,
        -7.62264121e-01, -5.41203256e-01,  1.49196689e+00,
        -1.09366615e+00,  6.08358647e-03]])

In [229]:
#to get intercept or bias (w0) - what we assume about the customer without knowing a thing about them
model.intercept_[0]

np.float64(-0.9051666407258808)

In [230]:
#getting predictions with the validation dataset
y_pred = model.predict(X_val)
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(9042,))

In [231]:
#accuracy score on validation set
from sklearn.metrics import accuracy_score

accuracy = round(accuracy_score(y_val, y_pred),2)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9


###### The above score implies that 90% of the predictions match

### **Least Useful Feature - Using Feature Elimination Technique**

In [232]:
original_accuracy_score = accuracy

#create a dataframe to display the features, the accuracy score without them, and the difference between
#the original accuracy score and the updated accuracy score without the feature
accuracy_scores = pd.DataFrame(columns=['feature', 'accuracy_without_feature', 'difference'])

listed_features = categorical + numerical

for feature in listed_features:

    #make a copy of the list of features
    sub_features = listed_features.copy()

    #remove a feature every time it loops
    sub_features.remove(feature)

    #train logistic regression without the removed feature
    train_dicts_feature_removed = df_train[sub_features].to_dict(orient='records')
    dv.fit(train_dicts_feature_removed)
    X_train_feature_removed = dv.transform(train_dicts_feature_removed)

    val_dicts_feature_removed = df_val[sub_features].to_dict(orient='records')
    X_val_feature_removed = dv.transform(val_dicts_feature_removed)

    model.fit(X_train_feature_removed, y_train)

    y_pred = model.predict(X_val_feature_removed)
    updated_accuracy_score = accuracy_score(y_val, y_pred)

    accuracy_scores.loc[len(accuracy_scores)] = [feature, updated_accuracy_score, original_accuracy_score - updated_accuracy_score]
    

In [233]:
accuracy_scores

Unnamed: 0,feature,accuracy_without_feature,difference
0,job,0.900796,-0.000796
1,marital,0.900133,-0.000133
2,education,0.901017,-0.001017
3,housing,0.90146,-0.00146
4,contact,0.900575,-0.000575
5,month,0.899801,0.000199
6,poutcome,0.893276,0.006724
7,age,0.901239,-0.001239
8,balance,0.901349,-0.001349
9,day,0.901017,-0.001017


In [234]:
#feature with smallest difference
min_diff_abs = accuracy_scores.difference.abs().min()
accuracy_scores[accuracy_scores.difference.abs() == min_diff_abs]

Unnamed: 0,feature,accuracy_without_feature,difference
1,marital,0.900133,-0.000133


###### It looks like the variable with the smallest difference is marital (the sign does not matter, it's more of the magnitude of the value). This implies that it is the least useful feature for the model as it's presence or absence does not change much

### **Training A Regularized Logistic Regression**

##### With C = 0.01

In [235]:
model = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

accuracy = round(accuracy_score(y_val, y_pred),3)
print(f'Accuracy: {accuracy}')

Accuracy: 0.898


##### With C = 0.1

In [236]:
model = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

accuracy = round(accuracy_score(y_val, y_pred),3)
print(f'Accuracy: {accuracy}')

Accuracy: 0.901


##### With C = 1

In [237]:
model = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

accuracy = round(accuracy_score(y_val, y_pred),3)
print(f'Accuracy: {accuracy}')

Accuracy: 0.902


##### With C = 10

In [238]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

accuracy = round(accuracy_score(y_val, y_pred),3)
print(f'Accuracy: {accuracy}')

Accuracy: 0.901


##### With C = 100

In [239]:
model = LogisticRegression(solver='liblinear', C=100, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

accuracy = round(accuracy_score(y_val, y_pred),3)
print(f'Accuracy: {accuracy}')

Accuracy: 0.901


###### It seems the best accuracy score on the validation set is with value of C = 1