In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw dataset; the file uses ';' as its field delimiter.
df = pd.read_csv('bank-full.csv', sep=';')

In [3]:
# Preview the first rows of the raw frame to sanity-check the parse.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Normalise column labels: lower-case, spaces replaced by underscores.
normalised_labels = df.columns.str.lower()
df.columns = normalised_labels.str.replace(' ', '_')

# Every object-dtype column is treated as a string/categorical column.
is_object_column = df.dtypes == 'object'
categorical_columns = df.dtypes[is_object_column].index.tolist()

# Apply the same normalisation to the string values themselves.
for column in categorical_columns:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

In [5]:
# Columns kept for modelling (the raw 'default' and 'loan' columns are left out).
sc = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'y',
]

# Work on an independent copy so later edits do not touch `df`.
df_new = df.loc[:, sc].copy()

In [6]:
# Preview the reduced frame to confirm the column selection.
df_new.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [7]:
# Count missing values per column.
df_new.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
# Most frequent value(s) of the education column.
df['education'].mode()

0    secondary
dtype: object

In [9]:
# Show each column's dtype to separate numeric from object (string) columns.
df_new.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [10]:
# Integer-valued feature columns.
numerical = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]

# String-valued feature columns (the target 'y' is deliberately not listed).
categorical = [
    'job',
    'marital',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome',
]

In [11]:
# Pairwise correlations between the numerical features only.
# The columns are selected explicitly because, from pandas 2.0 on,
# DataFrame.corr() no longer drops non-numeric (object) columns silently and
# raises on them instead.
correlation_matrix = df_new[numerical].corr()
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [12]:
# Keep each unordered feature pair exactly once by masking everything except
# the strict upper triangle (k=1 also excludes the diagonal of self-correlations).
# This avoids de-duplicating by *value*, which would merge two different pairs
# that happen to share a correlation and would discard a genuine off-diagonal 1.0.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

# Masked cells become NaN; dropna() removes them explicitly so the result does
# not depend on stack()'s version-specific NaN handling.
unstacked = correlation_matrix.where(upper_triangle).stack().dropna()

# Strongest positive correlation first, strongest negative last.
sorted_corr = unstacked.sort_values(ascending=False)

sorted_corr

pdays     previous    0.454820
day       campaign    0.162490
age       balance     0.097783
balance   duration    0.021560
          previous    0.016674
age       campaign    0.004760
balance   day         0.004503
          pdays       0.003435
age       previous    0.001288
duration  previous    0.001203
          pdays      -0.001565
age       duration   -0.004648
          day        -0.009120
balance   campaign   -0.014578
age       pdays      -0.023758
day       duration   -0.030206
campaign  previous   -0.032855
day       previous   -0.051710
duration  campaign   -0.084570
campaign  pdays      -0.088628
day       pdays      -0.093044
dtype: float64

In [13]:
# Encode the target as 1 for 'yes' and 0 otherwise.
# Matching both the raw label and an already-encoded 1 makes the cell
# idempotent: re-running it no longer collapses every label to 0.
df_new['y'] = df_new['y'].isin(['yes', 1]).astype(int)

In [14]:
# Hold out 20% of the rows as the test set.
df_full_train, df_test = train_test_split(
    df_new,
    test_size=0.2,
    random_state=42,
)

# Take a validation set with the same number of rows as the test set
# (an integer test_size is an absolute row count).
df_train, df_val = train_test_split(
    df_full_train,
    test_size=df_test.shape[0],
    random_state=42,
)

In [15]:
# Row counts of the train, test and validation parts, in that order.
tuple(len(part) for part in (df_train, df_test, df_val))

(27125, 9043, 9043)

In [16]:
# Replace the shuffled original row labels with a fresh 0..n-1 index on each part.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [17]:
# Pull the target out of each part as a plain NumPy array.
y_train, y_val, y_test = (
    part['y'].values for part in (df_train, df_val, df_test)
)

In [18]:
# Remove the target column in place so it cannot leak into the feature matrices.
df_train.drop(columns='y', inplace=True)
df_val.drop(columns='y', inplace=True)
df_test.drop(columns='y', inplace=True)

In [19]:
from sklearn.metrics import mutual_info_score

In [20]:
def mutual_info_y_score(series):
    """Return the mutual information score between ``series`` and the target.

    The target is read from the ``y`` column of the module-level
    ``df_full_train`` frame, so ``series`` is expected to have one value per
    row of that frame.
    """
    target = df_full_train['y']
    return mutual_info_score(series, target)

In [21]:
# Mutual information of every categorical feature with the target, to 2 decimals.
mi_raw = df_full_train[categorical].apply(mutual_info_y_score)
mi = mi_raw.round(2)

# Most informative feature first.
mi.sort_values(ascending=False)

poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

In [22]:
# Preview the combined train+validation frame; it still carries the encoded target column `y`.
df_full_train.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
3344,41,blue-collar,married,primary,849,yes,unknown,15,may,72,1,-1,0,unknown,0
17965,49,technician,married,primary,1415,yes,cellular,30,jul,269,2,-1,0,unknown,0
18299,42,admin.,married,secondary,3842,no,cellular,31,jul,130,4,-1,0,unknown,0
10221,37,management,single,tertiary,-119,yes,unknown,11,jun,375,11,-1,0,unknown,0
32192,56,blue-collar,married,primary,3498,no,cellular,15,apr,264,2,-1,0,unknown,1


In [23]:
from sklearn.feature_extraction import DictVectorizer

In [24]:
# One {column: value} dict per training row, restricted to the model features.
train_dicts = df_train.loc[:, categorical + numerical].to_dict(orient='records')

In [25]:
# Inspect the first record to check the dict layout fed to the vectorizer.
train_dicts[0]

{'job': 'entrepreneur',
 'marital': 'married',
 'education': 'secondary',
 'housing': 'yes',
 'contact': 'cellular',
 'month': 'nov',
 'poutcome': 'unknown',
 'age': 38,
 'balance': 0,
 'day': 17,
 'duration': 258,
 'campaign': 1,
 'pdays': -1,
 'previous': 0}

In [26]:
# sparse=False makes the vectorizer return a dense NumPy array rather than a sparse matrix.
dv = DictVectorizer(sparse=False)

In [27]:
# Learn the feature-to-column mapping from the training records only, then encode them.
X_train = dv.fit_transform(train_dicts)

In [28]:
# One {column: value} dict per validation row, using the same feature columns as training.
val_dicts = df_val.loc[:, categorical + numerical].to_dict(orient='records')

In [29]:
# Encode validation rows with the mapping fitted on the training data (transform only, no refit).
X_val = dv.transform(val_dicts)

In [30]:
# (rows, encoded columns) of the training matrix.
X_train.shape

(27125, 47)

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
# Baseline classifier; hyperparameters spelled out one per line for easy tuning.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)

In [33]:
# Fit the classifier on the encoded training matrix and its targets.
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [34]:
# Inspect the intercept learned by the fitted model.
model.intercept_[0]

-0.8459388925628432

In [35]:
# Learned weights, one per encoded column, rounded to 3 decimals for readability.
np.round(model.coef_[0], 3)

array([-3.000e-03,  0.000e+00, -8.200e-02,  2.460e-01,  6.100e-02,
       -1.153e+00,  4.000e-03,  4.000e-03, -4.070e-01, -2.330e-01,
       -5.600e-02, -1.500e-01, -5.500e-02, -7.910e-01,  6.600e-02,
       -2.680e-01, -2.160e-01, -2.350e-01, -8.300e-02,  3.780e-01,
       -2.460e-01, -1.740e-01,  2.050e-01, -1.740e-01,  3.000e-03,
       -1.010e-01, -2.770e-01, -4.330e-01, -1.360e-01,  1.410e-01,
       -6.180e-01,  2.580e-01, -2.870e-01, -7.520e-01, -8.750e-01,
        2.120e-01,  1.041e+00, -4.530e-01, -8.200e-01,  6.760e-01,
        6.320e-01, -1.000e-03, -7.300e-01, -4.740e-01,  1.504e+00,
       -1.145e+00,  3.000e-03])

In [36]:
# Hard class predictions on the training matrix (display only; the result is not stored).
model.predict(X_train)

array([0, 0, 0, ..., 0, 1, 0])

In [37]:
# Keep only column 1 of the predicted probabilities — presumably the positive class (y == 1); confirm via model.classes_.
y_pred = model.predict_proba(X_val)[:,1]

In [38]:
# Display the validation probabilities.
y_pred

array([0.01231358, 0.01103175, 0.14008586, ..., 0.01012541, 0.29476215,
       0.03294533])

In [39]:
# Boolean decision per validation row: True where the predicted probability reaches 0.5.
deposit_decision = y_pred >= 0.5

In [40]:
# Fraction of validation rows where the thresholded decision matches the label.
np.mean(y_val == deposit_decision)

0.9006966714585868

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
# Validation accuracy of the model trained on all encoded columns; used as
# the reference point for the comparisons that follow.
y_pred_baseline = model.predict(X_val)
baseline_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_baseline)
print(f'Baseline Accuracy: {baseline_accuracy}')

Baseline Accuracy: 0.9006966714585868


In [43]:
# Maps each encoded column name to |baseline accuracy - accuracy without it|.
accuracy_differences = {}

# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
# .tolist() keeps `feature_names` a plain list of str, as before.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = dv.get_feature_names_out().tolist()
else:
    feature_names = dv.get_feature_names()

In [44]:
# Leave-one-column-out check: refit without each encoded column and record how
# far validation accuracy moves from the full-model baseline.
for feature in feature_names:
    # Boolean mask selecting every encoded column except `feature`.
    feature_mask = np.array([f != feature for f in feature_names])

    X_train_excluded = X_train[:, feature_mask]
    X_val_excluded = X_val[:, feature_mask]

    # Fit a fresh estimator with the baseline hyperparameters instead of
    # refitting the shared `model`: otherwise `model` would end the loop
    # trained on one column fewer and no longer match X_train / X_val or
    # the `baseline_accuracy` computed from it.
    model_excluded = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_excluded.fit(X_train_excluded, y_train)

    y_pred_excluded = model_excluded.predict(X_val_excluded)
    accuracy_excluded = accuracy_score(y_val, y_pred_excluded)

    # Absolute change: only the size of the effect matters here, not its sign.
    accuracy_difference = np.abs(baseline_accuracy - accuracy_excluded)
    accuracy_differences[feature] = accuracy_difference

# The column whose removal changes accuracy the least (first one wins on ties).
least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)
print(f'Smallest difference: {least_useful_feature}')
print(f'Accuracy Difference: {accuracy_differences[least_useful_feature]}')

Smallest difference: balance
Accuracy Difference: 0.0


In [47]:
# Candidate inverse-regularisation strengths, smallest (strongest penalty) first.
C_values = [
    0.01,
    0.1,
    1,
    10,
    100,
]

# Filled by the sweep: C -> validation accuracy.
accuracy_results = dict()

In [48]:
# Train one model per candidate C and record its validation accuracy.
# Note: `model` is rebound on every pass, so it ends up as the last C tried.
for C in C_values:
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    ).fit(X_train, y_train)

    y_pred_val = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred_val)

    accuracy_results[C] = accuracy
    print(f'Accuracy for C={C}: {accuracy}')

Accuracy for C=0.01: 0.898595598805706
Accuracy for C=0.1: 0.9011390025434037
Accuracy for C=1: 0.9006966714585868
Accuracy for C=10: 0.9011390025434037
Accuracy for C=100: 0.9010284197721995


In [49]:
# Pick the C with the highest validation accuracy; on ties max() keeps the
# first key in insertion order, i.e. the smallest C tried.
best_C = max(accuracy_results, key=lambda c_value: accuracy_results[c_value])
print(f'Best C value: {best_C} with accuracy: {accuracy_results[best_C]}')

Best C value: 0.1 with accuracy: 0.9011390025434037
