In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('bank-full.csv', sep=';')
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [3]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [4]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [5]:
categorical = ['job','marital','education','housing','contact','month','poutcome']
numerical = ['age','balance','day','duration','campaign','pdays','previous']

In [6]:
df.columns = df.columns.str.lower().str.replace(' ','_')
for c in categorical:
    df[c] = df[c].str.lower().replace(' ','_')


In [7]:
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [8]:
df['age'].corr(df['balance'])

np.float64(0.09778273937134754)

In [9]:
df['day'].corr(df['campaign'])

np.float64(0.16249021632619293)

In [10]:
df['day'].corr(df['pdays'])

np.float64(-0.09304407377294052)

In [11]:
df['pdays'].corr(df['previous'])

np.float64(0.45481963548050075)

In [12]:
df['y'] = (df['y']=='yes').astype(int)
df.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [16]:
df_full_train.shape, df_train.shape, df_val.shape, df_test.shape

((36168, 17), (27126, 17), (9042, 17), (9043, 17))

In [17]:
y_train = df_train.y.values
y_test = df_test.y.values
y_val = df_val.y.values

y_train.shape, y_test.shape, y_val.shape

((27126,), (9043,), (9042,))

In [18]:
del df_train['y']
del df_test['y']
del df_val['y']

In [19]:
from sklearn.metrics import mutual_info_score

In [20]:
def mutual_score(series):
    return mutual_info_score(series, y_train)

In [21]:
mi = df_train[categorical].apply(mutual_score).round(2)
mi

job          0.01
marital      0.00
education    0.00
housing      0.01
contact      0.01
month        0.03
poutcome     0.03
dtype: float64

In [22]:
from sklearn.feature_extraction import DictVectorizer

In [23]:
dv = DictVectorizer(sparse = False)

In [24]:
train_dict = df_train[numerical + categorical].to_dict(orient='records')


In [25]:
X_train = dv.fit_transform(train_dict)
X_train.shape

(27126, 47)

In [26]:
val_dict = df_val[numerical + categorical].to_dict(orient='records')
X_val = dv.fit_transform(val_dict)
X_val.shape

(9042, 47)

In [27]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)


In [28]:
model.fit(X_train, y_train)

In [29]:
model.intercept_[0]

np.float64(-0.976486527757112)

In [30]:
model.coef_[0].shape

(47,)

In [31]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.01240549, 0.01017637, 0.15515956, ..., 0.05676404, 0.00908912,
       0.28499536], shape=(9042,))

In [32]:
paid = (y_pred >= 0.5)

In [33]:
(y_val == paid).mean().round(2)

np.float64(0.9)

In [34]:
values = dict(zip(dv.get_feature_names_out(),model.coef_[0]))

In [46]:
small = ['balance', 'pdays','age','duration','previous','day']
dv_small = DictVectorizer(sparse = False)

In [47]:
small_dict = df_train[small].to_dict(orient='records')
small_dict_val = df_val[small].to_dict(orient='records')

In [48]:
X_train_small = dv_small.fit_transform(small_dict)

In [49]:
X_val_small = dv_small.fit_transform(small_dict_val)


In [52]:
small_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)


small_model.fit(X_train_small, y_train)

In [53]:
small_model.fit(X_train_small, y_train)

In [55]:
w0 = small_model.intercept_[0]
w0

np.float64(-3.6314650353476567)

In [65]:
y_pred_small = small_model.predict_proba(X_val_small)[:,1]
paid = (y_pred_small >= 0.5).mean()

In [66]:
(y_val == paid).mean()

np.float64(0.0)