In [184]:
# Import libraries
import pandas as pd

In [185]:
# Load the semicolon-separated bank marketing file into a DataFrame
df = pd.read_csv('bank-full.csv', sep=';')

In [186]:
# Show the first five rows
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [187]:
# Keep only the features used in this analysis (plus the target column y)
columns = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'y',
]
df = df[columns]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [188]:
# Count missing values per column
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [189]:
# Most frequent observation (mode) for the column education.
# mode() returns a Series of the most common value(s); the first entry is reported.
# The earlier bare `value_counts()` call was removed: as a non-final expression
# its result was neither displayed nor stored.
print('Most frequent observation (mode) for the column education: ', df.education.mode()[0])

Most frequent observation (mode) for the column education:  secondary


In [190]:
# Inspect the dtype of each column; later cells split the features by dtype.
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [191]:
# Numerical features: every column whose dtype is int64
numerical = [name for name, dtype in df.dtypes.items() if dtype == 'int64']
numerical

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [192]:
# Pairwise correlation between the numerical features, rounded to 2 decimals
Corr_Matrix = df[numerical].corr().round(2)
print(Corr_Matrix)

           age  balance   day  duration  campaign  pdays  previous
age       1.00     0.10 -0.01     -0.00      0.00  -0.02      0.00
balance   0.10     1.00  0.00      0.02     -0.01   0.00      0.02
day      -0.01     0.00  1.00     -0.03      0.16  -0.09     -0.05
duration -0.00     0.02 -0.03      1.00     -0.08  -0.00      0.00
campaign  0.00    -0.01  0.16     -0.08      1.00  -0.09     -0.03
pdays    -0.02     0.00 -0.09     -0.00     -0.09   1.00      0.45
previous  0.00     0.02 -0.05      0.00     -0.03   0.45      1.00


In [193]:
# Encode the y variable as 0/1 (1 where the original value is 'yes').
# The guard makes the cell safe to re-run: without it, a second execution would
# compare the already-encoded integers against 'yes' and set every label to 0.
if not pd.api.types.is_integer_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)
df.y.value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

In [194]:
from sklearn.model_selection import train_test_split

In [195]:
# 60%/20%/20% train/val/test split: hold out 20% as the test set first, then
# take 25% of the remaining 80% (i.e. 20% of the whole) as the validation set.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=42,
)

In [196]:
# Pull the target out of each split: pop() returns the column and removes it
# from the frame, so the feature frames no longer contain y afterwards.
y_train = df_train.pop('y').values
y_val = df_val.pop('y').values

In [197]:
# Categorical features: every column whose dtype is object.
# y is left out only if it is no longer an object column when this runs.
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'object']
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [198]:
from sklearn.metrics import mutual_info_score

In [199]:
def calculate_mi(series):
    """Return the mutual information score between `series` and `df_train_full.y`."""
    target = df_train_full.y
    return mutual_info_score(series, target)

# Score each categorical feature against the target, highest score first
mi_by_feature = {name: calculate_mi(df_train_full[name]) for name in categorical}
df_mi = pd.Series(mi_by_feature).sort_values(ascending=False).to_frame(name='MI')
df_mi


Unnamed: 0,MI
poutcome,0.029257
month,0.024774
contact,0.014164
housing,0.0098
job,0.007765
education,0.002458
marital,0.002019


In [200]:
from sklearn.feature_extraction import DictVectorizer

In [201]:
# One dict per training row (feature name -> value), as DictVectorizer expects
train_dict = df_train.loc[:, categorical + numerical].to_dict(orient='records')

In [202]:
# Peek at the first training record to check the dict layout
train_dict[0]

{'job': 'technician',
 'marital': 'single',
 'education': 'tertiary',
 'housing': 'yes',
 'contact': 'cellular',
 'month': 'aug',
 'poutcome': 'unknown',
 'age': 32,
 'balance': 1100,
 'day': 11,
 'duration': 67,
 'campaign': 1,
 'pdays': -1,
 'previous': 0}

In [203]:
# Fit the vectorizer on the training records only; fit() returns the fitted
# vectorizer itself, so construction and fitting can be chained.
dv = DictVectorizer(sparse=False).fit(train_dict)
dv

In [204]:
# Turn the training records into a feature matrix with the fitted vectorizer
# (a dense array, since the vectorizer was created with sparse=False)
X_train = dv.transform(train_dict)

In [205]:
# Sanity check: (number of training rows, number of encoded features)
X_train.shape

(27126, 47)

In [206]:
# List the feature names learned by the fitted vectorizer, in column order
dv.feature_names_

['age',
 'balance',
 'campaign',
 'contact=cellular',
 'contact=telephone',
 'contact=unknown',
 'day',
 'duration',
 'education=primary',
 'education=secondary',
 'education=tertiary',
 'education=unknown',
 'housing=no',
 'housing=yes',
 'job=admin.',
 'job=blue-collar',
 'job=entrepreneur',
 'job=housemaid',
 'job=management',
 'job=retired',
 'job=self-employed',
 'job=services',
 'job=student',
 'job=technician',
 'job=unemployed',
 'job=unknown',
 'marital=divorced',
 'marital=married',
 'marital=single',
 'month=apr',
 'month=aug',
 'month=dec',
 'month=feb',
 'month=jan',
 'month=jul',
 'month=jun',
 'month=mar',
 'month=may',
 'month=nov',
 'month=oct',
 'month=sep',
 'pdays',
 'poutcome=failure',
 'poutcome=other',
 'poutcome=success',
 'poutcome=unknown',
 'previous']

In [207]:
from sklearn.linear_model import LogisticRegression

In [208]:
# Baseline logistic regression trained on all encoded features
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [209]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no refit)
val_dict = df_val.loc[:, categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [210]:
# Preview the predicted class probabilities for the validation set
model.predict_proba(X_val)

array([[0.98615867, 0.01384133],
       [0.99008584, 0.00991416],
       [0.84144429, 0.15855571],
       ...,
       [0.94843472, 0.05156528],
       [0.9908712 , 0.0091288 ],
       [0.72878105, 0.27121895]])

In [211]:
# Keep only column index 1 of the probability matrix
# (presumably the positive class, since y was encoded as 0/1 -- confirm via model.classes_)
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.01384133, 0.00991416, 0.15855571, ..., 0.05156528, 0.0091288 ,
       0.27121895])

In [212]:
# Hard predictions: flag rows whose predicted probability exceeds the 0.5 threshold
positive = y_pred > 0.5

In [213]:
# Validation accuracy: share of rows where the thresholded prediction matches the label
matches = positive == y_val
accuracy = matches.mean()
print('accuracy: ', round(accuracy, 2))

accuracy:  0.9


In [214]:
# List the columns present in the training frame
df_train.columns.tolist()

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [215]:
# Identify the least important feature: retrain without each feature in turn
# and compare the validation accuracy against the full-feature baseline
# stored in `accuracy`.

features = df_train.columns.tolist()

rows = []

for f in features:
    # drop() returns a new frame, so df_train / df_val themselves are untouched.
    train_dict_subset = df_train.drop(columns=[f]).to_dict(orient='records')
    val_dict_subset = df_val.drop(columns=[f]).to_dict(orient='records')

    # A separate vectorizer per subset, fitted on the training records only.
    dv_subset = DictVectorizer(sparse=False)
    X_train_subset = dv_subset.fit_transform(train_dict_subset)
    X_val_subset = dv_subset.transform(val_dict_subset)

    # Use a fresh estimator with the baseline hyperparameters rather than
    # refitting the global `model`; refitting it here would leave `model`
    # trained on a reduced feature matrix and no longer compatible with X_val.
    model_subset = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_subset.fit(X_train_subset, y_train)

    # Loop-local names keep the baseline `y_pred` / `positive` intact.
    y_pred_subset = model_subset.predict_proba(X_val_subset)[:, 1]
    positive_subset = y_pred_subset > 0.5
    accuracy_subset = (y_val == positive_subset).mean()

    # NOTE: the "droped_feature" key (sic) is kept unchanged so any code that
    # reads this column by name keeps working.
    rows.append({"droped_feature": f, "accuracy": accuracy_subset, "diff": abs(accuracy_subset-accuracy)})

print('Original accuracy: ', accuracy)
print()

# Smallest absolute change in accuracy first.
df_results = (
    pd.DataFrame(rows)
    .sort_values(by="diff", ascending=True)
    .reset_index(drop=True)
)
df_results


Original accuracy:  0.9010174740101747



Unnamed: 0,droped_feature,accuracy,diff
0,education,0.901017,0.0
1,marital,0.901128,0.000111
2,campaign,0.900907,0.000111
3,age,0.900796,0.000221
4,balance,0.900796,0.000221
5,day,0.901239,0.000221
6,contact,0.900686,0.000332
7,pdays,0.900686,0.000332
8,previous,0.900575,0.000442
9,job,0.90157,0.000553


In [216]:
# Identify the value of C that produces the best accuracy in the model.
# NOTE: `model`, `y_pred` and `positive` are reassigned on every iteration, so
# after the loop they correspond to the last value in C_list.

C_list = [0.01, 0.1, 1, 10, 100]

for C in C_list:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    # Predict once per model; the earlier extra predict_proba call discarded
    # its result and only doubled the inference work.
    y_pred = model.predict_proba(X_val)[:, 1]
    positive = y_pred > 0.5

    accuracy_reg = (y_val == positive).mean()

    print('accuracy for C=%s :' % C, round(accuracy_reg, 3))


accuracy for C=0.01 : 0.898
accuracy for C=0.1 : 0.901
accuracy for C=1 : 0.901
accuracy for C=10 : 0.901
accuracy for C=100 : 0.901
