In [1]:
import numpy as np
import pandas as pd

In [2]:
# Download data (manually extracted relevant data file)
# !wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

## Data preparation

In [3]:
df = pd.read_csv('../data/bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Select the relevant subset of features from the full DataFrame
features_to_select = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'y'
]

df_sub = df[features_to_select]
df_sub.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# Check if missing values are present in the DataFrame
df_sub.isnull().sum() # no missing values

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

## Questions 1 and 2

In [6]:
# Question 1: What is the most frequent observation (mode) for the column education?

# Options:
# unknown
# primary
# secondary
# tertiary

df_sub['education'].value_counts().sort_values(ascending=False).index[0]

'secondary'

In [7]:
# Question 2: Create the correlation matrix for the numerical features of your dataset.
# In a correlation matrix, you compute the correlation coefficient between every pair
# of features.

# What are the two features that have the biggest correlation?

# age and balance
# day and campaign
# day and pdays
# pdays and previous

numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
df_num = df_sub[numerical]
df_num.corr() # pdays and previous

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


## More data preparation

In [8]:
# Target encoding: T/F => 1/0
df_sub['y'] = (df_sub.y == 'yes').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['y'] = (df_sub.y == 'yes').astype(int)


In [9]:
# Split the data into train/val/test sets (60-20-20 split)
from sklearn.model_selection import train_test_split

# Perform the split according to the chosen ratio
df_full_train, df_test = train_test_split(df_sub, test_size=0.2, random_state=42)
df_train, df_valid = train_test_split(df_full_train, test_size=(20/80), random_state=42)

# Create target vector for each split
y_train = df_train['y'].values
y_valid = df_valid['y'].values
y_test = df_valid['y'].values

# Remove the target variable from the input DataFrames in order to prevent data leakage
del df_train['y']
del df_valid['y']
del df_test['y']

In [10]:
df_train.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
20326,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown
24301,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown
38618,49,blue-collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown
18909,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown
23081,31,self-employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown


In [11]:
df_train.shape, y_train.shape

((27126, 14), (27126,))

## Questions 3 - 6

In [12]:
# Question 3: Calculate the mutual information score between y and other categorical
# variables in the dataset. Use the training set only. Round the scores to 2 decimals
# using round(score, 2). Which of these variables has the biggest mutual information score?

# contact
# education
# housing
# poutcome

from sklearn.metrics import mutual_info_score

# Categorical variables
categorical = [col for col in list(df_train.columns) if col not in numerical]

# Function to compute mutual information score between any column (Series) and target variable y
def mutual_info_y_score(col):
    return mutual_info_score(col, pd.Series(y_train))

# Apply the function to categorical DataFrame to obtain mutual info scores between each col and y
df_train[categorical].apply(mutual_info_y_score).sort_values(ascending=False).apply(lambda x : round(x, 2)) # poutcome

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

In [13]:
# Question 4
# Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
# Fit the model on the training dataset.
# To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
# What accuracy did you get?

# 0.6
# 0.7
# 0.8
# 0.9

from sklearn.feature_extraction import DictVectorizer

# Convert train, valid, test DataFrames, each to a list of dictionaries
train_dicts = df_train.to_dict(orient='records')
valid_dicts = df_valid.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

# Initialize and fit/transform DictVectorizer on train_dicts. Then, transform valid_dicts and test_dicts.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_valid = dv.transform(valid_dicts)
X_test = dv.transform(test_dicts)

In [14]:
from sklearn.linear_model import LogisticRegression

# Initialize LogReg model and fit on train data
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict X_valid, using the default threshold of 0.5 (could also do this via model.predict(X_valid))
y_pred = model.predict_proba(X_valid)[:, 1] # p(yi = 1 | xi)

# Compute validation accuracy
valid_acc = ((y_pred > 0.5) == y_valid).mean()
valid_acc.round(2)

0.9

In [15]:
# Question 5
# Let's find the least useful feature using the feature elimination technique.
# Train a model with all these features (using the same parameters as in Q4).
# Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
# Which of following feature has the smallest difference?

# age
# balance
# marital
# previous

# Note: The difference doesn't have to be positive.

# Full set of features we want to consider for this problem
feature_set = ['age', 'balance', 'marital', 'previous']

# Same process as above: create train/valid/test feature matrices with one-hot encoded features,
# train LogReg model with same parameters, and evaluate using accuracy on validation set
train_dicts = df_train[feature_set].to_dict(orient='records')
valid_dicts = df_valid[feature_set].to_dict(orient='records')
test_dicts = df_test[feature_set].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_valid = dv.transform(valid_dicts)
X_test = dv.transform(test_dicts)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
full_valid_acc = (model.predict(X_valid) == y_valid).mean()
full_valid_acc

0.880336208803362

In [16]:
# Dictionary to store differences in validation accuracies between model using all features
# and model using a subset of features excluding a single feature
valid_acc_diffs = {}

# Now, we do the same process for subsets of the original set of features, where we remove each
# feature in turn and train a model, recording the validation accuracy
for i in range(len(feature_set)):
    cur_feat = feature_set[i]
    feature_subset = feature_set[:i] + feature_set[i+1:]
    
    train_dicts = df_train[feature_subset].to_dict(orient='records')
    valid_dicts = df_valid[feature_subset].to_dict(orient='records')
    test_dicts = df_test[feature_subset].to_dict(orient='records')
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_valid = dv.transform(valid_dicts)
    X_test = dv.transform(test_dicts)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    valid_acc = (model.predict(X_valid) == y_valid).mean()
    valid_acc_diffs['without_%s' % cur_feat] = (full_valid_acc - valid_acc).round(4)

In [17]:
valid_acc_diffs # balance

{'without_age': -0.0001,
 'without_balance': 0.0,
 'without_marital': 0.0001,
 'without_previous': -0.0013}

In [18]:
# Question 6
# Now let's train a regularized logistic regression.
# Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
# Train models using all the features as in Q4.
# Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
# Which of these C leads to the best accuracy on the validation set?

# 0.01
# 0.1
# 1
# 10
# 100

# Note: If there are multiple options, select the smallest C.

# Prepare the data including all features (as in Q4)
train_dicts = df_train.to_dict(orient='records')
valid_dicts = df_valid.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_valid = dv.transform(valid_dicts)
X_test = dv.transform(test_dicts)

# Values of C that we want to try
c_vals = [0.01, 0.1, 1, 10, 100]

# Dictionary to store validation accuracies for each setting of C
valid_accs = {}

for c in c_vals:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    valid_accs['C_%d' % c] = (model.predict(X_valid) == y_valid).mean()

In [19]:
valid_accs # C=100

{'C_0': 0.9009068790090687,
 'C_1': 0.9010174740101747,
 'C_10': 0.9010174740101747,
 'C_100': 0.9011280690112807}