# Homework Week 3: Classification

## 1. Libraries

In [448]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

## 2. Load dataset

In [449]:
# Columns to load from the CSV, in file order; the last entry ("y") is the target.
usecols = (
    "age job marital education balance housing contact "
    "day month duration campaign pdays previous poutcome y"
).split()

In [450]:
# Read the semicolon-separated file, keeping only `usecols`, and normalise the
# header (lower case, spaces -> underscores) in the same chained expression.
df = (
    pd.read_csv("bank+marketing/bank/bank-full.csv", sep=";", usecols=usecols)
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
)

df

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [451]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


**Question 1
What is the most frequent observation (mode) for the column education?**

In [452]:
# Summary of the object-dtype columns only; the `top` row is each column's most frequent value.
df.describe(include=["O"])

Unnamed: 0,job,marital,education,housing,contact,month,poutcome,y
count,45211,45211,45211,45211,45211,45211,45211,45211
unique,12,3,4,2,3,12,4,2
top,blue-collar,married,secondary,yes,cellular,may,unknown,no
freq,9732,27214,23202,25130,29285,13766,36959,39922


## 3. Exploratory data analysis (EDA)

In [453]:
# Derive both feature lists from the frame with the target removed.
# `y` is text when loaded but is re-encoded to 0/1 in place further down; without the
# explicit drop, re-running this cell after that step would put `y` into `num_cols`
# and leak the target into the model's features.
feature_frame = df.drop(columns="y")
num_cols = feature_frame.select_dtypes(include=np.number).columns.tolist()
cat_cols = feature_frame.select_dtypes(exclude=np.number).columns.tolist()

print(num_cols)
print(cat_cols)

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']


### 1. Number of missing values

In [454]:
# Number of missing values per column.
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### 2. Correlation

**Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.
What are the two features that have the biggest correlation?**

In [455]:
# Pairwise correlation of the numerical features.
corr = df[num_cols].corr()

# Hide the diagonal and the upper triangle (set them to NaN) so that every
# feature pair appears exactly once in the rendered table.
mask = np.triu(np.ones(corr.shape, dtype=bool))
corr = corr.mask(mask)

corr_styler = corr.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
corr_styler = corr_styler.highlight_null(color='#f1f1f1')  # Color NaNs grey
corr_styler.format(precision=2)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,,,,,,,
balance,0.1,,,,,,
day,-0.01,0.0,,,,,
duration,-0.0,0.02,-0.03,,,,
campaign,0.0,-0.01,0.16,-0.08,,,
pdays,-0.02,0.0,-0.09,-0.0,-0.09,,
previous,0.0,0.02,-0.05,0.0,-0.03,0.45,


## 4. Data pre-processing

In [456]:
# Target encoding: "yes" -> 1, everything else -> 0.
# The guard makes the cell safe to re-run: once `y` is numeric, comparing it to "yes"
# again would turn every label into 0, so the encoding is applied only while `y` is
# still non-numeric.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df["y"] = (df["y"] == "yes").astype(int)

### 1. Splitting data into train, test, and validation

In [457]:
# Hold out 20% of all rows for testing, then take 25% of the remaining 80%
# (i.e. 20% of the full data) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [458]:
# Row counts of the train, test and validation partitions (in that order).
len(df_train), len(df_test), len(df_val)

(27126, 9043, 9042)

In [459]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index in each partition.
df_train, df_test, df_val = (
    part.reset_index(drop=True) for part in (df_train, df_test, df_val)
)

In [460]:
# Separate every partition into its feature frame (X_*) and its target series (y_*).
X_train, y_train = df_train.drop(columns='y'), df_train['y']
X_test, y_test = df_test.drop(columns='y'), df_test['y']
X_val, y_val = df_val.drop(columns='y'), df_val['y']

In [461]:
# Share of the full dataset (in percent) held by the train, test and validation partitions.
for part in (X_train, X_test, X_val):
    print(len(part) * 100 / len(df))

59.99867288934109
20.001769480878547
19.999557629780362


### 1. Mutual information

**Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?**

In [462]:
def calculate_mi(series):
    """Return the mutual information score between `series` and the training target `y_train`."""
    return mutual_info_score(series, y_train)


# Read the categorical columns from `df_train` rather than `X_train`: `X_train` is later
# rebound to the vectorized matrix, after which column selection on it no longer works
# and this cell could not be re-run.
mi_scores = df_train[cat_cols].apply(calculate_mi)
df_mi = mi_scores.sort_values(ascending=False).to_frame(name='MI')

# The table has one row per categorical feature, so show it whole instead of
# overlapping head()/tail() slices.
display(df_mi)

Unnamed: 0,MI
poutcome,0.029533
month,0.02509
contact,0.013356
housing,0.010343
job,0.007316


Unnamed: 0,MI
contact,0.013356
housing,0.010343
job,0.007316
education,0.002697
marital,0.00205


### 2. One-hot encoding

In [463]:
# Build the records from `df_train` (target dropped) instead of `X_train`: `X_train` is
# rebound to the vectorized matrix a few cells below, so selecting columns from it
# would fail if this cell were run again.
train_dict = df_train.drop(columns='y')[cat_cols + num_cols].to_dict(orient='records')

In [464]:
# Peek at the first training record as a plain dict.
train_dict[0]

{'job': 'technician',
 'marital': 'single',
 'education': 'tertiary',
 'housing': 'yes',
 'contact': 'cellular',
 'month': 'aug',
 'poutcome': 'unknown',
 'age': 32,
 'balance': 1100,
 'day': 11,
 'duration': 67,
 'campaign': 1,
 'pdays': -1,
 'previous': 0}

In [465]:
# Fit the vectorizer on the training records only, so the encoded feature set is
# defined by the training data.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [466]:
# NOTE: this rebinds X_train from the feature DataFrame to the vectorized matrix,
# so X_train no longer supports selecting columns by name after this cell.
X_train = dv.transform(train_dict)

In [467]:
# (rows, encoded feature columns) of the vectorized training matrix.
X_train.shape

(27126, 47)

In [468]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. `.tolist()` keeps the output a plain list.
dv.get_feature_names_out().tolist()



['age',
 'balance',
 'campaign',
 'contact=cellular',
 'contact=telephone',
 'contact=unknown',
 'day',
 'duration',
 'education=primary',
 'education=secondary',
 'education=tertiary',
 'education=unknown',
 'housing=no',
 'housing=yes',
 'job=admin.',
 'job=blue-collar',
 'job=entrepreneur',
 'job=housemaid',
 'job=management',
 'job=retired',
 'job=self-employed',
 'job=services',
 'job=student',
 'job=technician',
 'job=unemployed',
 'job=unknown',
 'marital=divorced',
 'marital=married',
 'marital=single',
 'month=apr',
 'month=aug',
 'month=dec',
 'month=feb',
 'month=jan',
 'month=jul',
 'month=jun',
 'month=mar',
 'month=may',
 'month=nov',
 'month=oct',
 'month=sep',
 'pdays',
 'poutcome=failure',
 'poutcome=other',
 'poutcome=success',
 'poutcome=unknown',
 'previous']

## 5. Training Model: Logistic regression

In [469]:
# Hyper-parameters are the ones prescribed by Question 4 so that results are
# reproducible across scikit-learn versions.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [470]:
# Build the validation records from `df_val` (target dropped) rather than from `X_val`:
# the next line rebinds `X_val` to the vectorized matrix, so selecting columns from
# `X_val` would fail on a second run of this cell.
val_dict = df_val.drop(columns='y')[cat_cols + num_cols].to_dict(orient='records')
# Only transform here; the vectorizer was fitted on the training records.
X_val = dv.transform(val_dict)

In [471]:
# Per-row class probabilities on the validation matrix (one column per class).
model.predict_proba(X_val)

array([[0.9876019 , 0.0123981 ],
       [0.98988303, 0.01011697],
       [0.84530916, 0.15469084],
       ...,
       [0.94407911, 0.05592089],
       [0.99094851, 0.00905149],
       [0.71549047, 0.28450953]])

In [472]:
# Keep the second column (index 1): with the target encoded as 0/1 this is the
# probability of the positive class (y == 1).
y_pred = model.predict_proba(X_val)[:, 1]

In [473]:
# Inspect the positive-class probabilities.
y_pred

array([0.0123981 , 0.01011697, 0.15469084, ..., 0.05592089, 0.00905149,
       0.28450953])

**Question 4
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.
What accuracy did you get?**

In [474]:
# Hard decision at the 0.5 probability threshold.
y_pred_decision = (y_pred >= 0.5).astype(int)

# Accuracy = share of validation rows whose predicted label equals the true label.
baseline_accuracy = (y_pred_decision == y_val).mean()
correct_predictions = baseline_accuracy  # same value under the older name; a fraction, not a count

correct_predictions

0.9007962840079629

## 6. Evaluation

In [475]:
# Pair each encoded feature with its fitted coefficient (rounded to 3 decimals).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement.
dict(zip(dv.get_feature_names_out().tolist(), model.coef_[0].round(3)))



{'age': 0.001,
 'balance': 0.0,
 'campaign': -0.078,
 'contact=cellular': 0.254,
 'contact=telephone': 0.081,
 'contact=unknown': -1.313,
 'day': 0.009,
 'duration': 0.004,
 'education=primary': -0.444,
 'education=secondary': -0.251,
 'education=tertiary': -0.055,
 'education=unknown': -0.228,
 'housing=no': -0.143,
 'housing=yes': -0.835,
 'job=admin.': 0.095,
 'job=blue-collar': -0.242,
 'job=entrepreneur': -0.264,
 'job=housemaid': -0.331,
 'job=management': -0.081,
 'job=retired': 0.271,
 'job=self-employed': -0.293,
 'job=services': -0.131,
 'job=student': 0.287,
 'job=technician': -0.148,
 'job=unemployed': 0.032,
 'job=unknown': -0.172,
 'marital=divorced': -0.35,
 'marital=married': -0.476,
 'marital=single': -0.151,
 'month=apr': -0.007,
 'month=aug': -0.713,
 'month=dec': 0.388,
 'month=feb': -0.328,
 'month=jan': -1.159,
 'month=jul': -1.043,
 'month=jun': 0.301,
 'month=mar': 1.45,
 'month=may': -0.501,
 'month=nov': -0.945,
 'month=oct': 0.785,
 'month=sep': 0.795,
 'pday

**Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of the following features has the smallest difference?**

In [476]:
# List of features to test
features_to_test = ['age', 'balance', 'marital', 'previous']


def accuracy_without(feature):
    """Train the Q4 model with `feature` removed and return its validation accuracy.

    Everything is kept in local variables so that the notebook-level objects used by
    the baseline model (`dv`, `X_val`, `y_pred`, `num_cols`, `cat_cols`, ...) are not
    overwritten by this experiment.

    Parameters
    ----------
    feature : str
        Name of the column to drop from both the training and validation frames.

    Returns
    -------
    float
        Fraction of validation rows classified correctly at a 0.5 threshold.
    """
    train_records = df_train.drop(columns=['y', feature]).to_dict(orient='records')
    val_records = df_val.drop(columns=['y', feature]).to_dict(orient='records')

    # Fit the encoder on the training records only, then reuse it for validation.
    vectorizer = DictVectorizer(sparse=False)
    X_train_dropped = vectorizer.fit_transform(train_records)
    X_val_dropped = vectorizer.transform(val_records)

    model_dropped = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_dropped.fit(X_train_dropped, df_train['y'])

    # Positive-class probability -> hard decision at 0.5 -> share of correct predictions.
    positive_proba = model_dropped.predict_proba(X_val_dropped)[:, 1]
    decisions = (positive_proba >= 0.5).astype(int)
    return (decisions == df_val['y']).mean()


# Store accuracy differences (baseline accuracy minus accuracy without the feature)
accuracy_diff = {}

for feature in features_to_test:
    dropped_accuracy = accuracy_without(feature)

    # Signed difference: a negative value means the model did better without the feature.
    diff = baseline_accuracy - dropped_accuracy
    accuracy_diff[feature] = diff
    print(f"Accuracy without {feature}: {dropped_accuracy}, Difference: {diff}")

Accuracy without age: 0.9010174740101747, Difference: -0.00022119000221187957
Accuracy without balance: 0.9007962840079629, Difference: 0.0
Accuracy without marital: 0.9014598540145985, Difference: -0.0006635700066356387
Accuracy without previous: 0.9014598540145985, Difference: -0.0006635700066356387


**Question 6
Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these values of C leads to the best accuracy on the validation set?**

In [477]:
# Candidate values of the inverse regularisation strength C to compare
params_to_test = [0.01, 0.1, 1, 10, 100]

# The feature matrices do not depend on C, so they are built once, outside the loop.
# Cell-local names are used on purpose so that the notebook-level `dv`, `X_val`,
# `y_pred`, `num_cols` and `cat_cols` belonging to the baseline model stay intact.
c_vectorizer = DictVectorizer(sparse=False)
X_train_c = c_vectorizer.fit_transform(df_train.drop(columns='y').to_dict(orient='records'))
X_val_c = c_vectorizer.transform(df_val.drop(columns='y').to_dict(orient='records'))

# Unrounded validation accuracy for each C, kept for later inspection
accuracy_by_c = {}

for param in params_to_test:
    model_c = LogisticRegression(solver='liblinear', C=param, max_iter=1000, random_state=42)
    model_c.fit(X_train_c, df_train['y'])

    # Positive-class probability -> hard decision at 0.5 -> share of correct predictions.
    positive_proba = model_c.predict_proba(X_val_c)[:, 1]
    param_accuracy = ((positive_proba >= 0.5).astype(int) == df_val['y']).mean()
    accuracy_by_c[param] = param_accuracy

    print(f"Accuracy with C {param}: {np.round(param_accuracy, 3)}")

Accuracy with C 0.01: 0.899
Accuracy with C 0.1: 0.9
Accuracy with C 1: 0.901
Accuracy with C 10: 0.901
Accuracy with C 100: 0.901
