In [3]:
import pandas as pd
import numpy as np
from rich.jupyter import print
import seaborn as sns
import matplotlib.pyplot as plt

%load_ext rich

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


### Dataset
In this homework, we will use the Bank Marketing dataset. Download it from here.

Or you can do it with wget:

`wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip`

We need to take bank/bank-full.csv file from the downloaded zip-file.
In this dataset our desired target for classification task will be y variable - has the client subscribed a term deposit or not.

### Features


For the rest of the homework, you'll need to use only these columns:

- age,
- job,
- marital,
- education,
- balance,
- housing,
- contact,
- day,
- month,
- duration,
- campaign,
- pdays,
- previous,
- poutcome,
- y

In [4]:
df = pd.read_csv("data/bank_marketing/bank-full.csv", sep=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Data preparation
- Select only the features from above.
- Check if the missing values are presented in the features.


In [5]:
features = [
    "age",
    "job",
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y",
]
df_new = df[features].copy()

In [6]:
df_new.isnull().sum()


age          [1;36m0[0m
job          [1;36m0[0m
marital      [1;36m0[0m
education    [1;36m0[0m
balance      [1;36m0[0m
housing      [1;36m0[0m
contact      [1;36m0[0m
day          [1;36m0[0m
month        [1;36m0[0m
duration     [1;36m0[0m
campaign     [1;36m0[0m
pdays        [1;36m0[0m
previous     [1;36m0[0m
poutcome     [1;36m0[0m
y            [1;36m0[0m
dtype: int64

In [7]:
df_new.dtypes


age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [8]:
categorical = list(df_new.dtypes[df_new.dtypes == "object"].index)
categorical.remove("y")
categorical

[1m[[0m[32m'job'[0m, [32m'marital'[0m, [32m'education'[0m, [32m'housing'[0m, [32m'contact'[0m, [32m'month'[0m, [32m'poutcome'[0m[1m][0m

### Question 1
#### What is the most frequent observation (mode) for the column education?

- unknown
- primary
- secondary
- tertiary

In [9]:
df_new.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [10]:
frequent_observation = df_new["education"].mode()[0]
print(
    f"The most frequent observation (mode) for the column education is: {frequent_observation}."
)

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- `age` and `balance`
- `day` and `campaign`
- `day` and `pdays`
- `pdays` and `previous`

In [11]:
numerical = list(set(features) - set(categorical))
numerical.remove("y")
numerical

[1m[[0m[32m'age'[0m, [32m'previous'[0m, [32m'campaign'[0m, [32m'pdays'[0m, [32m'duration'[0m, [32m'day'[0m, [32m'balance'[0m[1m][0m

In [12]:
correlation_matrix = df_new[numerical].corr()
correlation_matrix

Unnamed: 0,age,previous,campaign,pdays,duration,day,balance
age,1.0,0.001288,0.00476,-0.023758,-0.004648,-0.00912,0.097783
previous,0.001288,1.0,-0.032855,0.45482,0.001203,-0.05171,0.016674
campaign,0.00476,-0.032855,1.0,-0.088628,-0.08457,0.16249,-0.014578
pdays,-0.023758,0.45482,-0.088628,1.0,-0.001565,-0.093044,0.003435
duration,-0.004648,0.001203,-0.08457,-0.001565,1.0,-0.030206,0.02156
day,-0.00912,-0.05171,0.16249,-0.093044,-0.030206,1.0,0.004503
balance,0.097783,0.016674,-0.014578,0.003435,0.02156,0.004503,1.0


In [13]:
# correlation_between_two = correlation_matrix.unstack()
# correlation_between_two.sort_values()

In [14]:
corr_dict = {
    '["age"]["balance"]': correlation_matrix["age"]["balance"],
    '["day"]["campaign"],': correlation_matrix["day"]["campaign"],
    '["day"]["pdays"]': correlation_matrix["day"]["pdays"],
    '["pdays"]["previous"]': correlation_matrix["pdays"]["previous"],
}
max_corr = max(corr_dict, key=corr_dict.get)
print(
    f"The two features that have the biggest correlation are : {max_corr} with corr {correlation_matrix["pdays"]["previous"]}"
)


### Target encoding

* Now we want to encode the `y` variable.
* Let's replace the values `yes`/`no` with `1`/`0`.



In [15]:
df_new.y


[1;36m0[0m         no
[1;36m1[0m         no
[1;36m2[0m         no
[1;36m3[0m         no
[1;36m4[0m         no
        [33m...[0m 
[1;36m45206[0m    yes
[1;36m45207[0m    yes
[1;36m45208[0m    yes
[1;36m45209[0m     no
[1;36m45210[0m     no
Name: y, Length: [1;36m45211[0m, dtype: object

In [16]:
df_new.y = (df_new.y == "yes").astype(int)
df_new.y


[1;36m0[0m        [1;36m0[0m
[1;36m1[0m        [1;36m0[0m
[1;36m2[0m        [1;36m0[0m
[1;36m3[0m        [1;36m0[0m
[1;36m4[0m        [1;36m0[0m
        ..
[1;36m45206[0m    [1;36m1[0m
[1;36m45207[0m    [1;36m1[0m
[1;36m45208[0m    [1;36m1[0m
[1;36m45209[0m    [1;36m0[0m
[1;36m45210[0m    [1;36m0[0m
Name: y, Length: [1;36m45211[0m, dtype: int64

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value `y` is not in your dataframe.

In [17]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df_new, test_size=0.2, random_state=42)
len(df_full_train), len(df_test)

[1m([0m[1;36m36168[0m, [1;36m9043[0m[1m)[0m

In [18]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

[1m([0m[1;36m27126[0m, [1;36m9042[0m, [1;36m9043[0m[1m)[0m

In [19]:
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [20]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [21]:
del df_train["y"]
del df_val["y"]
del df_test["y"]

In [22]:
df_full_train = df_full_train.reset_index(drop=True)
df_full_train

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,41,blue-collar,married,primary,849,yes,unknown,15,may,72,1,-1,0,unknown,0
1,49,technician,married,primary,1415,yes,cellular,30,jul,269,2,-1,0,unknown,0
2,42,admin.,married,secondary,3842,no,cellular,31,jul,130,4,-1,0,unknown,0
3,37,management,single,tertiary,-119,yes,unknown,11,jun,375,11,-1,0,unknown,0
4,56,blue-collar,married,primary,3498,no,cellular,15,apr,264,2,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36163,44,housemaid,single,primary,1059,no,unknown,18,jun,2093,1,-1,0,unknown,1
36164,23,student,single,tertiary,508,no,cellular,8,sep,210,1,92,1,failure,0
36165,34,technician,divorced,tertiary,1317,yes,cellular,15,may,239,1,-1,0,unknown,0
36166,33,retired,married,secondary,165,no,unknown,7,may,111,1,-1,0,unknown,0


### Question 3

* Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
  
- `contact`
- `education`
- `housing`
- `poutcome`


In [23]:
from sklearn.metrics import mutual_info_score

In [24]:
def mutual_info_y_score(series):
    return mutual_info_score(labels_true=series, labels_pred=df_full_train.y)

In [25]:
mi = df_full_train[categorical].apply(mutual_info_y_score)
mi.sort_values(ascending=False)


poutcome     [1;36m0.029257[0m
month        [1;36m0.024774[0m
contact      [1;36m0.014164[0m
housing      [1;36m0.009800[0m
job          [1;36m0.007765[0m
education    [1;36m0.002458[0m
marital      [1;36m0.002019[0m
dtype: float64

In [26]:
print(
    f"The variable that has the biggest mutual information score is 'poutcome', with value of {max(mi.sort_values(ascending=False))}' rounded to 2"
)

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [28]:
train_dict = df_train[categorical + numerical].to_dict(orient="records")
train_dict[0]


[1m{[0m
    [32m'job'[0m: [32m'technician'[0m,
    [32m'marital'[0m: [32m'single'[0m,
    [32m'education'[0m: [32m'tertiary'[0m,
    [32m'housing'[0m: [32m'yes'[0m,
    [32m'contact'[0m: [32m'cellular'[0m,
    [32m'month'[0m: [32m'aug'[0m,
    [32m'poutcome'[0m: [32m'unknown'[0m,
    [32m'age'[0m: [1;36m32[0m,
    [32m'previous'[0m: [1;36m0[0m,
    [32m'campaign'[0m: [1;36m1[0m,
    [32m'pdays'[0m: [1;36m-1[0m,
    [32m'duration'[0m: [1;36m67[0m,
    [32m'day'[0m: [1;36m11[0m,
    [32m'balance'[0m: [1;36m1100[0m
[1m}[0m

In [29]:
dv = DictVectorizer(sparse=False)

In [30]:
X_train = dv.fit_transform(train_dict)
X_train.shape

[1m([0m[1;36m27126[0m, [1;36m47[0m[1m)[0m

In [31]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [32]:
val_dict = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dict)
X_val.shape

[1m([0m[1;36m9042[0m, [1;36m47[0m[1m)[0m

In [33]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred


[1;35marray[0m[1m([0m[1m[[0m[1;36m0.01294119[0m, [1;36m0.00972959[0m, [1;36m0.15344079[0m, [33m...[0m, [1;36m0.05244611[0m, [1;36m0.00909062[0m,
       [1;36m0.28165388[0m[1m][0m[1m)[0m

In [34]:
decision = (y_pred >= 0.5).astype(int)
decision

[1;35marray[0m[1m([0m[1m[[0m[1;36m0[0m, [1;36m0[0m, [1;36m0[0m, [33m...[0m, [1;36m0[0m, [1;36m0[0m, [1;36m0[0m[1m][0m[1m)[0m

In [35]:
accuracy = (y_val == decision).mean()
accuracy

[1;35mnp.float64[0m[1m([0m[1;36m0.9007962840079629[0m[1m)[0m

In [36]:
print(f"Accuracy is : {round(accuracy,2)}")

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `age`
- `balance`
- `marital`
- `previous`

> **Note**: The difference doesn't have to be positive.



In [37]:
df_train[categorical + numerical].copy()

Unnamed: 0,job,marital,education,housing,contact,month,poutcome,age,previous,campaign,pdays,duration,day,balance
0,technician,single,tertiary,yes,cellular,aug,unknown,32,0,1,-1,67,11,1100
1,entrepreneur,married,secondary,yes,cellular,nov,unknown,38,0,1,-1,258,17,0
2,blue-collar,married,secondary,yes,cellular,may,unknown,49,0,2,-1,349,15,3309
3,housemaid,married,primary,no,cellular,aug,unknown,37,0,1,-1,315,4,2410
4,self-employed,married,tertiary,no,cellular,aug,unknown,31,0,4,-1,74,26,3220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27121,services,single,secondary,no,cellular,jul,unknown,27,0,2,-1,606,8,167
27122,technician,single,tertiary,no,cellular,jan,unknown,40,0,1,-1,427,30,693
27123,technician,divorced,secondary,yes,unknown,may,unknown,54,0,1,-1,161,16,0
27124,services,single,secondary,no,cellular,jul,unknown,25,0,2,-1,1105,21,2311


In [38]:
accuracy_difference_list = []
columns = ["age", "balance", "marital", "previous"]
for col in columns:
    df_train_drop = df_train[categorical + numerical].copy()
    df_train_drop = df_train_drop.drop(col, axis=1)
    train_dict_drop = df_train_drop.to_dict(orient="records")
    dv = DictVectorizer(sparse=False)
    X_train_drop = dv.fit_transform(train_dict_drop)

    model = LogisticRegression(
        solver="liblinear",
        C=1.0,
        max_iter=1000,
        random_state=42,
    )
    model.fit(X_train_drop, y_train)

    df_val_drop = df_val.copy()
    df_val_drop = df_val_drop.drop(col, axis=1)
    val_dict_drop = df_val_drop.to_dict(orient="records")
    X_val_drop = dv.transform(val_dict_drop)

    y_pred_drop = model.predict_proba(X_val_drop)[:, 1]
    decision_drop = (y_pred_drop >= 0.5).astype(int)

    accuracy_drop = (decision_drop == y_val).mean()
    print(accuracy_drop)
    accuracy_difference_list.append(abs((accuracy) - (accuracy_drop)))

In [39]:
accuracy_difference_list


[1m[[0m
    [1;35mnp.float64[0m[1m([0m[1;36m0.00022119000221187957[0m[1m)[0m,
    [1;35mnp.float64[0m[1m([0m[1;36m0.00022119000221187957[0m[1m)[0m,
    [1;35mnp.float64[0m[1m([0m[1;36m0.00022119000221187957[0m[1m)[0m,
    [1;35mnp.float64[0m[1m([0m[1;36m0.0005529750055297544[0m[1m)[0m
[1m][0m

### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

> **Note**: If there are multiple options, select the smallest `C`.

## Submit the results

* Submit your results here: https://courses.datatalks.club/ml-zoomcamp-2024/homework/hw03
* If your answer doesn't match options exactly, select the closest one

In [40]:
C = [0.01, 0.1, 1, 10, 100]

dv = DictVectorizer(sparse=False)

df_train = df_train[categorical + numerical].copy()
train_dict_reg = df_train.to_dict(orient="records")
X_train_reg = dv.fit_transform(train_dict_reg)

df_val_reg = df_val[categorical + numerical].copy()
val_dicts_reg = df_val_reg.to_dict(orient="records")
X_val_reg = dv.transform(val_dicts_reg)

accuracy_reg_list = []
for c in C:
    model = LogisticRegression(
        solver="liblinear",
        C=c,
        max_iter=1000,
        random_state=42,
    )
    model.fit(X_train_reg, y_train)

    y_pred_reg = model.predict_proba(X_val_reg)[:, 1]
    decision = (y_pred_reg > 0.5).astype(int)

    accuracy = (decision == y_val).mean()
    accuracy_reg_list.append(round(accuracy, 3))

In [41]:
max(accuracy_reg_list)

[1;35mnp.float64[0m[1m([0m[1;36m0.901[0m[1m)[0m