In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
data = pd.read_csv("/workspaces/ML-ZoomCamp2024-Homework/3-Classification/bank-full.csv",delimiter=';')
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
data[['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome','y']]
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## Check missing values

In [None]:
missing_values = data.isnull().sum()
print(missing_values)

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [None]:
data.mode()['education']

0    secondary
Name: education, dtype: object

Answer 2

In [6]:
data_corr = data[['age','day','balance','campaign','pdays','previous']]
data_corr.corr()

Unnamed: 0,age,day,balance,campaign,pdays,previous
age,1.0,-0.00912,0.097783,0.00476,-0.023758,0.001288
day,-0.00912,1.0,0.004503,0.16249,-0.093044,-0.05171
balance,0.097783,0.004503,1.0,-0.014578,0.003435,0.016674
campaign,0.00476,0.16249,-0.014578,1.0,-0.088628,-0.032855
pdays,-0.023758,-0.093044,0.003435,-0.088628,1.0,0.45482
previous,0.001288,-0.05171,0.016674,-0.032855,0.45482,1.0


The two that have the biggest correlation

In [None]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(data_corr, 2))

Top Absolute Correlations
pdays  previous    0.45482
day    campaign    0.16249
dtype: float64


Target Encoding

In [8]:
data['y'] = data['y'].map({'yes': 1, 'no': 0})
data['y'].unique()

array([0, 1])

Split the data

In [9]:
X = data[['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome']]
y = data['y']

In [10]:
df_train, df_test, y_train, y_test = train_test_split(
  X,y , random_state=42,test_size=0.20, shuffle=True)

In [11]:
X_train, df_val,y_train_, y_val = train_test_split(df_train,y_train, test_size=0.20, random_state=42)


Answer 3

In [None]:
from sklearn.metrics import mutual_info_score
print("Contact and y: ",mutual_info_score(X_train.contact,y_train_))
print("education and y: ",mutual_info_score(X_train.education,y_train_))
print("Housing and y: ",mutual_info_score(X_train.housing,y_train_))
print("poutcome and y: ",mutual_info_score(X_train.poutcome,y_train_))

Contact and y:  0.013437033199463613
education and y:  0.002777096380458567
Housing and y:  0.01046512248172013
poutcome and y:  0.02938858721067336


Answer 4

In [None]:
from sklearn.feature_extraction import DictVectorizer
train_dict = df_train.to_dict(orient='records')
train_dict[0]


{'age': 41,
 'job': 'blue-collar',
 'marital': 'married',
 'education': 'primary',
 'balance': 849,
 'housing': 'yes',
 'contact': 'unknown',
 'day': 15,
 'month': 'may',
 'duration': 72,
 'campaign': 1,
 'pdays': -1,
 'previous': 0,
 'poutcome': 'unknown'}

In [None]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [15]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [16]:
X_train = dv.transform(train_dict)
print(X_train.shape)
print(y_train.shape)


(36168, 47)
(36168,)


Training Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [18]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [19]:
model.predict_proba(X_val)


array([[0.98728821, 0.01271179],
       [0.99007258, 0.00992742],
       [0.85847345, 0.14152655],
       ...,
       [0.36309087, 0.63690913],
       [0.92942299, 0.07057701],
       [0.15199758, 0.84800242]])

In [20]:
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
y_pred

array([0.01271179, 0.00992742, 0.14152655, ..., 0.63690913, 0.07057701,
       0.84800242])

In [22]:
y =  y_pred > 0.5

Accuracy 

In [23]:
original_accuracy =(y_val == y).mean()
print(f"{original_accuracy:.2f}")

0.90


Answer 5

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [24]:
model.intercept_[0]


np.float64(-0.8860872904411391)

In [25]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'age': np.float64(-0.002),
 'balance': np.float64(0.0),
 'campaign': np.float64(-0.088),
 'contact=cellular': np.float64(0.221),
 'contact=telephone': np.float64(0.135),
 'contact=unknown': np.float64(-1.242),
 'day': np.float64(0.004),
 'duration': np.float64(0.004),
 'education=primary': np.float64(-0.423),
 'education=secondary': np.float64(-0.238),
 'education=tertiary': np.float64(-0.071),
 'education=unknown': np.float64(-0.154),
 'housing=no': np.float64(-0.088),
 'housing=yes': np.float64(-0.798),
 'job=admin.': np.float64(0.029),
 'job=blue-collar': np.float64(-0.305),
 'job=entrepreneur': np.float64(-0.245),
 'job=housemaid': np.float64(-0.24),
 'job=management': np.float64(-0.108),
 'job=retired': np.float64(0.382),
 'job=self-employed': np.float64(-0.171),
 'job=services': np.float64(-0.192),
 'job=student': np.float64(0.297),
 'job=technician': np.float64(-0.178),
 'job=unemployed': np.float64(-0.076),
 'job=unknown': np.float64(-0.079),
 'marital=divorced': np.float64(-0

In [26]:
df_train

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
3344,41,blue-collar,married,primary,849,yes,unknown,15,may,72,1,-1,0,unknown
17965,49,technician,married,primary,1415,yes,cellular,30,jul,269,2,-1,0,unknown
18299,42,admin.,married,secondary,3842,no,cellular,31,jul,130,4,-1,0,unknown
10221,37,management,single,tertiary,-119,yes,unknown,11,jun,375,11,-1,0,unknown
32192,56,blue-collar,married,primary,3498,no,cellular,15,apr,264,2,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,44,housemaid,single,primary,1059,no,unknown,18,jun,2093,1,-1,0,unknown
44732,23,student,single,tertiary,508,no,cellular,8,sep,210,1,92,1,failure
38158,34,technician,divorced,tertiary,1317,yes,cellular,15,may,239,1,-1,0,unknown
860,33,retired,married,secondary,165,no,unknown,7,may,111,1,-1,0,unknown


In [None]:

train_dict_small = df_train[['age','marital','previous', 'balance']].to_dict(orient='records')
dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)
print(X_small_train.shape)

dv_small.get_feature_names_out()

(36168, 6)


array(['age', 'balance', 'marital=divorced', 'marital=married',
       'marital=single', 'previous'], dtype=object)

In [28]:
model_small =  LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_small.fit(X_small_train, y_train)

In [29]:
model_small.intercept_[0]

np.float64(-2.096767913566333)

In [30]:
dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(2)))

{'age': np.float64(0.02),
 'balance': np.float64(0.0),
 'marital=divorced': np.float64(-0.77),
 'marital=married': np.float64(-0.95),
 'marital=single': np.float64(-0.37),
 'previous': np.float64(0.11)}

In [31]:
val_dict_small = df_val[['previous', 'balance']].to_dict(orient='records')
X_small_val = dv_small.transform(val_dict_small)

In [32]:
y_pred_small = model_small.predict_proba(X_small_val)[:, 1]

In [33]:
y_pred_small

array([0.10937639, 0.11348465, 0.10993361, ..., 0.16216919, 0.12071967,
       0.18319379])

In [34]:
y_small =  y_pred_small > 0.5

In [35]:
small_accuracy = (y_val == y_small).mean()
print(f"{small_accuracy:.2f}")

0.88


In [36]:
accuracy_drops = {}
for i, feature_name in enumerate(dv_small.get_feature_names_out()):
    accuracy_drops[feature_name] = original_accuracy - small_accuracy
    print(f"Feature: {feature_name}, Accuracy without: {small_accuracy:.4f}, Drop: {accuracy_drops[feature_name]:.4f}")
print("\nAccuracy Drops for Each Feature:")
for feature, drop in accuracy_drops.items():
    print(f"{feature}: {drop:.4f}")



Feature: age, Accuracy without: 0.8806, Drop: 0.0206
Feature: balance, Accuracy without: 0.8806, Drop: 0.0206
Feature: marital=divorced, Accuracy without: 0.8806, Drop: 0.0206
Feature: marital=married, Accuracy without: 0.8806, Drop: 0.0206
Feature: marital=single, Accuracy without: 0.8806, Drop: 0.0206
Feature: previous, Accuracy without: 0.8806, Drop: 0.0206

Accuracy Drops for Each Feature:
age: 0.0206
balance: 0.0206
marital=divorced: 0.0206
marital=married: 0.0206
marital=single: 0.0206
previous: 0.0206


Now let's train a regularized logistic regression.
Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

In [39]:
from sklearn.metrics import accuracy_score

C_values = [0.01, 0.1, 1, 10, 100]
val_accuracies = {}

for C in C_values:
    model = LogisticRegression(C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_val_pred), 3)
    val_accuracies[C] = accuracy

    print(f"C = {C}, Validation Accuracy = {accuracy:.3f}")

print("\nValidation Accuracies for Different C Values:")
for C, acc in val_accuracies.items():
    print(f"C = {C}: {acc:.3f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 0.01, Validation Accuracy = 0.900


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 0.1, Validation Accuracy = 0.900


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 1, Validation Accuracy = 0.900


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


C = 10, Validation Accuracy = 0.901
C = 100, Validation Accuracy = 0.900

Validation Accuracies for Different C Values:
C = 0.01: 0.900
C = 0.1: 0.900
C = 1: 0.900
C = 10: 0.901
C = 100: 0.900


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
