In [13]:
import numpy as np
import pandas as pd
import zipfile
import os

In [72]:
# Download the dataset using wget (if not done already)
# !wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

# Extract the zip file
# with zipfile.ZipFile('bank marketing.zip', 'r') as zip_ref:
#     zip_ref.extractall('bank_marketing')

# Load the dataset
data = pd.read_csv('bank/bank.csv', sep=';')
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [92]:
#data preparation

# Select required features

features = ['age', 'job', 'marital', 'education', 'balance', 'housing',
            'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 
            'previous', 'poutcome', 'y']

data = data[features]

# Convert specific columns to float if needed
numeric_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')


# Check for missing values
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])


Series([], dtype: int64)


In [93]:
#Question 1: Mode of Education
mode_education = data['education'].mode()[0]
print("Mode of Education:", mode_education)


Mode of Education: secondary


In [97]:
# # Question 2: Correlation Matrix

correlation_matrix = data[numeric_columns].corr()

print("\nCorrelation Matrix:")
print(correlation_matrix)



Correlation Matrix:
               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.083820 -0.017853 -0.002367 -0.005148 -0.008894 -0.003511
balance   0.083820  1.000000 -0.008677 -0.015950 -0.009976  0.009437  0.026196
day      -0.017853 -0.008677  1.000000 -0.024629  0.160706 -0.094352 -0.059114
duration -0.002367 -0.015950 -0.024629  1.000000 -0.068382  0.010380  0.018080
campaign -0.005148 -0.009976  0.160706 -0.068382  1.000000 -0.093137 -0.067833
pdays    -0.008894  0.009437 -0.094352  0.010380 -0.093137  1.000000  0.577562
previous -0.003511  0.026196 -0.059114  0.018080 -0.067833  0.577562  1.000000


In [99]:
# Target Encoding

data['y'] = data['y'].map({'yes': 1, 'no': 0})

In [100]:
from sklearn.model_selection import train_test_split

# Split the data
train, temp = train_test_split(data, test_size=0.4, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

X_train = train.drop('y', axis=1)
y_train = train['y']
X_val = val.drop('y', axis=1)
y_val = val['y']


In [103]:
# Question 3: Mutual Information Score

from sklearn.feature_selection import mutual_info_classif

# Encode categorical variables using one-hot encoding
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Calculate mutual information scores
mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features='auto')
mi_scores = pd.Series(mi_scores, index=X_train_encoded.columns)
print(mi_scores.sort_values(ascending=False))
from sklearn.feature_selection import mutual_info_classif

# Encode categorical variables using one-hot encoding
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Calculate mutual information scores
mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features='auto')
mi_scores = pd.Series(mi_scores, index=X_train_encoded.columns)
print(mi_scores.sort_values(ascending=False))


duration               0.071777
pdays                  0.034171
month_may              0.025759
poutcome_success       0.021165
poutcome_unknown       0.017296
previous               0.012718
job_services           0.012428
month_dec              0.010727
education_tertiary     0.009954
balance                0.008538
month_jun              0.007763
contact_unknown        0.007419
month_oct              0.006946
job_entrepreneur       0.006945
job_management         0.006261
day                    0.005796
age                    0.005668
job_student            0.005233
month_feb              0.005223
contact_telephone      0.004842
job_blue-collar        0.004693
job_retired            0.004621
education_secondary    0.004186
job_housemaid          0.004133
month_nov              0.003115
housing_yes            0.002402
job_unknown            0.001982
job_unemployed         0.001955
month_jan              0.001580
campaign               0.000000
job_self-employed      0.000000
job_tech

In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Validate accuracy
X_val_encoded = pd.get_dummies(X_val, drop_first=True)
val_accuracy = accuracy_score(y_val, model.predict(X_val_encoded))
print("Validation Accuracy:", round(val_accuracy, 2))


Validation Accuracy: 0.9


In [105]:
#Question 5: Feature Elimination Technique
# Original accuracy
original_accuracy = val_accuracy

# Calculate accuracy for each feature removed
feature_differences = {}
for feature in X_train_encoded.columns:
    temp_features = X_train_encoded.drop(feature, axis=1)
    model.fit(temp_features, y_train)
    accuracy_without_feature = accuracy_score(y_val, model.predict(X_val_encoded.drop(feature, axis=1)))
    difference = original_accuracy - accuracy_without_feature
    feature_differences[feature] = difference

# Find the feature with the smallest difference
smallest_difference_feature = min(feature_differences, key=feature_differences.get)
print("Feature with smallest difference:", smallest_difference_feature)


Feature with smallest difference: housing_yes


In [107]:
#Regularized Logistic Regression
c_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for c in c_values:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)
    val_accuracy = accuracy_score(y_val, model.predict(X_val_encoded))
    accuracies[c] = round(val_accuracy, 3)

best_c = max(accuracies, key=accuracies.get)
print("Best C value:", best_c)


Best C value: 10
