In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [12]:
# download the dataset from Kaggle
!kaggle datasets download -d henriqueyamahata/bank-marketing -p ../Data

Dataset URL: https://www.kaggle.com/datasets/henriqueyamahata/bank-marketing
License(s): other
bank-marketing.zip: Skipping, found more recently modified local copy (use --force to force download)


In [13]:
# unzip the dataset
!unzip ../Data/bank-marketing.zip -d ../Data

Archive:  ../Data/bank-marketing.zip
replace ../Data/bank-additional-full.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ../Data/bank-additional-full.csv  
replace ../Data/bank-additional-names.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ../Data/bank-additional-names.txt  


In [78]:
bank_data = pd.read_csv('../Data/bank-additional-full.csv', sep=';')
bank_data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [79]:
# splitting the dataset
train_data, test_data = train_test_split(bank_data, test_size = 0.2)

In [80]:
# checking for null or '?' values in train data
print(train_data.isnull().sum())
print(train_data[train_data == '?'].count())

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


In [81]:
# checking for null or '?' values in test data
print(test_data.isnull().sum())
print(test_data[test_data == '?'].count())

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64


In [82]:
# encoding the categorical features
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

for column in categorical_columns:
    label_encoder = LabelEncoder()

    train_data.loc[:, column] = label_encoder.fit_transform(train_data[column])
    test_data.loc[:, column] = label_encoder.transform(test_data[column])

train_data.loc[:, 'y'] = train_data['y'].map({'no': 0, 'yes': 1})
test_data.loc[:, 'y'] = test_data['y'].map({'no': 0, 'yes': 1})

train_data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
6688,39,1,1,1,0,2,0,1,6,4,...,2,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0
4312,41,7,1,3,1,2,0,1,6,3,...,1,999,0,1,1.1,93.994,-36.4,4.856,5191.0,0
11256,36,1,2,2,0,0,2,1,4,2,...,1,999,0,1,1.4,94.465,-41.8,4.961,5228.1,0
1352,33,4,1,6,0,0,0,1,6,2,...,2,999,0,1,1.1,93.994,-36.4,4.855,5191.0,0
9091,47,2,1,7,1,0,0,1,4,0,...,2,999,0,1,1.4,94.465,-41.8,4.967,5228.1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40151,23,1,2,2,0,2,0,0,3,2,...,5,999,1,0,-1.7,94.215,-40.3,0.846,4991.6,1
33675,36,1,1,1,0,2,0,0,6,4,...,1,999,1,0,-1.8,92.893,-46.2,1.281,5099.1,0
13096,39,0,1,6,0,2,2,0,3,4,...,1,999,0,1,1.4,93.918,-42.7,4.962,5228.1,0
2529,58,5,1,5,1,1,1,1,6,3,...,1,999,0,1,1.1,93.994,-36.4,4.856,5191.0,0


In [83]:
# standardize features
std_scaler = StandardScaler()

y_train = train_data['y']
X_train = train_data.drop(columns = ['y'])

train_data_scaled = std_scaler.fit_transform(X_train)

train_data_scaled_df = pd.DataFrame(train_data_scaled, columns = X_train.columns)
train_data_scaled_df['y'] = y_train.values
train_data = train_data_scaled_df.copy()



y_test = test_data['y']
X_test = test_data.drop(columns = ['y'])

test_data_scaled = std_scaler.transform(X_test)

test_data_scaled_df = pd.DataFrame(test_data_scaled, columns = X_test.columns)
test_data_scaled_df['y'] = y_test.values
test_data = test_data_scaled_df.copy()

In [84]:
train_data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,-0.102916,-0.758230,-0.282105,-1.286632,-0.515462,0.941073,-0.452041,1.323641,0.764425,1.428849,...,-0.204897,0.193651,-0.351209,0.196736,0.647634,0.725469,0.884801,0.712293,0.330741,0
1,0.088861,0.910631,-0.282105,-0.349881,1.938588,0.941073,-0.452041,1.323641,0.764425,0.714392,...,-0.563459,0.193651,-0.351209,0.196736,0.647634,0.725469,0.884801,0.711716,0.330741,0
2,-0.390582,-0.758230,1.359314,-0.818257,-0.515462,-1.089067,2.314442,1.323641,-0.095183,-0.000065,...,-0.563459,0.193651,-0.351209,0.196736,0.838732,1.539940,-0.280460,0.772245,0.843743,0
3,-0.678247,0.076200,-0.282105,1.055245,-0.515462,-1.089067,-0.452041,1.323641,0.764425,-0.000065,...,-0.204897,0.193651,-0.351209,0.196736,0.647634,0.725469,0.884801,0.711140,0.330741,0
4,0.664191,-0.480087,-0.282105,1.523620,1.938588,-1.089067,-0.452041,1.323641,-0.095183,-1.428979,...,-0.204897,0.193651,-0.351209,0.196736,0.838732,1.539940,-0.280460,0.775704,0.843743,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32945,-1.637132,-0.758230,1.359314,-0.818257,-0.515462,0.941073,-0.452041,-0.755492,-0.524987,-0.000065,...,0.870789,0.193651,1.679027,-2.555900,-1.135953,1.107630,0.043224,-1.599916,-2.426472,1
32946,-0.390582,-0.758230,-0.282105,-1.286632,-0.515462,0.941073,-0.452041,-0.755492,0.764425,1.428849,...,-0.563459,0.193651,1.679027,-2.555900,-1.199653,-1.178424,-1.229931,-1.349153,-0.940011,0
32947,-0.102916,-1.036373,-0.282105,1.055245,-0.515462,0.941073,2.314442,-0.755492,-0.524987,1.428849,...,-0.563459,0.193651,-0.351209,0.196736,0.838732,0.594046,-0.474670,0.772822,0.843743,0
32948,1.718964,0.354344,-0.282105,0.586869,1.938588,-0.073997,0.931201,1.323641,0.764425,0.714392,...,-0.563459,0.193651,-0.351209,0.196736,0.647634,0.725469,0.884801,0.711716,0.330741,0


In [85]:
# splitting the dataset
X_train = train_data.drop(columns = ['y'])
X_test = test_data.drop(columns = ['y'])
y_train = train_data['y']
y_test = test_data['y']

In [86]:
# converting y_train and y_test to integers to ensure consistent numeric encoding
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [87]:
# training the model
base_estimator = DecisionTreeClassifier(max_depth = 3)
model = AdaBoostClassifier(
    estimator = base_estimator,
    n_estimators = 70,
    learning_rate = 1.0,
    random_state = 42
)
model.fit(X_train, y_train)



In [88]:
# predicting on the train and test sets
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [89]:
# model evaluation
score_1 = f1_score(y_train, y_train_pred)
print(f"f1 score on Training Data: {score_1}")

score_2 = f1_score(y_test, y_test_pred)
print(f"f1 score on Test Data: {score_2}")

f1 score on Training Data: 0.6552790457959063
f1 score on Test Data: 0.6042261564820103
