In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

In [2]:
data_set=pd.read_csv("bank-additional.csv",sep=";")
data_set["y"]=(data_set["y"]=="yes").astype(int)


In [3]:
#Create Bin features
month={"jan":1, "feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,
      "aug":8,"sep":9,"oct":10,"nov":11,"dec":12}

data_set["month_num"]=data_set["month"].map(month)

#Season features
data_set["winter"] = data_set["month_num"].isin([12,1,2]).astype(int)
data_set["spring"] = data_set["month_num"].isin([3,4,5]).astype(int)
data_set["summer"] = data_set["month_num"].isin([6,7,8]).astype(int)
data_set["autumn"] = data_set["month_num"].isin([9,10,11]).astype(int)

#day features
data_set["weekend"] = data_set["day_of_week"].isin(["sat","sun"]).astype(int)



Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,cons.conf.idx,euribor3m,nr.employed,y,month_num,winter,spring,summer,autumn,weekend
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,-46.2,1.313,5099.1,0,5,0,1,0,0,0
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,-36.4,4.855,5191.0,0,5,0,1,0,0,0
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,-41.8,4.962,5228.1,0,6,0,0,1,0,0
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,-41.8,4.959,5228.1,0,6,0,0,1,0,0
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,-42.0,4.191,5195.8,0,11,0,0,0,1,0


In [4]:
#Categorical features
categorical_features=["job","marital","education","default","housing",
                      "loan","contact","month","day_of_week","poutcome"]

#encoding
le=LabelEncoder()
for col in categorical_features:
  data_set[f'{col}_encoded']=le.fit_transform(data_set[col])


In [5]:
#Binned features
data_set["age_bin"]=pd.qcut(data_set["age"],q=5, labels=["very_young","young","middle","senior","elderly"])
data_set["duration_bin"] = pd.qcut(data_set["duration"],q=5, labels=["very_short", "short","medium","long","very_long"])
data_set["campaign_bin"] = pd.cut(data_set["campaign"],bins=[0,2,4,6,8, np.inf],
                                  labels=["very_few","few","medium","many","very_many"])


In [6]:
#Numeric columns
numeric_features=["age","duration","campaign","pdays","previous",
                  "emp.var.rate", "cons.price.idx","cons.conf.idx",
                  "euribor3m","nr.employed"]
original_data = data_set[numeric_features].copy()
for column in numeric_features:
  Q1=data_set[column].quantile(0.25)
  Q3=data_set[column].quantile(0.75)
  IQR=Q3-Q1
  lower_bound= Q1-3*IQR
  upper_bound= Q3+3*IQR
  data_set[column] = data_set[column].clip(lower=lower_bound, upper=upper_bound)

In [7]:
#aggregate features
data_set["total_contacts"]=data_set["campaign"] + data_set["previous"]
data_set["contact_rate"]  =data_set["campaign"]/(data_set["previous"]+1)
data_set["economic_sentiment"]= data_set["emp.var.rate"] *data_set["cons.conf.idx"]
data_set["market_conditions"]=data_set["cons.price.idx"]*data_set["euribor3m"]

In [8]:
#interaction features
data_set["age_by_job"] = data_set["age"] * data_set["job_encoded"]
data_set["age_by_education"] = data_set["age"] * data_set["education_encoded"]
data_set["duration_by_contact"] = data_set["duration"] * data_set["contact_encoded"]
data_set["duration_by_month"] = data_set["duration"] * data_set["month_num"]
data_set["campaign_by_previous"] = data_set["campaign"] * data_set["previous"]
data_set["campaign_by_outcome"] = data_set["campaign"] * data_set["poutcome_encoded"]
data_set["emp_rate_by_euribor"] = data_set["emp.var.rate"] * data_set["euribor3m"]
data_set["price_by_confidence"] = data_set["cons.price.idx"] * data_set["cons.conf.idx"]

In [9]:
#Encode bin columns
for col in ["age_bin","duration_bin","campaign_bin"]:
  data_set[f'{col}_encoded']= le.fit_transform(data_set[col])

features_to_drop= categorical_features + ["age_bin","duration_bin","campaign_bin"]
data_set=data_set.drop(features_to_drop,axis=1)

In [10]:
#features and target
X= data_set.drop("y",axis=1)
y= data_set["y"]

#Remove correlated features
corr_matrix= X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(bool))
to_drop=[column for column in upper.columns if any(upper[column]>0.85)]
X=X.drop(to_drop, axis=1)


In [12]:
#feature selection using random forest
sel_model = RandomForestClassifier(n_estimators=200, random_state=42)
sel_model.fit(X,y)
selector= SelectFromModel(sel_model, prefit=True, threshold="median")
feature_idx=selector.get_support()
feature_names=X.columns[feature_idx]
X= X[feature_names]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)

scaler=RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)


In [16]:
#Training the model
param_grid={
    "n_estimators":[300],
    "max_depth" :[10,15,20,25,None],
    "min_samples_split": [2,5,10],
    "min_samples_leaf":[1,2,4],
    "max_features":["sqrt","log2"],
    "class_weight":["balanced","balanced_subsample"],
    "criterion" : ["gini","entropy"],
    "bootstrap": [True],
    "max_samples":[0.7,0.8,0.9]
}

base_rf= RandomForestClassifier(random_state=42, n_jobs=-1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_random=RandomizedSearchCV(
    estimator=base_rf,
    param_distributions=param_grid,
    n_iter=100,
    cv=cv,
    scoring="f1",
    random_state=42,
    n_jobs=-1,
    verbose=1
)

rf_random.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [17]:
# Make predictions and evaluate
best_rf = rf_random.best_estimator_
y_pred = best_rf.predict(X_test_scaled)
y_pred_proba = best_rf.predict_proba(X_test_scaled)[:, 1]

# Print results
print("\nBest Parameters:")
print(rf_random.best_params_)

print("\nFeature Importances:")
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)
print(importances.head(10))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC AUC Score:", roc_auc_score(y_test, y_pred_proba))


Best Parameters:
{'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_samples': 0.8, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'entropy', 'class_weight': 'balanced', 'bootstrap': True}

Feature Importances:
                 feature  importance
1               duration    0.291922
3           emp.var.rate    0.118869
14  duration_bin_encoded    0.100467
5          cons.conf.idx    0.071730
12   duration_by_contact    0.062863
4         cons.price.idx    0.054641
0                    age    0.054418
11      poutcome_encoded    0.043881
7            job_encoded    0.034478
6              month_num    0.033158

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       734
           1       0.60      0.70      0.65        90

    accuracy                           0.92       824
   macro avg       0.78      0.82      0.80       824
weighted avg       0.92      0.92      0.92       824



In [18]:
train_accuracy = best_rf.score(X_train_scaled, y_train)
print(f"Training Accuracy: {train_accuracy:.5f}")

# Check testing accuracy
test_accuracy = best_rf.score(X_test_scaled, y_test)
print(f"Testing Accuracy: {test_accuracy:.5f}")

Training Accuracy: 0.95205
Testing Accuracy: 0.91626
