In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder, RobustScaler

In [None]:
# Load the semicolon-delimited bank marketing file.
data_set = pd.read_csv("bank-additional.csv", sep=";")
# Recode the target: 1 where the raw value is "yes", 0 for anything else.
data_set["y"] = data_set["y"].eq("yes").astype(int)


In [None]:
# Calendar features derived from the contact month and weekday.
month_names = ["jan", "feb", "mar", "apr", "may", "jun",
               "jul", "aug", "sep", "oct", "nov", "dec"]
month = {name: number for number, name in enumerate(month_names, start=1)}

# Month abbreviation -> 1..12; values missing from the lookup become NaN.
data_set["month_num"] = data_set["month"].map(month)

# One 0/1 indicator column per season, keyed by month number.
season_months = {
    "winter": [12, 1, 2],
    "spring": [3, 4, 5],
    "summer": [6, 7, 8],
    "autumn": [9, 10, 11],
}
for season, months_in_season in season_months.items():
    data_set[season] = data_set["month_num"].isin(months_in_season).astype(int)

# 0/1 flag for contacts made on a Saturday or Sunday.
# NOTE(review): confirm "sat"/"sun" actually occur in day_of_week; if they do
# not, this column is constant 0 and carries no signal.
weekend_days = ["sat", "sun"]
data_set["weekend"] = data_set["day_of_week"].isin(weekend_days).astype(int)

data_set.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,cons.conf.idx,euribor3m,nr.employed,y,month_num,winter,spring,summer,autumn,weekend
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,-46.2,1.313,5099.1,0,5,0,1,0,0,0
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,-36.4,4.855,5191.0,0,5,0,1,0,0,0
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,-41.8,4.962,5228.1,0,6,0,0,1,0,0
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,-41.8,4.959,5228.1,0,6,0,0,1,0,0
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,-42.0,4.191,5195.8,0,11,0,0,0,1,0


In [None]:
# String-valued columns that get an integer-coded "<name>_encoded" companion.
categorical_features = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "day_of_week", "poutcome",
]

# A single encoder is refit on every column in turn, so once the loop finishes
# it only remembers the classes of the last column processed.
le = LabelEncoder()
for col in categorical_features:
    encoded_column = f'{col}_encoded'
    codes = le.fit_transform(data_set[col])
    data_set[encoded_column] = codes


In [None]:
# Discretise age, duration and campaign into five labelled bins each.
age_labels = ["very_young", "young", "middle", "senior", "elderly"]
duration_labels = ["very_short", "short", "medium", "long", "very_long"]
campaign_labels = ["very_few", "few", "medium", "many", "very_many"]
campaign_edges = [0, 2, 4, 6, 8, np.inf]

# Quantile-based bins (q=5) for age and duration.
data_set["age_bin"] = pd.qcut(data_set["age"], q=5, labels=age_labels)
data_set["duration_bin"] = pd.qcut(data_set["duration"], q=5, labels=duration_labels)

# Fixed edges for campaign; with pd.cut defaults the intervals are
# right-inclusive, so a value of exactly 0 falls outside every bin.
data_set["campaign_bin"] = pd.cut(
    data_set["campaign"],
    bins=campaign_edges,
    labels=campaign_labels,
)


In [None]:
# Numeric columns whose extreme values are clipped (winsorised) below.
numeric_features = ["age", "duration", "campaign", "pdays", "previous",
                    "emp.var.rate", "cons.price.idx", "cons.conf.idx",
                    "euribor3m", "nr.employed"]

# Snapshot of the values as they are before clipping overwrites the columns.
original_data = data_set[numeric_features].copy()

# Clip every column to [Q1 - 3*IQR, Q3 + 3*IQR].
for column in numeric_features:
    Q1 = data_set[column].quantile(0.25)
    Q3 = data_set[column].quantile(0.75)
    IQR = Q3 - Q1
    if not IQR > 0:
        # Q1 == Q3 (or undefined quartiles): both bounds would collapse onto
        # the same value and clip() would overwrite the whole column with that
        # constant, wiping out all variation. Leave such columns untouched.
        continue
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR
    data_set[column] = data_set[column].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Aggregate features built from contact counts and macro-economic indicators.
campaign_contacts = data_set["campaign"]
previous_contacts = data_set["previous"]

data_set["total_contacts"] = campaign_contacts + previous_contacts
# Adding 1 makes the denominator 1 (not 0) when `previous` is 0.
data_set["contact_rate"] = campaign_contacts / (previous_contacts + 1)

data_set["economic_sentiment"] = data_set["emp.var.rate"].mul(data_set["cons.conf.idx"])
data_set["market_conditions"] = data_set["cons.price.idx"].mul(data_set["euribor3m"])

In [None]:
# Pairwise interaction features: each new column is the element-wise product
# of the two existing columns it maps to.
interaction_pairs = {
    "age_by_job": ("age", "job_encoded"),
    "age_by_education": ("age", "education_encoded"),
    "duration_by_contact": ("duration", "contact_encoded"),
    "duration_by_month": ("duration", "month_num"),
    "campaign_by_previous": ("campaign", "previous"),
    "campaign_by_outcome": ("campaign", "poutcome_encoded"),
    "emp_rate_by_euribor": ("emp.var.rate", "euribor3m"),
    "price_by_confidence": ("cons.price.idx", "cons.conf.idx"),
}
for new_column, (left, right) in interaction_pairs.items():
    data_set[new_column] = data_set[left] * data_set[right]

In [None]:
# Encode the binned columns as integers that follow the bin order.
bin_columns = ["age_bin", "duration_bin", "campaign_bin"]
for col in bin_columns:
    # The bins are categoricals produced by pd.qcut / pd.cut, so .cat.codes
    # numbers them in their declared order (0 = first label) and uses -1 for
    # missing values. LabelEncoder would instead number the labels in sorted
    # string order ("elderly" < "middle" < "senior" < ...), scrambling the
    # ordinal meaning of the bins.
    data_set[f'{col}_encoded'] = data_set[col].cat.codes.astype("int64")

# The raw string columns and the labelled bins are dropped; only their
# integer-coded counterparts stay in the frame.
features_to_drop = categorical_features + bin_columns
data_set = data_set.drop(columns=features_to_drop)

In [None]:
# Separate the target column from the predictors.
y = data_set["y"]
X = data_set.drop(columns="y")

# Remove every feature that has an absolute correlation above 0.85 with some
# feature appearing earlier in the column order.
corr_matrix = X.corr().abs()
# Strict upper triangle only, so each pair is examined once and the diagonal
# (self-correlation) is ignored.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)
too_correlated = (upper > 0.85).any(axis=0)
to_drop = upper.columns[too_correlated].tolist()
X = X.drop(columns=to_drop)


In [None]:
# Feature selection with a random forest: keep the features whose importance
# is at or above the median importance.
# RandomForestClassifier (sklearn.ensemble) and SelectFromModel
# (sklearn.feature_selection) must be imported in the notebook's import cell.
# NOTE(review): the selector is fit on all rows of X; if a train/test split
# happens later, confirm this does not leak test information into selection.
sel_model = RandomForestClassifier(n_estimators=200, random_state=42)
sel_model.fit(X, y)

# prefit=True reuses the forest fitted above instead of refitting it.
selector = SelectFromModel(sel_model, prefit=True, threshold="median")

# Boolean mask over X's columns -> names of the retained features.
feature_idx = selector.get_support()
feature_names = X.columns[feature_idx]
X = X[feature_names]