In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw table from ../data/bank.csv. The original file separates fields
# with semicolons, so the delimiter has to be given explicitly.
df = pd.read_csv(
    '../data/bank.csv',
    delimiter=';',
)
# Preview the first five rows.
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [13]:
# Columns holding 'yes' / 'no' answers that are converted to 1 / 0 flags.
binary_cols = ['default', 'housing', 'loan', 'y']
# 'yes' becomes 1 and every other value becomes 0. A vectorized comparison is used
# instead of an element-wise lambda, so no per-cell Python call is made and the
# cell does not depend on DataFrame.map (only available from pandas 2.1).
# NOTE: this cell is not idempotent. Once the columns hold 1/0, nothing equals
# 'yes' any more, so running it a second time would set every value to 0.
df[binary_cols] = df[binary_cols].eq('yes').astype('int64')
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0,2143,1,0,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,0,29,1,0,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,0,2,1,1,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,0,1506,1,0,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,0,1,0,0,unknown,5,may,198,1,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,0,825,0,0,cellular,17,nov,977,3,-1,0,unknown,1
45207,71,retired,divorced,primary,0,1729,0,0,cellular,17,nov,456,2,-1,0,unknown,1
45208,72,retired,married,secondary,0,5715,0,0,cellular,17,nov,1127,5,184,3,success,1
45209,57,blue-collar,married,secondary,0,668,0,0,telephone,17,nov,508,4,-1,0,unknown,0


In [14]:
# String-valued columns that get expanded into indicator columns.
categorical_cols = [
    'job',
    'marital',
    'education',
    'contact',
    'month',
    'poutcome',
]
# drop_first=True leaves out the first level of each column, so a column with
# k distinct values contributes k-1 indicator columns.
df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)
df

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,58,0,2143,1,0,5,261,1,-1,0,...,False,False,False,True,False,False,False,False,False,True
1,44,0,29,1,0,5,151,1,-1,0,...,False,False,False,True,False,False,False,False,False,True
2,33,0,2,1,1,5,76,1,-1,0,...,False,False,False,True,False,False,False,False,False,True
3,47,0,1506,1,0,5,92,1,-1,0,...,False,False,False,True,False,False,False,False,False,True
4,33,0,1,0,0,5,198,1,-1,0,...,False,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,825,0,0,17,977,3,-1,0,...,False,False,False,False,True,False,False,False,False,True
45207,71,0,1729,0,0,17,456,2,-1,0,...,False,False,False,False,True,False,False,False,False,True
45208,72,0,5715,0,0,17,1127,5,184,3,...,False,False,False,False,True,False,False,False,True,False
45209,57,0,668,0,0,17,508,4,-1,0,...,False,False,False,False,True,False,False,False,False,True


In [15]:
# previous_contact is 0 where pdays == -1 and 1 for any other value.
# NOTE(review): -1 looks like the sentinel for "no earlier contact" — confirm
# against the dataset description.
# Computed as a whole-column comparison instead of a per-element lambda.
df['previous_contact'] = df['pdays'].ne(-1).astype('int64')
# Remove the raw 'duration' and 'pdays' columns. The result is reassigned rather
# than dropped in place, so the frame is never mutated behind an earlier display.
# NOTE: re-running this cell after the drop raises KeyError on 'pdays'.
df = df.drop(columns=['duration', 'pdays'])
df

Unnamed: 0,age,default,balance,housing,loan,day,campaign,previous,y,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,previous_contact
0,58,0,2143,1,0,5,1,0,0,False,...,False,False,True,False,False,False,False,False,True,0
1,44,0,29,1,0,5,1,0,0,False,...,False,False,True,False,False,False,False,False,True,0
2,33,0,2,1,1,5,1,0,0,False,...,False,False,True,False,False,False,False,False,True,0
3,47,0,1506,1,0,5,1,0,0,True,...,False,False,True,False,False,False,False,False,True,0
4,33,0,1,0,0,5,1,0,0,False,...,False,False,True,False,False,False,False,False,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,825,0,0,17,3,0,1,False,...,False,False,False,True,False,False,False,False,True,0
45207,71,0,1729,0,0,17,2,0,1,False,...,False,False,False,True,False,False,False,False,True,0
45208,72,0,5715,0,0,17,5,3,1,False,...,False,False,False,True,False,False,False,True,False,1
45209,57,0,668,0,0,17,4,0,0,True,...,False,False,False,True,False,False,False,False,True,0


In [16]:
# Bucket 'age' into labelled bands, then one-hot encode the band.
#
# pd.cut builds right-closed, left-open intervals by default, so without
# include_lowest=True the first band would be (18, 25] and age == 18 would be left
# unbinned (NaN) even though the label reads '18-25'. include_lowest=True closes
# the first band on the left so 18 is assigned to '18-25'.
#
# Ages below 18 or above 100 still fall outside every band. Because
# drop_first=True removes the '18-25' indicator, such rows end up with all
# age_group_* columns False, i.e. indistinguishable from the '18-25' baseline.
df['age_group'] = pd.cut(
    df['age'],
    bins=[18, 25, 35, 45, 55, 65, 100],
    labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'],
    include_lowest=True,
)
df = pd.get_dummies(df, columns=['age_group'], drop_first=True)
df

Unnamed: 0,age,default,balance,housing,loan,day,campaign,previous,y,job_blue-collar,...,month_sep,poutcome_other,poutcome_success,poutcome_unknown,previous_contact,age_group_26-35,age_group_36-45,age_group_46-55,age_group_56-65,age_group_65+
0,58,0,2143,1,0,5,1,0,0,False,...,False,False,False,True,0,False,False,False,True,False
1,44,0,29,1,0,5,1,0,0,False,...,False,False,False,True,0,False,True,False,False,False
2,33,0,2,1,1,5,1,0,0,False,...,False,False,False,True,0,True,False,False,False,False
3,47,0,1506,1,0,5,1,0,0,True,...,False,False,False,True,0,False,False,True,False,False
4,33,0,1,0,0,5,1,0,0,False,...,False,False,False,True,0,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,825,0,0,17,3,0,1,False,...,False,False,False,True,0,False,False,True,False,False
45207,71,0,1729,0,0,17,2,0,1,False,...,False,False,False,True,0,False,False,False,False,True
45208,72,0,5715,0,0,17,5,3,1,False,...,False,False,True,False,1,False,False,False,False,True
45209,57,0,668,0,0,17,4,0,0,True,...,False,False,False,True,0,False,False,False,True,False


In [17]:
# wealthy is 1 when balance is above 1500 and both the 'loan' and 'housing' flags
# are 0; otherwise it is 0.
# The three conditions are evaluated as whole-column comparisons and combined with
# &, which avoids building a Python-level row object for every record the way
# df.apply(..., axis=1) does, while producing the same 0/1 integer column.
df['wealthy'] = (
    df['balance'].gt(1500)
    & df['loan'].eq(0)
    & df['housing'].eq(0)
).astype('int64')
df

Unnamed: 0,age,default,balance,housing,loan,day,campaign,previous,y,job_blue-collar,...,poutcome_other,poutcome_success,poutcome_unknown,previous_contact,age_group_26-35,age_group_36-45,age_group_46-55,age_group_56-65,age_group_65+,wealthy
0,58,0,2143,1,0,5,1,0,0,False,...,False,False,True,0,False,False,False,True,False,0
1,44,0,29,1,0,5,1,0,0,False,...,False,False,True,0,False,True,False,False,False,0
2,33,0,2,1,1,5,1,0,0,False,...,False,False,True,0,True,False,False,False,False,0
3,47,0,1506,1,0,5,1,0,0,True,...,False,False,True,0,False,False,True,False,False,0
4,33,0,1,0,0,5,1,0,0,False,...,False,False,True,0,True,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,0,825,0,0,17,3,0,1,False,...,False,False,True,0,False,False,True,False,False,0
45207,71,0,1729,0,0,17,2,0,1,False,...,False,False,True,0,False,False,False,False,True,1
45208,72,0,5715,0,0,17,5,3,1,False,...,False,True,False,1,False,False,False,False,True,1
45209,57,0,668,0,0,17,4,0,0,True,...,False,False,True,0,False,False,False,True,False,0


In [18]:
from sklearn.model_selection import train_test_split

# Target is the 'y' column; the predictors are every remaining column.
y = df['y']
X = df.drop(columns=['y'])

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of the target the same in both parts, and random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [19]:
# Write each split to its own CSV under ../data, leaving out the row index.
for split, destination in (
    (X_train, '../data/X_train.csv'),
    (X_test, '../data/X_test.csv'),
    (y_train, '../data/y_train.csv'),
    (y_test, '../data/y_test.csv'),
):
    split.to_csv(destination, index=False)