# Project

###  GOAL : Creating a model to predict whether a client will subscribe to a bank term deposit by using some features.

###  DataSet : Binary Classification with a Bank Dataset which is provided by Kaggle

# Imports

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd


# Recognizing the Data

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv")

In [3]:
train.head()


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [4]:
test.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,750000,32,blue-collar,married,secondary,no,1397,yes,no,unknown,21,may,224,1,-1,0,unknown
1,750001,44,management,married,tertiary,no,23,yes,no,cellular,3,apr,586,2,-1,0,unknown
2,750002,36,self-employed,married,primary,no,46,yes,yes,cellular,13,may,111,2,-1,0,unknown
3,750003,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,29,may,125,1,-1,0,unknown
4,750004,28,technician,single,secondary,no,1950,yes,no,cellular,22,jul,181,1,-1,0,unknown


## We need to get rid of [id] columns to get data which is proper to model usage.

In [5]:
train.drop(columns = ["id"],axis = 1,inplace = True)
test.drop(columns = ["id"],axis = 1,inplace = True)


# LabelEncoder

We should convert columns whose type is object to integers by using LabelEncoder

In [6]:
LE = LabelEncoder()
for column in train.columns:
    if train[column].dtype == "object":
        train[column] = LE.fit_transform(train[column].astype(str))

for column in test.columns:
    if test[column].dtype == "object":
        test[column] = LE.fit_transform(test[column].astype(str))



In [7]:
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,9,1,1,0,7,0,0,0,25,1,117,3,-1,0,3,0
1,38,1,1,1,0,514,0,0,2,18,6,185,1,-1,0,3,0
2,36,1,1,1,0,602,1,0,2,14,8,111,2,-1,0,3,0
3,27,8,2,1,0,34,1,0,2,28,8,10,2,-1,0,3,0
4,26,9,1,1,0,889,1,0,0,3,3,902,1,-1,0,3,1


In [8]:
test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,32,1,1,1,0,1397,1,0,2,21,8,224,1,-1,0,3
1,44,4,1,2,0,23,1,0,0,3,0,586,2,-1,0,3
2,36,6,1,0,0,46,1,1,0,13,8,111,2,-1,0,3
3,58,1,1,1,0,-1380,1,1,2,29,8,125,1,-1,0,3
4,28,9,2,1,0,1950,1,0,0,22,5,181,1,-1,0,3


# Standartization of Numerical Features

Basic Standartization of numerical features by using StandardScaler

In [9]:
SS = StandardScaler()
y = train["y"]
train  = train.drop(columns = "y")
columns = train.columns
trains= SS.fit_transform(train)
tests = SS.fit_transform(test)
train = pd.DataFrame(trains , columns = columns)
test =  pd.DataFrame(tests , columns  = columns)
train["y"]  = y.reset_index(drop = True) 
train.head()
test.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,-0.886004,-1.01982,-0.276221,-0.325829,-0.130035,0.072797,0.911617,-0.403761,1.46428,0.591383,0.849289,-0.115482,-0.580719,-0.302669,-0.219366,0.318283
1,0.304284,-0.099044,-0.276221,1.089637,-0.130035,-0.428386,0.911617,-0.403761,-0.717526,-1.588192,-1.793005,1.218324,-0.211668,-0.302669,-0.219366,0.318283
2,-0.489241,0.514807,-0.276221,-1.741295,-0.130035,-0.419996,0.911617,2.47671,-0.717526,-0.377317,0.849289,-0.531836,-0.211668,-0.302669,-0.219366,0.318283
3,1.692954,-1.01982,-0.276221,-0.325829,-0.130035,-0.940147,0.911617,2.47671,1.46428,1.560083,0.849289,-0.480252,-0.580719,-0.302669,-0.219366,0.318283
4,-1.282767,1.435583,1.45683,-0.325829,-0.130035,0.27451,0.911617,-0.403761,-0.717526,0.71247,-0.141571,-0.273918,-0.580719,-0.302669,-0.219366,0.318283


# Target

We need to seperate our dataset into target and features.

In [10]:
y = train["y"]
x = train.drop(columns = "y")


# Splitting

In [11]:
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size = 0.2,random_state = 8,stratify = y)


#  MODELS

We will use three different models to create our main model which is VotingClassifier which is basically use our 3 models as voter to our results.


# CatBoost

## Optuna Optimization for hyperparameters

In [12]:
from catboost import CatBoostClassifier
import optuna
def objective (trial):
    params = {
        'depth' : trial.suggest_int('depth',3,10),
        'learning_rate' : trial.suggest_float('learning_rate',0.005,0.3,log = True),
        'iterations' : trial.suggest_int('iterations',150,1200),
        'l2_leaf_reg' : trial.suggest_float('l2_leaf_reg',0.01,10.0,log= True),
        'bagging_temperature': trial.suggest_float('bagging_temperature',0.0,1.0),
        'random_strength' : trial.suggest_float('random_strength',1e-9,10.0,log =True),
        'border_count' : trial.suggest_int('border_count',32,255),
        'random_state' : 98,
        'verbose' : 150,
        'task_type' :  'GPU'
    }
    model  = CatBoostClassifier(**params)
    model.fit(X_train,Y_train)
    pre = model.predict(X_test)
    return roc_auc_score(Y_test,pre)
    
study = optuna.create_study(direction = 'maximize')
study.optimize(objective,n_trials= 10)

[I 2025-09-04 10:57:28,732] A new study created in memory with name: no-name-2f85b6aa-a22d-4c99-9cdc-7912dc120886


0:	learn: 0.5944167	total: 498ms	remaining: 7m 6s
150:	learn: 0.1630872	total: 1.94s	remaining: 9.09s
300:	learn: 0.1560433	total: 3.3s	remaining: 6.12s
450:	learn: 0.1528178	total: 4.6s	remaining: 4.16s
600:	learn: 0.1507101	total: 5.9s	remaining: 2.52s
750:	learn: 0.1490651	total: 7.19s	remaining: 1.02s
857:	learn: 0.1480694	total: 8.11s	remaining: 0us


[I 2025-09-04 10:57:38,240] Trial 0 finished with value: 0.8156773888792441 and parameters: {'depth': 6, 'learning_rate': 0.059499265937313285, 'iterations': 858, 'l2_leaf_reg': 0.06948542855156382, 'bagging_temperature': 0.5506129606132317, 'random_strength': 0.8768154824742606, 'border_count': 71}. Best is trial 0 with value: 0.8156773888792441.


0:	learn: 0.3222036	total: 27.4ms	remaining: 5.74s
150:	learn: 0.1250224	total: 3.65s	remaining: 1.43s
209:	learn: 0.1162529	total: 5.13s	remaining: 0us


[I 2025-09-04 10:57:44,281] Trial 1 finished with value: 0.8199923391387067 and parameters: {'depth': 10, 'learning_rate': 0.29959744511395037, 'iterations': 210, 'l2_leaf_reg': 0.1668937273858478, 'bagging_temperature': 0.1666761999424462, 'random_strength': 0.00020309471850526916, 'border_count': 53}. Best is trial 1 with value: 0.8199923391387067.


0:	learn: 0.5247504	total: 51.9ms	remaining: 31.4s
150:	learn: 0.1468672	total: 2.64s	remaining: 7.95s
300:	learn: 0.1382170	total: 5.24s	remaining: 5.29s
450:	learn: 0.1311938	total: 7.85s	remaining: 2.68s
600:	learn: 0.1252689	total: 10.6s	remaining: 70.8ms
604:	learn: 0.1250925	total: 10.7s	remaining: 0us


[I 2025-09-04 10:57:56,027] Trial 2 finished with value: 0.8250668351262015 and parameters: {'depth': 9, 'learning_rate': 0.10141987228316374, 'iterations': 605, 'l2_leaf_reg': 0.06773603819885508, 'bagging_temperature': 0.2854659076647994, 'random_strength': 0.00021366045902494204, 'border_count': 249}. Best is trial 2 with value: 0.8250668351262015.


0:	learn: 0.6247948	total: 13.1ms	remaining: 11.8s
150:	learn: 0.1648675	total: 1.64s	remaining: 8.2s
300:	learn: 0.1567292	total: 3.27s	remaining: 6.55s
450:	learn: 0.1533099	total: 4.89s	remaining: 4.92s
600:	learn: 0.1510521	total: 6.57s	remaining: 3.31s
750:	learn: 0.1492193	total: 8.26s	remaining: 1.68s
900:	learn: 0.1477595	total: 9.83s	remaining: 32.7ms
903:	learn: 0.1477364	total: 9.86s	remaining: 0us


[I 2025-09-04 10:58:06,753] Trial 3 finished with value: 0.815249421346867 and parameters: {'depth': 7, 'learning_rate': 0.03958543563363598, 'iterations': 904, 'l2_leaf_reg': 0.5111324162315959, 'bagging_temperature': 0.8129520199906884, 'random_strength': 0.22967539798376918, 'border_count': 74}. Best is trial 2 with value: 0.8250668351262015.


0:	learn: 0.6741335	total: 7.72ms	remaining: 1.95s
150:	learn: 0.2050859	total: 1.01s	remaining: 685ms
252:	learn: 0.1899851	total: 1.71s	remaining: 0us


[I 2025-09-04 10:58:09,257] Trial 4 finished with value: 0.7196080793993874 and parameters: {'depth': 4, 'learning_rate': 0.01186970092459778, 'iterations': 253, 'l2_leaf_reg': 0.16957307644549158, 'bagging_temperature': 0.2995781911328683, 'random_strength': 0.7680769009780578, 'border_count': 230}. Best is trial 2 with value: 0.8250668351262015.


0:	learn: 0.6661938	total: 9.27ms	remaining: 10.7s
150:	learn: 0.1913781	total: 1.14s	remaining: 7.56s
300:	learn: 0.1778379	total: 2.28s	remaining: 6.46s
450:	learn: 0.1706340	total: 3.45s	remaining: 5.39s
600:	learn: 0.1665220	total: 4.61s	remaining: 4.25s
750:	learn: 0.1635604	total: 5.76s	remaining: 3.1s
900:	learn: 0.1616725	total: 6.92s	remaining: 1.95s
1050:	learn: 0.1600988	total: 8.14s	remaining: 805ms
1154:	learn: 0.1591532	total: 9.01s	remaining: 0us


[I 2025-09-04 10:58:19,184] Trial 5 finished with value: 0.8008792699665165 and parameters: {'depth': 5, 'learning_rate': 0.016587878885466187, 'iterations': 1155, 'l2_leaf_reg': 2.654885758504306, 'bagging_temperature': 0.923154779810664, 'random_strength': 2.029245611565738e-05, 'border_count': 70}. Best is trial 2 with value: 0.8250668351262015.


0:	learn: 0.6627435	total: 8.59ms	remaining: 4.01s
150:	learn: 0.1890269	total: 1.08s	remaining: 2.26s
300:	learn: 0.1748056	total: 2.21s	remaining: 1.23s
450:	learn: 0.1681359	total: 3.4s	remaining: 128ms
467:	learn: 0.1675845	total: 3.54s	remaining: 0us


[I 2025-09-04 10:58:23,563] Trial 6 finished with value: 0.7848682453022353 and parameters: {'depth': 5, 'learning_rate': 0.018771634040082804, 'iterations': 468, 'l2_leaf_reg': 2.0923449873603452, 'bagging_temperature': 0.06780317278498305, 'random_strength': 0.05938041178888945, 'border_count': 222}. Best is trial 2 with value: 0.8250668351262015.


0:	learn: 0.6201980	total: 9.8ms	remaining: 4.76s
150:	learn: 0.1708564	total: 1.16s	remaining: 2.58s
300:	learn: 0.1615624	total: 2.31s	remaining: 1.42s
450:	learn: 0.1569700	total: 3.44s	remaining: 274ms
486:	learn: 0.1562935	total: 3.71s	remaining: 0us


[I 2025-09-04 10:58:28,100] Trial 7 finished with value: 0.8028868141570709 and parameters: {'depth': 5, 'learning_rate': 0.04655832305553515, 'iterations': 487, 'l2_leaf_reg': 0.013009003539584401, 'bagging_temperature': 0.17335245949692457, 'random_strength': 7.284444049767617e-07, 'border_count': 171}. Best is trial 2 with value: 0.8250668351262015.


0:	learn: 0.4074201	total: 9.56ms	remaining: 4.82s
150:	learn: 0.1527783	total: 1.14s	remaining: 2.68s
300:	learn: 0.1477955	total: 2.27s	remaining: 1.54s
450:	learn: 0.1445502	total: 3.42s	remaining: 409ms
504:	learn: 0.1435552	total: 3.82s	remaining: 0us


[I 2025-09-04 10:58:32,773] Trial 8 finished with value: 0.8204610940092909 and parameters: {'depth': 5, 'learning_rate': 0.23373305773948239, 'iterations': 505, 'l2_leaf_reg': 0.02100287333374101, 'bagging_temperature': 0.47844908571202727, 'random_strength': 2.6304924031610712e-05, 'border_count': 248}. Best is trial 2 with value: 0.8250668351262015.


0:	learn: 0.6582158	total: 18.5ms	remaining: 10.8s
150:	learn: 0.1724127	total: 2.64s	remaining: 7.61s
300:	learn: 0.1607516	total: 5.24s	remaining: 4.96s
450:	learn: 0.1552787	total: 7.8s	remaining: 2.34s
585:	learn: 0.1523350	total: 10.2s	remaining: 0us


[I 2025-09-04 10:58:43,914] Trial 9 finished with value: 0.8110544985310668 and parameters: {'depth': 9, 'learning_rate': 0.018675592073336668, 'iterations': 586, 'l2_leaf_reg': 0.7597788892388733, 'bagging_temperature': 0.0016027160584871636, 'random_strength': 6.272474951931787e-05, 'border_count': 40}. Best is trial 2 with value: 0.8250668351262015.


In [13]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(**study.best_params,verbose = 150,random_state = 98,task_type = 'GPU')
cat_model.fit(X_train,Y_train)

0:	learn: 0.5247504	total: 19.6ms	remaining: 11.8s
150:	learn: 0.1468671	total: 2.65s	remaining: 7.98s
300:	learn: 0.1382168	total: 5.25s	remaining: 5.3s
450:	learn: 0.1311935	total: 7.87s	remaining: 2.69s
600:	learn: 0.1253064	total: 10.5s	remaining: 70ms
604:	learn: 0.1251546	total: 10.6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f641d995250>

In [14]:
predictions3 = cat_model.predict_proba(X_test)
print("my auc score is",roc_auc_score(Y_test,predictions3[:,1]))


my auc score is 0.9663561385121617


# XGBoost Classifier

## Optuna Optimization for hyperparameters

In [15]:
import optuna
def objective (trial):
    params = {
        'max_depth' : trial.suggest_int('max_depth',3,10),
        'learning_rate' : trial.suggest_float('lr',0.005,0.01),
        'subsample' : trial.suggest_float('ss',0.5,1),
        'n_estimators' : trial.suggest_int('n',600,1200),
        'colsample_bytree': trial.suggest_float('cs',0.5,1),
        'gamma' : trial.suggest_float('g',0,0.5),
        'min_child_weight' : trial.suggest_int('m',1,10),
        'random_state' : 98
    }
    model  = XGBClassifier(**params,
                          device = 'cuda')
    model.fit(X_train,Y_train)
    pre = model.predict(X_test)
    return roc_auc_score(Y_test,pre)
    
study = optuna.create_study(direction = 'maximize')
study.optimize(objective,n_trials= 20)

[I 2025-09-04 10:58:55,636] A new study created in memory with name: no-name-3d4c7d5f-040f-4db6-a131-cd3572077a87
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2025-09-04 10:59:06,207] Trial 0 finished with value: 0.8073315419300028 and parameters: {'max_depth': 8, 'lr': 0.005665573084662816, 'ss': 0.7171667749642712, 'n': 919, 'cs': 0.7922225990592382, 'g': 0.13992424393120761, 'm': 1}. Best is trial 0 with value: 0.8073315419300028.
[I 2025-09-04 10:59:16,831] Trial 1 finished with value: 0.8162709882097187 and parameters: {'max_depth': 9, 'lr': 0.008415283928864365, 'ss': 0.5152700221442266, 'n': 877, 'cs': 0.6485455393622757, 'g': 0.11487654520177232, 'm': 9}. Best is trial 1 with value: 0.8162709882097187.
[I 2025-09-04 10:59:21,178] Trial 2 finished with value: 0.7449565894552571 and parameters: {'max_depth': 3, 'lr': 0.007243255279446513, 'ss': 0.646223828076935, 'n': 

In [16]:
xgb_model  = XGBClassifier(max_depth =  10
                           , learning_rate =  0.00960008360096706
                           , subsample =  0.7343265983322328
                           , n_estimators= 1036
                           , colsample_bytree =  0.5612644396085253
                           , gamma =  0.3047012122494639
                           , min_child_weight=  4
                           , random_state = 98
                           , device = 'cuda')
xgb_model.fit(X_train,Y_train,verbose = 1)



In [17]:
prediction1 = xgb_model.predict_proba(X_test)
print("My AUC score is",roc_auc_score(Y_test,prediction1[:,1]))


My AUC score is 0.9674605725902193


# LGBM

## Optuna Optimized LGBM too and the parameters below is taken by this optimization by selecting the best hyperparameters.

In [18]:
lgbm_model = LGBMClassifier(
    max_depth=  8, learning_rate= 0.00837639183587293, subsample=0.953557869970604, n_estimators= 847, colsample_bytree=0.808616862983552, gamma= 0.2395225512959197, min_child_weight= 8,
    verbose = -1,random_state = 7
)
lgbm_model.fit(X_train,Y_train)


In [19]:
prediction2 = lgbm_model.predict_proba(X_test)
print("My Auc Score is",roc_auc_score(Y_test,prediction2[:,1]))


My Auc Score is 0.9627338505963965


# Voting Classifier

Gathering all models to get one reasonable solution. 

In [20]:
voting = VotingClassifier(
    estimators = [('cat',cat_model),('xgb',xgb_model),("lgbm",lgbm_model)],
    voting = "soft"
)
voting.fit(X_train,Y_train)

0:	learn: 0.5247504	total: 32.9ms	remaining: 19.9s
150:	learn: 0.1468672	total: 2.91s	remaining: 8.76s
300:	learn: 0.1382169	total: 5.53s	remaining: 5.58s
450:	learn: 0.1311935	total: 8.17s	remaining: 2.79s
600:	learn: 0.1253064	total: 10.8s	remaining: 72ms
604:	learn: 0.1251546	total: 10.9s	remaining: 0us


# StratifiedKfold

By seperating data into bins,we used our model's 5 different versions which are trained with different datasets

In [21]:
from sklearn.model_selection import StratifiedKFold
fold = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 788)
probs = np.zeros(len(test))
for (train,val) in fold.split(x,y):
    X_train1,Y_train1 = x.iloc[train],y.iloc[train]
    X_test1,Y_test1 = x.iloc[val],y.iloc[val]

    voting.fit(X_train1,Y_train1)
    prob = voting.predict_proba(test)
    probs += (prob[:,1])/5
    

0:	learn: 0.5254933	total: 31ms	remaining: 18.7s
150:	learn: 0.1468437	total: 2.77s	remaining: 8.32s
300:	learn: 0.1381800	total: 5.42s	remaining: 5.48s
450:	learn: 0.1314365	total: 8.05s	remaining: 2.75s
600:	learn: 0.1257139	total: 10.7s	remaining: 71.1ms
604:	learn: 0.1255747	total: 10.8s	remaining: 0us
0:	learn: 0.5191955	total: 33.4ms	remaining: 20.2s
150:	learn: 0.1473436	total: 2.91s	remaining: 8.75s
300:	learn: 0.1384337	total: 5.64s	remaining: 5.7s
450:	learn: 0.1316758	total: 8.27s	remaining: 2.82s
600:	learn: 0.1255164	total: 10.9s	remaining: 72.6ms
604:	learn: 0.1253649	total: 11s	remaining: 0us
0:	learn: 0.5246147	total: 32.6ms	remaining: 19.7s
150:	learn: 0.1472923	total: 2.87s	remaining: 8.63s
300:	learn: 0.1386579	total: 5.58s	remaining: 5.63s
450:	learn: 0.1318556	total: 8.24s	remaining: 2.81s
600:	learn: 0.1259093	total: 10.9s	remaining: 72.7ms
604:	learn: 0.1257554	total: 11s	remaining: 0us
0:	learn: 0.5197977	total: 33.3ms	remaining: 20.1s
150:	learn: 0.1465446	tota

# Predictions

In [22]:
sample = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")
sample.drop(columns = "y")
sample["y"] = probs
sample.to_csv('predictions.csv',index = False)


In [23]:
print(sample)

            id         y
0       750000  0.009230
1       750001  0.417918
2       750002  0.001603
3       750003  0.000727
4       750004  0.060234
...        ...       ...
249995  999995  0.000783
249996  999996  0.132990
249997  999997  0.419185
249998  999998  0.002647
249999  999999  0.135780

[250000 rows x 2 columns]
