In [118]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e8/sample_submission.csv
/kaggle/input/playground-series-s5e8/train.csv
/kaggle/input/playground-series-s5e8/test.csv


In [119]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, accuracy_score,precision_score, recall_score, f1_score,confusion_matrix, classification_report)


# Loading and Inspection of Dataset

In [120]:
# Loading the train data
df = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')

In [121]:
df.sample(5)

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
260291,260291,31,blue-collar,married,secondary,no,0,no,yes,cellular,7,jul,117,1,-1,0,unknown,0
274402,274402,38,admin.,single,secondary,no,0,no,no,unknown,9,jun,95,1,-1,0,unknown,0
260471,260471,35,blue-collar,divorced,secondary,no,0,no,no,unknown,20,jun,16,18,-1,0,unknown,0
17965,17965,31,technician,married,tertiary,no,431,yes,no,cellular,21,nov,12,8,-1,0,unknown,0
386843,386843,41,management,married,tertiary,no,829,yes,yes,cellular,25,aug,109,8,-1,0,unknown,0


In [122]:
# Checking the dataset whether the dataset is balanced or not
df['y'].value_counts(normalize=True)*100

y
0    87.934933
1    12.065067
Name: proportion, dtype: float64

**Nearly 88% of samples are from class 0 and 12% of samples are from class 1 . Since the dataset is not severe imbalanced. So , we will not treat it as a imbalanced data.**

In [123]:
# checking missing values
df.isna().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

**NO Missing value found in the dataset .**

In [124]:
# Checking Duplicated Present or not
df.duplicated().sum()

np.int64(0)

**NO Duplicate Found**

In [125]:
# Numerical Features
num_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
print('Numerical Features :',num_cols)

# Categorical Columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print('\nCategorical Features :',cat_cols)

Numerical Features : ['id', 'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y']

Categorical Features : ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']


In [126]:
# Loading the test data
test_df =pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')

# Data Preprocessing

In [127]:
# Month columns mapping
month_map = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
    'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
    'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}
df['month'] = df['month'].map(month_map)

bin_cols = ['default' , 'housing' , 'loan']

df[bin_cols] = df[bin_cols].replace({'no':0 , 'yes':1})


  df[bin_cols] = df[bin_cols].replace({'no':0 , 'yes':1})


In [128]:
# Month columns mapping
test_df['month'] = test_df['month'].map(month_map)

# Binary columns mapping
test_df[bin_cols] = test_df[bin_cols].replace({'no': 0, 'yes': 1})


  test_df[bin_cols] = test_df[bin_cols].replace({'no': 0, 'yes': 1})


In [129]:
# Dropping the id columns from the test data
X_test_kaggle = test_df.drop(columns=['id'])


In [130]:
X = df.drop(columns=['id','y'])
y = df['y']

# Data Splitting

In [131]:
# Train test split
X_train ,X_test, y_train ,y_test = train_test_split(X , y , test_size = 0.2 , stratify = y , random_state = 42)

# Data Preprocessing and Modeling using sklearn pipeline

In [132]:
# Creating a pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

cat_cols = ['job', 'marital', 'education', 'contact', 'poutcome']
num_cols = ['age', 'balance', 'duration', 'month']

# Performing the OneHotEncoding on Categorical Features

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

In [133]:
# Applying the Xgboost Model
from xgboost import XGBClassifier

neg, pos = y_train.value_counts()
scale_pos_weight = neg / pos

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric='auc',
    random_state=42
)

In [134]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('prep' , preprocessor),
    ('model', xgb)
])

In [135]:
pipeline.fit(X_train , y_train)

# Result and Analysis

In [136]:
# Probabilities
y_train_proba = pipeline.predict_proba(X_train)[:, 1]
y_test_proba  = pipeline.predict_proba(X_test)[:, 1]

print(f"Train AUC: {roc_auc_score(y_train, y_train_proba):.5f}")
print(f"Test  AUC: {roc_auc_score(y_test, y_test_proba):.5f}")

Train AUC: 0.96065
Test  AUC: 0.95835


In [137]:
from sklearn.metrics import precision_recall_curve
prec, rec, thr = precision_recall_curve(y_test, y_test_proba)
f1 = 2 * prec * rec / (prec + rec + 1e-9)
best_thr = thr[np.argmax(f1)]
best_thr

np.float32(0.7827006)

In [138]:
# Class predictions 
y_train_pred_best = (y_train_proba_best >= best_thr).astype(int)
y_test_pred_best  = (y_test_proba_best >= best_thr).astype(int)

In [139]:
def print_metrics(y_true, y_pred, y_proba, name):
    print(f"\n===== {name} METRICS =====")
    print(f"AUC       : {roc_auc_score(y_true, y_proba):.5f}")
    print(f"Accuracy  : {accuracy_score(y_true, y_pred):.5f}")
    print(f"Precision : {precision_score(y_true, y_pred):.5f}")
    print(f"Recall    : {recall_score(y_true, y_pred):.5f}")
    print(f"F1-score  : {f1_score(y_true, y_pred):.5f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))


In [140]:
print_metrics(y_train, y_train_pred, y_train_proba, "TRAIN")
print_metrics(y_test, y_test_pred, y_test_proba, "TEST")



===== TRAIN METRICS =====
AUC       : 0.96065
Accuracy  : 0.85278
Precision : 0.44731
Recall    : 0.93483
F1-score  : 0.60509

Confusion Matrix:
[[443997  83613]
 [  4718  67672]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.84      0.91    527610
           1       0.45      0.93      0.61     72390

    accuracy                           0.85    600000
   macro avg       0.72      0.89      0.76    600000
weighted avg       0.92      0.85      0.87    600000


===== TEST METRICS =====
AUC       : 0.95835
Accuracy  : 0.85197
Precision : 0.44559
Recall    : 0.92927
F1-score  : 0.60235

Confusion Matrix:
[[110977  20925]
 [  1280  16818]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.84      0.91    131902
           1       0.45      0.93      0.60     18098

    accuracy                           0.85    150000
   macro avg       0.72      0.89      0.76   

In [154]:
test_pred = pipeline.predict_proba(X_test_kaggle)[:, 1]


In [155]:
submission = pd.DataFrame({
    'id': test_df['id'],
    'y': test_pred
})


In [156]:
# Saving csv
submission.to_csv('submission.csv', index=False)


[CV] END model__colsample_bytree=0.8, model__gamma=0, model__max_depth=5, model__min_child_weight=5, model__reg_alpha=1, model__reg_lambda=2, model__subsample=1.0; total time=  30.2s
[CV] END model__colsample_bytree=0.8, model__gamma=0.2, model__max_depth=4, model__min_child_weight=10, model__reg_alpha=1, model__reg_lambda=3, model__subsample=0.8; total time=  27.2s
[CV] END model__colsample_bytree=0.8, model__gamma=0.2, model__max_depth=4, model__min_child_weight=10, model__reg_alpha=1, model__reg_lambda=3, model__subsample=0.8; total time=  27.2s
[CV] END model__colsample_bytree=0.8, model__gamma=0.1, model__max_depth=5, model__min_child_weight=10, model__reg_alpha=1, model__reg_lambda=2, model__subsample=1.0; total time=  30.0s
[CV] END model__colsample_bytree=0.8, model__gamma=0.1, model__max_depth=5, model__min_child_weight=5, model__reg_alpha=0.5, model__reg_lambda=3, model__subsample=0.6; total time=  31.1s
[CV] END model__colsample_bytree=1.0, model__gamma=0, model__max_depth=6

# Hyperparamter Tuning

In [142]:
# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV , StratifiedKFold

#Defining Stratified k-fold
skf = StratifiedKFold(
    n_splits = 5,
    shuffle = True,
    random_state= 42
)

# Defining Param grid
param_grid = {
    'model__max_depth': [4, 5, 6],
    'model__min_child_weight': [1, 5, 10],
    'model__gamma': [0, 0.1, 0.2],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__reg_alpha': [0, 0.5, 1],
    'model__reg_lambda': [1, 2, 3]
}


# Randomised Search
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=20,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    refit=True,
    verbose=2,
    random_state=42
)

search.fit(X_train, y_train)




Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [143]:
search.best_score_

np.float64(0.9580208303215463)

In [144]:
search.best_params_

{'model__subsample': 0.6,
 'model__reg_lambda': 1,
 'model__reg_alpha': 0,
 'model__min_child_weight': 10,
 'model__max_depth': 6,
 'model__gamma': 0.1,
 'model__colsample_bytree': 0.6}

In [145]:
 search.best_estimator_

In [146]:
best_pipeline = search.best_estimator_
best_pipeline.fit(X_train, y_train)


In [147]:
# Probabilities
y_train_proba_best = best_pipeline.predict_proba(X_train)[:, 1]
y_test_proba_best  = best_pipeline.predict_proba(X_test)[:, 1]

print(f"Train AUC: {roc_auc_score(y_train, y_train_proba_best):.5f}")
print(f"Test  AUC: {roc_auc_score(y_test, y_test_proba_best):.5f}")

Train AUC: 0.95997
Test  AUC: 0.95828


In [148]:
from sklearn.metrics import precision_recall_curve
prec, rec, thr = precision_recall_curve(y_test, y_test_proba_best)
f1 = 2 * prec * rec / (prec + rec + 1e-9)
tune_best_thr = thr[np.argmax(f1)]
tune_best_thr

np.float32(0.79023457)

In [149]:
# Class predictions 
y_train_pred_best = (y_train_proba_best >= tune_best_thr).astype(int)
y_test_pred_best  = (y_test_proba_best >= tune_best_thr).astype(int)

In [150]:
print_metrics(y_train, y_train_pred_best, y_train_proba_best, "TRAIN")
print_metrics(y_test, y_test_pred_best, y_test_proba_best, "TEST")



===== TRAIN METRICS =====
AUC       : 0.95997
Accuracy  : 0.92231
Precision : 0.65460
Recall    : 0.75385
F1-score  : 0.70073

Confusion Matrix:
[[498816  28794]
 [ 17819  54571]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.95      0.96    527610
           1       0.65      0.75      0.70     72390

    accuracy                           0.92    600000
   macro avg       0.81      0.85      0.83    600000
weighted avg       0.93      0.92      0.92    600000


===== TEST METRICS =====
AUC       : 0.95828
Accuracy  : 0.92133
Precision : 0.65186
Recall    : 0.74688
F1-score  : 0.69614

Confusion Matrix:
[[124683   7219]
 [  4581  13517]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95    131902
           1       0.65      0.75      0.70     18098

    accuracy                           0.92    150000
   macro avg       0.81      0.85      0.83   

In [151]:
test_pred_best = best_pipeline.predict_proba(X_test_kaggle)[:, 1]

In [152]:
tuned_submission = pd.DataFrame({
    'id': test_df['id'],
    'y': test_pred_best
})

In [153]:
tuned_submission.to_csv('tune_submission.csv', index=False)

**The tuned XGBoost model is the most appropriate choice because it provides the best balance between precision, recall, and generalization while maintaining a strong AUC.**