In [79]:
## Importing libraries
import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, make_scorer

from xgboost import XGBClassifier

In [80]:
## Reading the data
## Reading the data
file_path = '../Data/train(1).csv'
df = pd.read_csv(file_path)

## Converting the predictor variable to a binary values
df['default_oct'] = df['default_oct'].apply(lambda x: 1 if x == 'yes' else 0)

X = df.drop('default_oct', axis=1)
y = df['default_oct']

In [81]:
X.head()

Unnamed: 0,customer_id,limit_bal,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6
0,1,1500,2.0,1.0,2.0,23.0,0.0,0.0,0.0,2.0,2.0,0.0,1452,1503,1482,1463,938.0,698.0,75,150,86,0,50.0,50.0
1,2,8500,2.0,2.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,8079,8175,8300,8364,8275.0,8425.0,300,400,315,300,325.0,305.0
2,3,1000,1.0,1.0,2.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,733,831,896,933,772.0,794.0,150,150,150,24,105.0,110.0
3,4,10500,1.0,1.0,1.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,7049,7011,7077,7190,7229.0,7340.0,255,260,258,260,265.0,307.0
4,5,10500,2.0,2.0,1.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,4487,4501,3533,3558,3592.0,3496.0,180,155,145,130,135.0,200.0


In [82]:
y.head()

0    1
1    0
2    0
3    0
4    0
Name: default_oct, dtype: int64

In [83]:
## Splitting the data into training, validation and testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify= y, random_state=7)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.15, stratify= y_train_val, random_state=7)

In [84]:
## Handling missing values using KNN Imputer
imputer = KNNImputer(n_neighbors=5)

## Applying imputer
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

X_train_imputed.head()

Unnamed: 0,customer_id,limit_bal,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6
0,23265.0,10500.0,1.0,2.0,1.0,44.0,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10569.0,7500.0,2.0,1.0,2.0,29.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1150.0,10500.0,1.0,1.0,2.0,28.0,1.0,-2.0,-2.0,-2.0,-2.0,-1.0,0.0,0.0,0.0,0.0,0.0,413.0,0.0,0.0,0.0,0.0,413.0,0.0
3,6368.0,14500.0,1.0,2.0,1.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,17594.0,12708.0,12509.0,11801.0,11072.0,9901.0,550.0,430.0,400.0,426.0,423.0,500.0
4,21190.0,5000.0,1.0,2.0,2.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,1067.0,1314.0,1169.0,1268.0,1315.0,2124.0,318.0,219.0,268.0,217.0,1000.0,219.0


In [85]:
# X_train_imputed.isnull().sum()

In [86]:
X_train_imputed = X_train_imputed.drop('customer_id', axis=1)
X_val_imputed = X_val_imputed.drop('customer_id', axis=1)

X_val_imputed.head()

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6
0,14000.0,1.0,1.0,1.0,40.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,265.0,109.0,146.0,478.0,1100.0,261.0,109.0,146.0,478.0,1100.0,261.0,98.0
1,9000.0,1.0,2.0,1.0,28.0,0.0,0.0,0.0,0.0,2.0,0.0,7436.0,7538.0,7816.0,8284.0,8279.0,8447.0,300.0,400.0,675.0,205.0,310.0,680.0
2,2500.0,1.0,2.0,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,2416.0,2458.0,2487.0,958.0,974.0,978.0,115.0,110.0,50.0,50.0,40.0,35.0
3,4500.0,1.0,2.0,2.0,29.0,0.0,0.0,0.0,-2.0,-2.0,-2.0,3527.0,2577.0,0.0,0.0,0.0,0.0,150.0,0.0,0.0,0.0,0.0,0.0
4,18000.0,1.0,3.0,1.0,34.0,1.0,1.0,-1.0,-1.0,0.0,0.0,13852.0,-1148.0,16930.0,1148.0,1148.0,0.0,0.0,18078.0,1148.0,0.0,0.0,0.0


In [87]:
## Scaling the dataset 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

In [88]:
# Calculate scale_pos_weight
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

In [89]:
# # Feature Selection using RFE with cross-validation
# xgb_feature_selector = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
# rfecv = RFECV(estimator=xgb_feature_selector, step=1, cv=StratifiedKFold(3), scoring='f1', n_jobs=-1)
# rfecv.fit(X_train_scaled, y_train)

# # Selected features
# selected_features = rfecv.support_

# print(selected_features)

# X_train_scaled = X_train_scaled[:, selected_features]
# X_val_scaled = X_val_scaled[:, selected_features]

In [90]:
## Building Model on the Training Data
# Handling imbalanced data using undersampling
# rus = RandomUnderSampler(random_state=7)
# X_train_scaled, y_train = rus.fit_resample(X_train_scaled, y_train)



# Define the model
xgb_model = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
# xgb_model = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

# Hyperparameter optimization using RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.03, 0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 7, 9],
    'colsample_bytree': [0.3, 0.7],
    'subsample': [0.8, 1.0]
}

# Use stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)

In [91]:
# Define the scorer
f1_scorer = make_scorer(f1_score)

In [92]:
# Randomized search
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, scoring=f1_scorer, cv=skf, n_iter=10, random_state=7, verbose=1, n_jobs=-1)
random_search.fit(X_train_scaled, y_train)

# Best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Best parameters: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.7}


In [93]:
# Train the final model with the best parameters
final_model = XGBClassifier(**best_params, objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight)
# final_model = XGBClassifier(**best_params, objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')
final_model.fit(X_train_scaled, y_train)

In [94]:
# Predictions on the validation set
y_val_pred = final_model.predict(X_val_scaled)
f1 = f1_score(y_val, y_val_pred)
print(f"Validation F1 Score: {f1}")

Validation F1 Score: 0.5325443786982248
