# Training Final Model and Dump OOB Testing Output

In [17]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import optuna
import lightgbm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

import os
import yaml
import sys

sys.path.append('../src')
import utils.utils as utils

!pip freeze > ../requirements.txt

In [18]:
data = pd.read_csv('../data/cleaned_data.csv')

## Training model with optimized params

### Training Model

In [19]:
X = data.drop(['TARGET', 'SK_ID_CURR'], axis=1)
y = data['TARGET']

# Double check categorical cols
cat_cols = [col for col in X.columns if X[col].dtype == 'object' or str(X[col].dtype) == 'category']
for col in cat_cols:
    X[col] = X[col].astype('category')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=810)

In [20]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=810)

models = []

with open('../configs/model_config.yaml', 'r') as file:
    params = yaml.safe_load(file)

params['objective'] = 'binary'
params['metric'] = 'auc'
params['verbosity'] = -1
params['boosting_type'] = 'gbdt'
params['random_state'] = 810

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"Training Fold {fold+1}")
    X_train_kf, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train_kf, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lightgbm.Dataset(X_train_kf, label=y_train_kf, categorical_feature=cat_cols)
    val_data = lightgbm.Dataset(X_val, label=y_val, reference=train_data)
    
    model = lightgbm.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        num_boost_round=1000,
        callbacks=[
            lightgbm.early_stopping(stopping_rounds=50),
        ]
    )
    models.append(model)

Training Fold 1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[782]	training's auc: 0.854092	valid_1's auc: 0.771922
Training Fold 2
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[981]	training's auc: 0.867867	valid_1's auc: 0.768942
Training Fold 3
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[991]	training's auc: 0.868395	valid_1's auc: 0.773017
Training Fold 4
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.86924	valid_1's auc: 0.769104
Training Fold 5
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	training's auc: 0.868139	valid_1's auc: 0.783559


### Calculate ROC AUC and GINI for ensembled model

In [21]:
preds = np.zeros(len(X_test))
# Calculate and print the ROC AUC
for model in models:
    preds += model.predict(X_test) / len(models)
roc_auc = roc_auc_score(y_test, preds)

# Calculate GINI
gini = 2 * roc_auc - 1
print(f"ROC AUC: {roc_auc}")
print(f"GINI: {gini}")

ROC AUC: 0.838309631607354
GINI: 0.6766192632147081


In [25]:
for i, model in enumerate(models):
    model.save_model(f'../models/lgbm_fold_{i+1}.txt')

## Predict on Submit Test Set

In [29]:
submit_data = pd.read_csv('../data/cleaned_submit_data.csv')
submit = submit_data.drop(['SK_ID_CURR'], axis=1)

In [30]:
for col in cat_cols:
    submit[col] = submit[col].astype('category')

In [None]:
preds = np.zeros(len(submit))

for model in models:
    preds += model.predict(submit, raw_score=False) / len(models)

In [34]:
# Create new test output with SK_ID_CURR on X set and preds
submit_output = pd.DataFrame({'SK_ID_CURR': submit_data['SK_ID_CURR'], 'TARGET': preds})
from datetime import datetime

# Get the current datetime
current_datetime = datetime.now().strftime('%y-%m-%d_%H-%M-%S')
# Create the filename with the current datetime
filename = f'{current_datetime}_submission.csv'
# Save the DataFrame to a CSV file with the generated filename
submit_output.to_csv(f'../data/dump/{filename}', index=False)