# Training Final Model and Dump OOB Testing Output

In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import optuna
import lightgbm
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import os
import yaml
import sys

sys.path.append('../src')
import utils.utils as utils

!pip freeze > ../requirements.txt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv('../data/cleaned_train_set.csv')

## Training model with optimized params

### Training Model

In [4]:
X = data.drop(['TARGET', 'SK_ID_CURR'], axis=1)
y = data['TARGET']

# Double check categorical cols
cat_cols = [col for col in X.columns if X[col].dtype == 'object' or str(X[col].dtype) == 'category']
for col in cat_cols:
    X[col] = X[col].astype('category')

kf = KFold(n_splits=5, shuffle=True, random_state=810)

models = []

with open('../configs/model_config.yaml', 'r') as file:
    params = yaml.safe_load(file)

params['objective'] = 'binary'
params['metric'] = 'auc'
params['verbosity'] = -1
params['boosting_type'] = 'gbdt'
params['random_state'] = 810

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training Fold {fold+1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lightgbm.Dataset(X_train, label=y_train, categorical_feature=cat_cols)
    val_data = lightgbm.Dataset(X_val, label=y_val, reference=train_data)

    model = lightgbm.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        num_boost_round=1000,
        callbacks=[
            lightgbm.early_stopping(stopping_rounds=50)
        ]
    )
    models.append(model)

Training Fold 1
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.85563	valid_1's auc: 0.775479
Training Fold 2
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[992]	training's auc: 0.855465	valid_1's auc: 0.7755
Training Fold 3
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.855515	valid_1's auc: 0.775495
Training Fold 4
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[992]	training's auc: 0.855295	valid_1's auc: 0.776714
Training Fold 5
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	training's auc: 0.854787	valid_1's auc: 0.773815


### Calculate ROC AUC and GINI for ensembled model

In [5]:
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

preds = np.zeros(len(X_val))
# Calculate and print the ROC AUC
for model in models:
    preds += model.predict(X_val) / len(models)
roc_auc = roc_auc_score(y_val, preds)

# Calculate GINI
gini = 2 * roc_auc - 1
print(f"ROC AUC: {roc_auc}")
print(f"GINI: {gini}")

ROC AUC: 0.8427609536399404
GINI: 0.6855219072798808


In [7]:
for i, model in enumerate(models):
    model.save_model(f'../model/model_fold_{i+1}.txt')

## Predict on OOB Test Set

In [24]:
oob_set = pd.read_csv('../data/cleaned_oob_set.csv')
X_test = oob_set.drop(['SK_ID_CURR'], axis=1)

In [25]:
for col in cat_cols:
    X_test[col] = X_test[col].astype('category')

In [26]:
# Initialize preds for test set
preds = np.zeros(len(X_test))

# Calculate and print the ROC AUC
for model in models:
    preds += model.predict(X_test, raw_score=False) / len(models)

In [27]:
# Create new test output with SK_ID_CURR on X set and preds
test_output = pd.DataFrame({'SK_ID_CURR': oob_set['SK_ID_CURR'], 'TARGET': preds})
test_output.to_csv('../data/dump/submission.csv', index=False)