In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import optuna
import lightgbm
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import os
import yaml
import sys

sys.path.append('../src')
import utils.utils as utils

!pip freeze > ../requirements.txt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv('../data/cleaned_data.csv')

In [4]:
X = data.drop(['TARGET', 'SK_ID_CURR'], axis=1)
y = data['TARGET']

# Double check categorical cols
cat_cols = [col for col in X.columns if X[col].dtype == 'object' or str(X[col].dtype) == 'category']
for col in cat_cols:
    X[col] = X[col].astype('category')

kf = KFold(n_splits=5, shuffle=True, random_state=810)

models = []

with open('../configs/model_config.yaml', 'r') as file:
    params = yaml.safe_load(file)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training Fold {fold+1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lightgbm.Dataset(X_train, label=y_train, categorical_feature=cat_cols)
    val_data = lightgbm.Dataset(X_val, label=y_val, reference=train_data)

    model = lightgbm.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        num_boost_round=1000,
        callbacks=[
            lightgbm.early_stopping(stopping_rounds=50)
        ]
    )
    models.append(model)

Training Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.204219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26146
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 211
[LightGBM] [Info] Start training from score 0.080697
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's l2: 0.0610107	valid_1's l2: 0.0669988
Training Fold 2
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26222
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 210
[LightGBM] [Info] Start training from score 0.080521
Training until validation scores don't improve fo

In [5]:
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
preds = np.zeros(len(X_val))
# Calculate and print the ROC AUC
for model in models:
    preds += model.predict(X_val) / len(models)

roc_auc = roc_auc_score(y_val, preds)
# Calculate GINI
gini = 2 * roc_auc - 1
print(f"ROC AUC: {roc_auc}")
print(f"GINI: {gini}")

ROC AUC: 0.8315496485073146
GINI: 0.6630992970146292


In [6]:
# Test on OOB set
oob_set = pd.read_csv('../data/backup/oob_set.csv').drop(['SK_ID_CURR.1', 'RN', 'SK_ID_CURR.2', 'RN.1'], axis=1)

In [7]:
def auto_transform_object(df, max_unique=10, output_path = "."):
    object_col = []
    log_binary_bin = []
    for col in df.columns:
        if df[col].dtype == 'object' or df[col].dtype.name == 'category':
            object_col.append(col)
    for col in object_col:
        nunique = df[col].nunique()
        if nunique == 2:
            vals = list(df[col].unique())
            df[col] = np.where(df[col] == vals[0], 1, np.where(df[col] == vals[1], 0, np.nan))
            log_binary_bin.append([col, vals[0], vals[1]])
        elif nunique > 2 and nunique < max_unique:
            df[col] = df[col].astype('category')
        elif nunique > max_unique:
            print(f'{col} has more than {max_unique} unique values, bypass auto transform.')
    pd.DataFrame(log_binary_bin, columns=['column', 'bin_1', 'bin_0']).to_csv(output_path, index=False)

auto_transform_object(oob_set, max_unique=10, output_path = '../log/log_binary_bin_oob_set.csv')

OCCUPATION_TYPE has more than 10 unique values, bypass auto transform.
ORGANIZATION_TYPE has more than 10 unique values, bypass auto transform.


In [8]:
oob_set['CODE_GENDER'] = np.where(oob_set['CODE_GENDER'] == 'XNA', np.nan, oob_set['CODE_GENDER'])
oob_set['CODE_GENDER'] = (oob_set['CODE_GENDER'] == 'M').astype(int)

oob_set['NAME_INCOME_TYPE'] = oob_set['NAME_INCOME_TYPE'].apply(lambda x: 'Others' if x in ['Unemployed', 'Student', 'Businessman', 'Maternity leave'] else x)
oob_set['NAME_INCOME_TYPE'] = oob_set['NAME_INCOME_TYPE'].astype('category')

oob_set['IS_UNACCOMPANIED'] = np.where(oob_set['NAME_TYPE_SUITE'] == 'Unaccompanied', 1, np.where(oob_set['NAME_TYPE_SUITE'].isna(), np.nan, 0))

oob_set['IS_SINGLE'] = np.where(oob_set['NAME_FAMILY_STATUS'] == 'Single / not married', 1, np.where(oob_set['NAME_FAMILY_STATUS'] == 'Unknown', np.nan, 0))

oob_set['OCCUPATION_TYPE'] = oob_set['OCCUPATION_TYPE'].astype('category')

def category(val):
    if 'Industry' in val: return 'industry'
    if 'Trade' in val: return 'trade'
    if 'Transport' in val: return 'transport'
    if 'Business Entity' in val: return 'business'
    if val in ['XNA']: return 'unknown'
    if val in ['Self-employed']: return 'self-employed'
    if val in ['Other']: return 'others_1'
    else : return 'others_2'
    return val
oob_set['ORGANIZATION_TYPE'] = oob_set['ORGANIZATION_TYPE'].apply(category)
oob_set['ORGANIZATION_TYPE'] = oob_set['ORGANIZATION_TYPE'].astype('category')

In [9]:
oob_set['DAYS_EMPLOYED'] = np.where(oob_set['DAYS_EMPLOYED'] == '365243', np.nan, np.where(oob_set['DAYS_EMPLOYED'].isna(), np.nan, oob_set['DAYS_EMPLOYED']))

oob_set['OWN_CAR_OVER_40Y'] = np.where(oob_set['OWN_CAR_AGE'] >= 40, 1, 0)
oob_set['OWN_CAR_AGE'] = np.where(oob_set['OWN_CAR_AGE'] >= 40, 40, np.where(oob_set['OWN_CAR_AGE'].isna(), np.nan, oob_set['OWN_CAR_AGE']))

oob_set['AMT_INCOME_TOTAL'] = np.where(oob_set['AMT_INCOME_TOTAL'] == 117000000, 202500, oob_set['AMT_INCOME_TOTAL'])

In [None]:
# oob_set.to_csv('../data/cleaned_oob_data.csv', index=False)

In [None]:
X_test = oob_set.drop(['SK_ID_CURR'], axis=1)

# Initialize preds for test set
preds = np.zeros(len(X_test))

# Calculate and print the ROC AUC
for model in models:
    preds += model.predict(X_test) / len(models)

In [17]:
# Create new test output with SK_ID_CURR on X set and preds
test_output = pd.DataFrame({'SK_ID_CURR': oob_set['SK_ID_CURR'], 'TARGET': preds})
test_output.to_csv('../data/test_output.csv', index=False)