In [1]:
!pip install imbalanced-learn==0.8.0



In [2]:
!pip install lightgbm==3.0.0



In [4]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot
from matplotlib.pyplot import figure
%matplotlib inline
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from math import sqrt

from imblearn.over_sampling import SMOTE

## Settings

In [5]:
RAW_DATA_DIR = "./../data/raw"
INTERIM_DATA_DIR = "./../data/interim"
PROCESSED_DATA_DIR = "./../data/processed"
VERSION = "v1.0"
params = {'axes.titlesize':'32',
          'xtick.labelsize':'24',
          'ytick.labelsize':'24'}
matplotlib.rcParams.update(params)

## Load Processed Data

In [6]:
rmoutliers_scaled_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-MinMaxScaled-v1.0.csv')
rmoutliers_df = pd.read_csv(f'{INTERIM_DATA_DIR}/Remove-Outliers-v1.0.csv')

## Train Test Split

In [10]:
X = rmoutliers_scaled_df[[
       'NumberOfTime30-59DaysPastDueNotWorse',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents']] # Based on analysis from recursive feature elimination and xgb feature importance
y = rmoutliers_scaled_df['SeriousDlqin2yrs']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_resampled, y_resampled = SMOTE(sampling_strategy='minority', random_state=123).fit_resample(X_train, y_train)

In [26]:
# Specifying the parameter
params={}
params['learning_rate']=0.05
params['feature_fraction']=1.0
params['bagging_fraction']=0.5
params['min_data_in_leaf']=30
params['boosting_type']='gbdt'
params['objective']='binary'
params['metric']='auc'
num_round = 50

train_data=lgb.Dataset(X_resampled, label=y_resampled)

model = lgb.train(params, train_data, num_round)

[LightGBM] [Info] Number of positive: 80422, number of negative: 80422
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1187
[LightGBM] [Info] Number of data points in the train set: 160844, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [27]:
y_pred_prob = model.predict(X_test)

In [28]:
y_pred_prob

array([0.24759382, 0.30018561, 0.88386831, ..., 0.61718717, 0.29761186,
       0.33041858])

In [29]:
print(f"AUROC: {roc_auc_score(y_test, y_pred_prob):.2f}%")

AUROC: 0.80%
