In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#<font color='orange'>Basics</font>

In [16]:
#@title **Location** of the dataset
path =  "../data/HomeCredit/"
process_path = "../data/ProcessedData/"
save_path = "../tests/"
ri_datasets_path = "../data/riData/"

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import secrets
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from scipy.stats import ks_2samp
from lightgbm import LGBMClassifier
from pathlib import Path
from sklearn.metrics import (roc_auc_score)
from sklearn.model_selection import KFold


In [18]:
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                            f1_score, precision_score, recall_score,
                            roc_auc_score, roc_curve)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.semi_supervised import LabelSpreading

In [20]:
import credit_pipeline.data_exploration as dex
import credit_pipeline.training as tr
import credit_pipeline.reject_inference as ri

In [3]:
import submodules.TOPCIS_Python as top

ModuleNotFoundError: No module named 'submodules.TOPCIS_Python'

#Read Dataset

In [21]:
#@title Read dataset
df_o = pd.read_csv(path+'application_train.csv')    #HomeCredit training dataset

In [22]:
#@title Create develoment train and test
df_train, df_test = tr.create_train_test(df_o, seed=43)

In [23]:
#@title Set seed
new_seed = False #@param {type:"boolean"}

if new_seed:
    seed_number = secrets.randbelow(1_000) #to name the results files

    while seed_number <100:
        seed_number = secrets.randbelow(1_000)
else:
    seed_number = 000

print(seed_number)

0


#Params

In [24]:
params_dict = ri.params_dict

In [25]:
params_dict['LightGBM_2'] = {'boosting_type': 'gbdt', 'class_weight': None,
              'colsample_bytree': 0.22534977954592625, 'importance_type': 'split',
              'learning_rate': 0.052227873762946964, 'max_depth': 5,
              'min_child_samples': 26, 'min_child_weight': 0.001,
              'min_split_gain': 0.0, 'n_estimators': 159, 'n_jobs': -1,
              'num_leaves': 12, 'objective': None, 'random_state': seed_number,
              'reg_alpha': 0.7438345471808012, 'reg_lambda': 0.46164693905368515,
                'verbose': -1, 'subsample': 0.8896599304061413,
              'subsample_for_bin': 200000, 'subsample_freq': 0,
              'is_unbalance': True}

#<font color='orange'>Helper Functions</font>


#<font color='red'>Definition of Train and Test Val, and Unl</font>

In [26]:
N_splits=5  
kf = KFold(n_splits=N_splits)   #80-20 split for train-test
hist_dict = {}

for fold_number, (train_index, test_index) in enumerate(kf.split(df_o)):
    #diferent seed for each iteration
    seed_number = seed_number+fold_number

    df_train = df_o.iloc[train_index]
    df_test = df_o.iloc[test_index]
    
    val_split = int(df_train.shape[0] * 0.2)  #80-20 split for train-validation
    df_val = df_train.iloc[:val_split]
    df_train = df_train.iloc[val_split:]

    df_train, policy_model = ri.fit_policy(df_train)

    X_train, X_test, X_val = df_train, df_test, df_val
    y_train, y_test, y_val = df_train["TARGET"], df_test["TARGET"], df_val["TARGET"]

    X_train_acp, X_train_rej, y_train_acp, y_train_rej = ri.accept_reject_split(X_train, y_train, policy_clf=policy_model)
    X_test_acp, X_test_rej, y_test_acp, y_test_rej = ri.accept_reject_split(X_test, y_test, policy_clf=policy_model)
    X_val_acp, X_val_rej, y_val_acp, y_val_rej = ri.accept_reject_split(X_val, y_val, policy_clf=policy_model)
    
    models_dict = {}

    benchmark = tr.create_pipeline(X_train_acp, y_train_acp,
                                  LGBMClassifier(**params_dict['LightGBM_2']))
    benchmark.fit(X_train_acp, y_train_acp)

    
    models_dict['BM'] = benchmark

    models_dict.update(
        ri.augmentation_with_soft_cutoff(X_train_acp, y_train_acp, X_train_rej, seed = seed_number))
    models_dict.update(
        ri.augmentation(X_train_acp, y_train_acp, X_train_rej, mode='up', seed = seed_number))
    models_dict.update(
        ri.augmentation(X_train_acp, y_train_acp, X_train_rej, mode='down', seed = seed_number))
    models_dict.update(
        ri.fuzzy_augmentation(X_train_acp, y_train_acp, X_train_rej, seed = seed_number))
    models_dict.update(
        ri.extrapolation(X_train_acp, y_train_acp, X_train_rej, seed = seed_number))
    models_dict.update(
        ri.parcelling(X_train_acp, y_train_acp, X_train_rej, seed = seed_number))
    models_dict.update(
        ri.label_spreading(X_train_acp, y_train_acp, X_train_rej, seed = seed_number))
    models_dict.update(
        ri.trusted_non_outliers(X_train_acp, y_train_acp, X_train_rej, iterations=60,seed=seed_number))

    hist_dict[fold_number] = ri.get_metrics_RI(models_dict, X_test_acp, y_test_acp, X_val_acp, y_val_acp, X_test_rej)

In [27]:
hist_dict[fold_number]

Unnamed: 0,BM,A-SC,A-UW,A-DW,A-FU,E-C,PAR,LSP,TN
Overall AUC,0.732541,0.72879,0.734104,0.729383,0.698547,0.724357,0.725075,0.722428,0.720023
KS,0.352139,0.350976,0.359803,0.364534,0.301633,0.343917,0.343133,0.34133,0.331394
Balanced Accuracy,0.67042,0.665003,0.672897,0.678244,0.641457,0.666242,0.6688,0.668536,0.66392
Accuracy,0.748172,0.56713,0.613329,0.659657,0.56836,0.677904,0.632934,0.674798,0.661016
Precision,0.105853,0.0793,0.084928,0.091548,0.075064,0.091329,0.086178,0.091427,0.088471
Recall,0.584594,0.77304,0.738652,0.698762,0.722146,0.65337,0.708391,0.661623,0.667125
F1,0.179249,0.143844,0.15234,0.161887,0.135993,0.160256,0.153663,0.160655,0.156225
Approval Rate,0.949919,0.941896,0.945713,0.952507,0.948625,0.956842,0.949337,0.940278,0.939178
Kickout,0.0,0.113171,0.011629,0.013047,-0.001418,-0.004963,0.014465,0.01546,0.045808
KG,0.0,60.0,8.0,6.0,2.0,7.0,4.0,27.0,33.0


In [28]:
sum([hist_dict[i] for i in range(N_splits)])/N_splits

Unnamed: 0,BM,A-SC,A-UW,A-DW,A-FU,E-C,PAR,LSP,TN
Overall AUC,0.722625,0.715979,0.721911,0.720934,0.692366,0.720626,0.717786,0.71437,0.70666
KS,0.33151,0.325428,0.335022,0.338783,0.288826,0.333008,0.326215,0.326235,0.311676
Balanced Accuracy,0.660418,0.654022,0.662158,0.664135,0.636667,0.659056,0.655866,0.659562,0.651115
Accuracy,0.661783,0.629304,0.652778,0.650703,0.644133,0.65762,0.702689,0.684272,0.70004
Precision,0.09271,0.085736,0.089995,0.090997,0.084592,0.090773,0.097775,0.09337,0.094196
Recall,0.658916,0.681287,0.672474,0.678944,0.62825,0.660619,0.603871,0.632237,0.596887
F1,0.161633,0.151914,0.158488,0.160061,0.14823,0.159048,0.16701,0.162568,0.162333
Approval Rate,0.93977,0.924286,0.931792,0.934725,0.917715,0.940122,0.937247,0.933809,0.928061
Kickout,0.0,0.098785,0.020016,0.006499,-0.0005,0.009214,0.003651,0.049344,0.122397
KG,0.0,40.4,6.8,4.4,5.2,4.8,4.2,24.6,95.0


In [21]:
# add topsis/ahp multicriterio