In [18]:
from datetime import datetime
from itertools import combinations
import os
import re

import pandas as pd
import polars as pl
import numpy as np
import matplotlib as plt
import seaborn as sns
import lightgbm as lgb
import sklearn as sk
import mlflow
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

import data_proc as dp

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [2]:
REMOTE_TRACKING_IP = os.getenv("REMOTE_IP", "localhost")
MLFLOW_TRACKING_URI = f"http://{REMOTE_TRACKING_IP}:5000"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

CREDIT_EXPERIMENT_NAME = "credit-score-cv"
EXPERIMENT_NAME = "chosen-models-credit"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(CREDIT_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/218050481937314459', creation_time=1713079301255, experiment_id='218050481937314459', last_update_time=1713079301255, lifecycle_stage='active', name='credit-score-cv', tags={}>

In [6]:
(base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = dp.load_data_splits("data/train_base.parquet")
cat_cols_base = list(X_train.select_dtypes("category").columns)

In [7]:
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols_base, free_raw_data=False)
valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols_base, free_raw_data=False)
test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols_base, free_raw_data=False)

In [3]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    higher_better = True
    metric_name="gini_stability"
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

In [12]:
def experiment1(params) -> lgb.Booster:
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.log_params(params)
        eval_result = {}
        eval_res = []
        test_auc = []
        test_gini = []
        for i in range(5):
            (base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = dp.load_data_splits("data/train_base.parquet")
            cat_cols_base = list(X_train.select_dtypes("category").columns)
            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols_base)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols_base)
            test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols_base)
            bst = lgb.train(params, train_data, valid_sets=valid_data, valid_names=["eval"], 
                            callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)])
            eval_res.append(eval_result['eval']['auc'][-1])
            ypred = bst.predict(X_test)
            base_test["score"] = ypred
            test_auc.append(roc_auc_score(y_test.values, ypred))
            test_gini.append(gini_stability(base_test))
        mlflow.log_metric("auc-eval", np.mean(eval_res))
        mlflow.log_metric("auc-test", np.mean(test_auc))
        mlflow.log_metric("gini-stability-test", np.mean(test_gini))
        print("results: ", np.mean(eval_res), np.mean(test_auc), np.mean(test_gini))
        return bst, eval_res, test_auc, test_gini

In [6]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}


# Experiment 1
Cleaned dataset, categorical variables with 1 value and highly correlated numerical features (pearson > 0.95) removed
rare cat values groupped into 'other' value

In [10]:

bst, eval_res, test_auc, test_gini = experiment1(params)



Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.781355
[100]	eval's auc: 0.799777
[150]	eval's auc: 0.80836
[200]	eval's auc: 0.813556
[250]	eval's auc: 0.816771
[300]	eval's auc: 0.819447
[350]	eval's auc: 0.821402
[400]	eval's auc: 0.822754
[450]	eval's auc: 0.82401
[500]	eval's auc: 0.824911
[550]	eval's auc: 0.82566
[600]	eval's auc: 0.826497
[650]	eval's auc: 0.826989
[700]	eval's auc: 0.827606
[750]	eval's auc: 0.828107
[800]	eval's auc: 0.828609
[850]	eval's auc: 0.829059
[900]	eval's auc: 0.829393
[950]	eval's auc: 0.829766
[1000]	eval's auc: 0.83005
Did not meet early stopping. Best iteration is:
[1000]	eval's auc: 0.83005




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.779982
[100]	eval's auc: 0.800477
[150]	eval's auc: 0.808933
[200]	eval's auc: 0.814302
[250]	eval's auc: 0.817844
[300]	eval's auc: 0.820429
[350]	eval's auc: 0.822063
[400]	eval's auc: 0.823757
[450]	eval's auc: 0.824961
[500]	eval's auc: 0.826173
[550]	eval's auc: 0.826986
[600]	eval's auc: 0.827776
[650]	eval's auc: 0.828443
[700]	eval's auc: 0.829032
[750]	eval's auc: 0.829492
[800]	eval's auc: 0.830006
[850]	eval's auc: 0.830404
[900]	eval's auc: 0.830812
[950]	eval's auc: 0.831125
[1000]	eval's auc: 0.83133
Did not meet early stopping. Best iteration is:
[1000]	eval's auc: 0.83133




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.783786
[100]	eval's auc: 0.803775
[150]	eval's auc: 0.813076
[200]	eval's auc: 0.817836
[250]	eval's auc: 0.821221
[300]	eval's auc: 0.82369
[350]	eval's auc: 0.825415
[400]	eval's auc: 0.827041
[450]	eval's auc: 0.828363
[500]	eval's auc: 0.829567
[550]	eval's auc: 0.830244
[600]	eval's auc: 0.830963
[650]	eval's auc: 0.831681
[700]	eval's auc: 0.832405
[750]	eval's auc: 0.832781
Early stopping, best iteration is:
[765]	eval's auc: 0.832894




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.782092
[100]	eval's auc: 0.801257
[150]	eval's auc: 0.80969
[200]	eval's auc: 0.814669
[250]	eval's auc: 0.817888
[300]	eval's auc: 0.820331
[350]	eval's auc: 0.821853
[400]	eval's auc: 0.823184
[450]	eval's auc: 0.824253
[500]	eval's auc: 0.825225
[550]	eval's auc: 0.826043
[600]	eval's auc: 0.826795
[650]	eval's auc: 0.827368
[700]	eval's auc: 0.827869
[750]	eval's auc: 0.828306
[800]	eval's auc: 0.82874
[850]	eval's auc: 0.829333
[900]	eval's auc: 0.829667
[950]	eval's auc: 0.829909
[1000]	eval's auc: 0.830201
Did not meet early stopping. Best iteration is:
[999]	eval's auc: 0.830209




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.782582
[100]	eval's auc: 0.80152
[150]	eval's auc: 0.810042
[200]	eval's auc: 0.815071
[250]	eval's auc: 0.818453
[300]	eval's auc: 0.820836
[350]	eval's auc: 0.822664
[400]	eval's auc: 0.824102
[450]	eval's auc: 0.825311
[500]	eval's auc: 0.82634
[550]	eval's auc: 0.827389
[600]	eval's auc: 0.828068
[650]	eval's auc: 0.828669
[700]	eval's auc: 0.829427
[750]	eval's auc: 0.829987
[800]	eval's auc: 0.830447
[850]	eval's auc: 0.830868
[900]	eval's auc: 0.831314
[950]	eval's auc: 0.831824
[1000]	eval's auc: 0.832109
Did not meet early stopping. Best iteration is:
[998]	eval's auc: 0.83211
results:  0.8313144765244568 0.8306314690501406 0.6172219369994861


In [14]:
with mlflow.start_run():
    mlflow.log_dict(
        {
            "eval_auc": eval_res,
            "test_auc": test_auc,
            "test_gini": test_gini
        },  
        "experiment1.json"
    )

# Experiment 2
Check if removing cat columns with 1 value improve anything

In [5]:
def experiment2(params) -> lgb.Booster:
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.log_params(params)
        eval_result = {}
        eval_res = []
        test_auc = []
        test_gini = []
        for i in range(5):
            (base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = dp.load_data_splits("data/train_base.parquet")
            cat_cols_base = list(X_train.select_dtypes("category").columns)
            
            X_train, cols_to_rm = dp.remove_single_val_cols(X_train, cat_cols_base)
            X_valid = X_valid.drop(columns=cols_to_rm)
            X_test = X_test.drop(columns=cols_to_rm)
            cat_cols_base = list(X_train.select_dtypes("category").columns)

            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols_base)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols_base)
            test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols_base)

            bst = lgb.train(params, train_data, valid_sets=valid_data, valid_names=["eval"], 
                            callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)])
            eval_res.append(eval_result['eval']['auc'][-1])
            
            ypred = bst.predict(X_test)
            base_test["score"] = ypred
            test_auc.append(roc_auc_score(y_test.values, ypred))
            test_gini.append(gini_stability(base_test))

        mlflow.log_metric("auc-eval", np.mean(eval_res))
        mlflow.log_metric("auc-test", np.mean(test_auc))
        mlflow.log_metric("gini-stability-test", np.mean(test_gini))
        mlflow.log_dict(
            {
                "eval_auc": eval_res,
                "test_auc": test_auc,
                "test_gini": test_gini,
                "description": "Check if removing cat columns with 1 value improve anything"
            },  
            "experiment2.json"
        )
        print("results: ", np.mean(eval_res), np.mean(test_auc), np.mean(test_gini))
        return bst, eval_res, test_auc, test_gini

In [6]:
bst, _, _, _ = experiment2(params)

bankacctype_710L
isdebitcard_729L
paytype1st_925L
paytype_783L
typesuite_864L
min_isbidproduct_390L
max_contaddr_matchlist_1032L
max_remitter_829L
min_contaddr_matchlist_1032L
min_remitter_829L
first_contaddr_matchlist_1032L
last_contaddr_matchlist_1032L
last_remitter_829L




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.780072
[100]	eval's auc: 0.800172
[150]	eval's auc: 0.808707
[200]	eval's auc: 0.814084
[250]	eval's auc: 0.817302
[300]	eval's auc: 0.81967
[350]	eval's auc: 0.821319
[400]	eval's auc: 0.822917
[450]	eval's auc: 0.824217
[500]	eval's auc: 0.825172
[550]	eval's auc: 0.826007
[600]	eval's auc: 0.826769
[650]	eval's auc: 0.827453
[700]	eval's auc: 0.827847
[750]	eval's auc: 0.828356
[800]	eval's auc: 0.828942
[850]	eval's auc: 0.829369
[900]	eval's auc: 0.829631
[950]	eval's auc: 0.829935
[1000]	eval's auc: 0.830177
Did not meet early stopping. Best iteration is:
[999]	eval's auc: 0.830179
bankacctype_710L
isdebitcard_729L
paytype1st_925L
paytype_783L
typesuite_864L
min_isbidproduct_390L
max_contaddr_matchlist_1032L
max_remitter_829L
min_contaddr_matchlist_1032L
min_remitter_829L
first_contaddr_matchlist_1032L
last_contaddr_matchlist_1032L
last_remitter_829L




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.778786
[100]	eval's auc: 0.798706
[150]	eval's auc: 0.80781
[200]	eval's auc: 0.813254
[250]	eval's auc: 0.816928
[300]	eval's auc: 0.819283
[350]	eval's auc: 0.820988
[400]	eval's auc: 0.822538
[450]	eval's auc: 0.823685
[500]	eval's auc: 0.824876
[550]	eval's auc: 0.82579
[600]	eval's auc: 0.826604
[650]	eval's auc: 0.827358
[700]	eval's auc: 0.828001
[750]	eval's auc: 0.828524
[800]	eval's auc: 0.828994
[850]	eval's auc: 0.829401
Early stopping, best iteration is:
[841]	eval's auc: 0.829401
bankacctype_710L
isdebitcard_729L
paytype1st_925L
paytype_783L
typesuite_864L
min_isbidproduct_390L
max_contaddr_matchlist_1032L
max_remitter_829L
min_contaddr_matchlist_1032L
min_remitter_829L
first_contaddr_matchlist_1032L
last_contaddr_matchlist_1032L
last_remitter_829L




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.779748
[100]	eval's auc: 0.799445
[150]	eval's auc: 0.808222
[200]	eval's auc: 0.813553
[250]	eval's auc: 0.816794
[300]	eval's auc: 0.819069
[350]	eval's auc: 0.820944
[400]	eval's auc: 0.8224
[450]	eval's auc: 0.823538
[500]	eval's auc: 0.824584
[550]	eval's auc: 0.825515
[600]	eval's auc: 0.826261
[650]	eval's auc: 0.82671
[700]	eval's auc: 0.8273
[750]	eval's auc: 0.82791
[800]	eval's auc: 0.828338
[850]	eval's auc: 0.828772
[900]	eval's auc: 0.829077
Early stopping, best iteration is:
[925]	eval's auc: 0.829179
bankacctype_710L
isdebitcard_729L
paytype1st_925L
paytype_783L
typesuite_864L
min_isbidproduct_390L
max_contaddr_matchlist_1032L
max_remitter_829L
min_contaddr_matchlist_1032L
min_remitter_829L
first_contaddr_matchlist_1032L
last_contaddr_matchlist_1032L
last_remitter_829L




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.784699
[100]	eval's auc: 0.803109
[150]	eval's auc: 0.811466
[200]	eval's auc: 0.816602
[250]	eval's auc: 0.81994
[300]	eval's auc: 0.822541
[350]	eval's auc: 0.824467
[400]	eval's auc: 0.82603
[450]	eval's auc: 0.827215
[500]	eval's auc: 0.828251
[550]	eval's auc: 0.829142
[600]	eval's auc: 0.829939
[650]	eval's auc: 0.830759
[700]	eval's auc: 0.831367
[750]	eval's auc: 0.831941
[800]	eval's auc: 0.832212
[850]	eval's auc: 0.832649
[900]	eval's auc: 0.83303
Early stopping, best iteration is:
[904]	eval's auc: 0.833056
bankacctype_710L
isdebitcard_729L
paytype1st_925L
paytype_783L
typesuite_864L
min_isbidproduct_390L
max_contaddr_matchlist_1032L
max_remitter_829L
min_contaddr_matchlist_1032L
min_remitter_829L
first_contaddr_matchlist_1032L
last_contaddr_matchlist_1032L
last_remitter_829L




Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.786084
[100]	eval's auc: 0.80605
[150]	eval's auc: 0.815478
[200]	eval's auc: 0.820222
[250]	eval's auc: 0.823494
[300]	eval's auc: 0.825564
[350]	eval's auc: 0.827262
[400]	eval's auc: 0.828701
[450]	eval's auc: 0.829567
[500]	eval's auc: 0.830431
[550]	eval's auc: 0.831335
[600]	eval's auc: 0.832063
[650]	eval's auc: 0.832621
[700]	eval's auc: 0.833294
[750]	eval's auc: 0.833695
[800]	eval's auc: 0.834053
[850]	eval's auc: 0.834435
[900]	eval's auc: 0.834824
[950]	eval's auc: 0.835163
[1000]	eval's auc: 0.835622
Did not meet early stopping. Best iteration is:
[993]	eval's auc: 0.835642
results:  0.831485398965562 0.8296554890581076 0.6224366142149258


# Experiment 3
See if thowing rare category values into "others" value improves anything

In [10]:
def experiment3(params) -> lgb.Booster:
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.log_params(params)
        eval_result = {}
        eval_res = []
        test_auc = []
        test_gini = []
        for i in range(5):
            (base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = dp.load_data_splits("data/train_base.parquet")
            cat_cols_base = list(X_train.select_dtypes("category").columns)
            
            dp.rare_values_to_others(X_train, cat_cols_base)
            dp.rare_values_to_others(X_valid, cat_cols_base)
            dp.rare_values_to_others(X_test, cat_cols_base)
            
            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols_base)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols_base)
            test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols_base)

            bst = lgb.train(params, train_data, valid_sets=valid_data, valid_names=["eval"], 
                            callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)])
            eval_res.append(eval_result['eval']['auc'][-1])
            
            ypred = bst.predict(X_test)
            base_test["score"] = ypred
            test_auc.append(roc_auc_score(y_test.values, ypred))
            test_gini.append(gini_stability(base_test))

        mlflow.log_metric("auc-eval", np.mean(eval_res))
        mlflow.log_metric("auc-test", np.mean(test_auc))
        mlflow.log_metric("gini-stability-test", np.mean(test_gini))
        mlflow.log_dict(
            {
                "eval_auc": eval_res,
                "test_auc": test_auc,
                "test_gini": test_gini,
                "description": "See if thowing rare category values into others value improves anything"
            },  
            "experiment3.json"
        )
        print("results: ", np.mean(eval_res), np.mean(test_auc), np.mean(test_gini))
        return bst, eval_res, test_auc, test_gini

In [11]:
bst, _, _, _ = experiment3(params)

lastapprcommoditycat_1041M P201_108_190
lastcancelreason_561M P91_110_150
lastrejectcommoditycat_161M P151_9_55
lastrejectcommodtypec_5251769M P34_45_168
lastrejectreason_759M P185_59_178
lastrejectreasonclient_4145040M P59_114_135
max_cancelreason_3545846M P32_86_86
max_rejectreason_755M P53_10_15
max_rejectreasonclient_4145042M P30_86_84
min_cancelreason_3545846M P91_110_150
min_rejectreasonclient_4145042M P59_114_135
first_cancelreason_3545846M P91_110_150
first_rejectreason_755M P185_59_178
first_rejectreasonclient_4145042M P59_114_135
last_cancelreason_3545846M P32_86_86
last_rejectreason_755M P53_10_15
last_rejectreasonclient_4145042M P64_121_167
mode_cancelreason_3545846M P32_86_86
mode_rejectreason_755M P69_72_116
mode_rejectreasonclient_4145042M P59_114_135
last_status_219L P
max_empladdr_district_926M P121_51_168
max_empladdr_zipcode_114M P129_32_166
last_empladdr_district_926M P121_51_168
last_empladdr_zipcode_114M P129_32_166
max_empl_industry_691L ARMY_POLICE
max_incometyp



Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.791492
[100]	eval's auc: 0.809268
[150]	eval's auc: 0.81678
[200]	eval's auc: 0.821759
[250]	eval's auc: 0.824892
[300]	eval's auc: 0.827044
[350]	eval's auc: 0.828732
[400]	eval's auc: 0.830055
[450]	eval's auc: 0.831161
[500]	eval's auc: 0.832029
[550]	eval's auc: 0.832774
[600]	eval's auc: 0.833723
[650]	eval's auc: 0.834338
[700]	eval's auc: 0.835007
[750]	eval's auc: 0.835474
Early stopping, best iteration is:
[785]	eval's auc: 0.835733
lastapprcommoditycat_1041M P201_108_190
lastcancelreason_561M P91_110_150
lastrejectcommoditycat_161M P38_69_128
lastrejectcommodtypec_5251769M P184_3_97
lastrejectreasonclient_4145040M P204_22_168
max_cancelreason_3545846M P32_86_86
max_rejectreason_755M P53_10_15
max_rejectreasonclient_4145042M P30_86_84
min_cancelreason_3545846M P91_110_150
min_rejectreason_755M P185_59_178
min_rejectreasonclient_4145042M P59_114_135
first_cancelreason_3545846M P91_110_150
first_reje



Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.784082
[100]	eval's auc: 0.802896
[150]	eval's auc: 0.811722
[200]	eval's auc: 0.816431
[250]	eval's auc: 0.819616
[300]	eval's auc: 0.821991
[350]	eval's auc: 0.823707
[400]	eval's auc: 0.825191
[450]	eval's auc: 0.826384
[500]	eval's auc: 0.82742
[550]	eval's auc: 0.828158
[600]	eval's auc: 0.82889
[650]	eval's auc: 0.829691
[700]	eval's auc: 0.830196
[750]	eval's auc: 0.830777
[800]	eval's auc: 0.831148
[850]	eval's auc: 0.831568
[900]	eval's auc: 0.831988
[950]	eval's auc: 0.832351
[1000]	eval's auc: 0.832661
Did not meet early stopping. Best iteration is:
[1000]	eval's auc: 0.832661
lastapprcommoditycat_1041M P201_108_190
lastcancelreason_561M P91_110_150
lastrejectcommoditycat_161M P38_69_128
lastrejectcommodtypec_5251769M P185_44_170
lastrejectreason_759M P185_59_178
lastrejectreasonclient_4145040M P204_22_168
lastst_736L P
max_cancelreason_3545846M P32_86_86
max_rejectreason_755M P53_10_15
max_rejec



Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.784582
[100]	eval's auc: 0.804157
[150]	eval's auc: 0.812941
[200]	eval's auc: 0.818459
[250]	eval's auc: 0.821803
[300]	eval's auc: 0.823858
[350]	eval's auc: 0.825949
[400]	eval's auc: 0.827341
[450]	eval's auc: 0.828423
[500]	eval's auc: 0.829479
[550]	eval's auc: 0.830347
[600]	eval's auc: 0.83109
[650]	eval's auc: 0.831771
[700]	eval's auc: 0.83231
[750]	eval's auc: 0.832836
[800]	eval's auc: 0.833489
[850]	eval's auc: 0.833841
[900]	eval's auc: 0.834125
[950]	eval's auc: 0.834446
[1000]	eval's auc: 0.834717
Did not meet early stopping. Best iteration is:
[1000]	eval's auc: 0.834717
lastapprcommoditycat_1041M P201_108_190
lastcancelreason_561M P91_110_150
lastrejectcommoditycat_161M P151_9_55
lastrejectcommodtypec_5251769M P34_45_168
lastrejectreason_759M P185_59_178
lastrejectreasonclient_4145040M P59_114_135
lastst_736L R
max_cancelreason_3545846M P32_86_86
max_rejectreason_755M P53_10_15
max_rejectr



Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.783351
[100]	eval's auc: 0.80223
[150]	eval's auc: 0.810932
[200]	eval's auc: 0.81569
[250]	eval's auc: 0.818786
[300]	eval's auc: 0.821033
[350]	eval's auc: 0.822933
[400]	eval's auc: 0.824201
[450]	eval's auc: 0.825317
[500]	eval's auc: 0.82615
[550]	eval's auc: 0.826885
[600]	eval's auc: 0.827529
[650]	eval's auc: 0.828113
[700]	eval's auc: 0.828555
[750]	eval's auc: 0.829021
[800]	eval's auc: 0.829356
[850]	eval's auc: 0.829647
[900]	eval's auc: 0.829936
[950]	eval's auc: 0.830292
[1000]	eval's auc: 0.830595
Did not meet early stopping. Best iteration is:
[999]	eval's auc: 0.830596
lastapprcommoditycat_1041M P201_108_190
lastcancelreason_561M P91_110_150
lastrejectcommoditycat_161M P38_69_128
lastrejectcommodtypec_5251769M P132_130_54
lastrejectreason_759M P185_59_178
lastrejectreasonclient_4145040M P204_22_168
max_cancelreason_3545846M P32_86_86
max_rejectreason_755M P53_10_15
max_rejectreasonclient_41



Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.782453
[100]	eval's auc: 0.802037
[150]	eval's auc: 0.810355
[200]	eval's auc: 0.815399
[250]	eval's auc: 0.818757
[300]	eval's auc: 0.821218
[350]	eval's auc: 0.822865
[400]	eval's auc: 0.82454
[450]	eval's auc: 0.825585
[500]	eval's auc: 0.826543
[550]	eval's auc: 0.827338
[600]	eval's auc: 0.828133
[650]	eval's auc: 0.828672
[700]	eval's auc: 0.829277
[750]	eval's auc: 0.829879
[800]	eval's auc: 0.830219
[850]	eval's auc: 0.830579
[900]	eval's auc: 0.830967
[950]	eval's auc: 0.831361
Early stopping, best iteration is:
[955]	eval's auc: 0.831401
results:  0.8330088906717226 0.8298610780110138 0.6124787789834503


# Experiment 4
remove cat cols with 1 value + move rare occurences to one value + remove highly correlated features

In [21]:
def experiment4(params) -> lgb.Booster:
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.log_params(params)
        eval_result = {}
        eval_res = []
        test_auc = []
        test_gini = []
        for i in range(5):
            (base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = dp.load_data_splits("data/train_base.parquet")
            cat_cols_base = list(X_train.select_dtypes("category").columns)

            X_train, cols_to_rm = dp.remove_single_val_cols(X_train, cat_cols_base)
            X_valid = X_valid.drop(columns=cols_to_rm)
            X_test = X_test.drop(columns=cols_to_rm)
            cat_cols_base = list(X_train.select_dtypes("category").columns)

            dp.rare_values_to_others(X_train, cat_cols_base)
            dp.rare_values_to_others(X_valid, cat_cols_base)
            dp.rare_values_to_others(X_test, cat_cols_base)

            num_cols_base = list(X_train.select_dtypes(include=np.number).columns)
            combs = list(combinations(num_cols_base, 2))
            corrs = dp.numeric_cols_correlation_check(X_train, combs)
            cols_to_rm = dp.get_corr_cols_to_rm(X_train, corrs)

            X_train = X_train.drop(columns=cols_to_rm)
            X_valid = X_valid.drop(columns=cols_to_rm)
            X_test = X_test.drop(columns=cols_to_rm)

            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols_base)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols_base)
            test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols_base)
            bst = lgb.train(params, train_data, valid_sets=valid_data, valid_names=["eval"], 
                            callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)])
            eval_res.append(eval_result['eval']['auc'][-1])
            ypred = bst.predict(X_test)
            base_test["score"] = ypred
            test_auc.append(roc_auc_score(y_test.values, ypred))
            test_gini.append(gini_stability(base_test))
        mlflow.log_metric("auc-eval", np.mean(eval_res))
        mlflow.log_metric("auc-test", np.mean(test_auc))
        mlflow.log_metric("gini-stability-test", np.mean(test_gini))
        mlflow.log_dict(
            {
                "eval_auc": eval_res,
                "test_auc": test_auc,
                "test_gini": test_gini,
                "description": "remove cat cols with 1 value + move rare occurences to one value + remove highly correlated features"
            },  
            "experiment4.json"
        )
        print("results: ", np.mean(eval_res), np.mean(test_auc), np.mean(test_gini))
        return bst, eval_res, test_auc, test_gini

In [None]:
bst, _, _, _ = experiment4(params)

bankacctype_710L
isdebitcard_729L
paytype1st_925L
paytype_783L
typesuite_864L
min_isbidproduct_390L
max_contaddr_matchlist_1032L
max_remitter_829L
min_contaddr_matchlist_1032L
min_remitter_829L
first_contaddr_matchlist_1032L
last_contaddr_matchlist_1032L
last_remitter_829L
lastapprcommoditycat_1041M P201_108_190
lastcancelreason_561M P91_110_150
lastrejectcommoditycat_161M P38_69_128
lastrejectcommodtypec_5251769M P185_44_170
lastrejectreason_759M P185_59_178
lastrejectreasonclient_4145040M P59_114_135
max_cancelreason_3545846M P32_86_86
max_rejectreason_755M P53_10_15
max_rejectreasonclient_4145042M P30_86_84
min_cancelreason_3545846M P91_110_150
min_rejectreasonclient_4145042M P59_114_135
first_cancelreason_3545846M P91_110_150
first_rejectreason_755M P185_59_178
first_rejectreasonclient_4145042M P59_114_135
last_cancelreason_3545846M P91_110_150
last_rejectreason_755M P185_59_178
last_rejectreasonclient_4145042M P64_121_167
mode_cancelreason_3545846M P91_110_150
mode_rejectreason_75

  corr = pearsonr(data[comb[0]].fillna(0), data[comb[1]].fillna(0))[0]


Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.779104
[100]	eval's auc: 0.797639
[150]	eval's auc: 0.805495
[200]	eval's auc: 0.810642
[250]	eval's auc: 0.813462
[300]	eval's auc: 0.815863
[350]	eval's auc: 0.817836
[400]	eval's auc: 0.819394
[450]	eval's auc: 0.820559
[500]	eval's auc: 0.821555
[550]	eval's auc: 0.822552
[600]	eval's auc: 0.823175
[650]	eval's auc: 0.823879
[700]	eval's auc: 0.824403
[750]	eval's auc: 0.824888
[800]	eval's auc: 0.825176
[850]	eval's auc: 0.825574
Early stopping, best iteration is:
[860]	eval's auc: 0.825654


# Experiment 5
fill several missing columns with a minimal value 

In [None]:
def experiment5(params) -> lgb.Booster:
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.log_params(params)
        eval_result = {}
        eval_res = []
        test_auc = []
        test_gini = []
        for i in range(5):
            (base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = dp.load_data_splits("data/train_base.parquet")
            cat_cols_base = list(X_train.select_dtypes("category").columns)
            num_cols_base = list(X_train.select_dtypes(include=np.number).columns)
            
            dp.fill_min(X_train, num_cols_base)
            dp.fill_min(X_valid, num_cols_base)
            dp.fill_min(X_test, num_cols_base)
            
            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols_base)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols_base)
            test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols_base)

            bst = lgb.train(params, train_data, valid_sets=valid_data, valid_names=["eval"], 
                            callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)])
            eval_res.append(eval_result['eval']['auc'][-1])
            
            ypred = bst.predict(X_test)
            base_test["score"] = ypred
            test_auc.append(roc_auc_score(y_test.values, ypred))
            test_gini.append(gini_stability(base_test))

        mlflow.log_metric("auc-eval", np.mean(eval_res))
        mlflow.log_metric("auc-test", np.mean(test_auc))
        mlflow.log_metric("gini-stability-test", np.mean(test_gini))
        mlflow.log_dict(
            {
                "eval_auc": eval_res,
                "test_auc": test_auc,
                "test_gini": test_gini,
                "description": "fill several missing columns with a minimal value"
            },  
            "experiment5.json"
        )
        print("results: ", np.mean(eval_res), np.mean(test_auc), np.mean(test_gini))
        return bst, eval_res, test_auc, test_gini

In [None]:
bst, _, _, _ = experiment5(params)

# Experiment 6
fill missing columns with 0, where it seems to make sense

In [None]:
def experiment6(params) -> lgb.Booster:
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.log_params(params)
        eval_result = {}
        eval_res = []
        test_auc = []
        test_gini = []
        for i in range(5):
            (base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = dp.load_data_splits("data/train_base.parquet")
            cat_cols_base = list(X_train.select_dtypes("category").columns)
            num_cols_base = list(X_train.select_dtypes(include=np.number).columns)
            
            dp.fill_min(X_train, num_cols_base)
            dp.fill_min(X_valid, num_cols_base)
            dp.fill_min(X_test, num_cols_base)
            
            train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols_base)
            valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols_base)
            test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols_base)

            bst = lgb.train(params, train_data, valid_sets=valid_data, valid_names=["eval"], 
                            callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)])
            eval_res.append(eval_result['eval']['auc'][-1])
            
            ypred = bst.predict(X_test)
            base_test["score"] = ypred
            test_auc.append(roc_auc_score(y_test.values, ypred))
            test_gini.append(gini_stability(base_test))

        mlflow.log_metric("auc-eval", np.mean(eval_res))
        mlflow.log_metric("auc-test", np.mean(test_auc))
        mlflow.log_metric("gini-stability-test", np.mean(test_gini))
        mlflow.log_dict(
            {
                "eval_auc": eval_res,
                "test_auc": test_auc,
                "test_gini": test_gini,
                "description": "fill several missing columns with a 0"
            },  
            "experiment6.json"
        )
        print("results: ", np.mean(eval_res), np.mean(test_auc), np.mean(test_gini))
        return bst, eval_res, test_auc, test_gini

In [None]:
bst, _, _, _ = experiment6(params)

# Experiment 7
add missing value indicators

# Experiment 8
like experiment 5 but with missing value indicators

# Experiment 9
like experiment 6 but with missing value indicators

# Experiment 10
Try with some custom NA fills from kaggle noteook

# Experiment 11
add missing value indicators and impute missing values with knn