In [1]:
from datetime import datetime
from itertools import combinations
import os
import re

import pandas as pd
import polars as pl
import numpy as np
import matplotlib as plt
import seaborn as sns
import lightgbm as lgb
import sklearn as sk
import mlflow
from feature_engine.encoding import CountFrequencyEncoder
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

import data_proc as dp

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [2]:
base, X, y = dp.load_data('data/train)

In [3]:
X.shape

(1526659, 513)

In [10]:
cat_cols_base = list(X.select_dtypes("category").columns)
num_cols_base = list(X.select_dtypes(exclude="category").columns)
X, _ = dp.remove_single_val_cols(X, cat_cols_base)

bankacctype_710L
isdebitcard_729L
paytype1st_925L
paytype_783L
typesuite_864L
min_isbidproduct_390L
max_contaddr_matchlist_1032L
max_remitter_829L
min_contaddr_matchlist_1032L
min_remitter_829L
first_contaddr_matchlist_1032L
last_contaddr_matchlist_1032L
last_remitter_829L


In [4]:
cat_cols_base = list(X.select_dtypes("category").columns)
num_cols_base = list(X.select_dtypes(exclude="category").columns)
dp.fillna_single_val_cols(X, cat_cols_base)

bankacctype_710L
isdebitcard_729L
paytype1st_925L
paytype_783L
typesuite_864L
min_isbidproduct_390L
max_contaddr_matchlist_1032L
max_remitter_829L
min_contaddr_matchlist_1032L
min_remitter_829L
first_contaddr_matchlist_1032L
last_contaddr_matchlist_1032L
last_remitter_829L


In [6]:
cf = CountFrequencyEncoder(encoding_method='count', missing_values='ignore')
X = cf.fit_transform(X)



In [4]:
def reduce_group(grps):
    """picks the features with the largest number of unique values from each group"""
    use = []
    for g in grps:
        mx = 0; vx = g[0]
        for gg in g:
            n = X[gg].nunique()
            if n>mx:
                mx = n
                vx = gg
        use.append(vx)
        #print()
    print('Use these',use)
    return use

def group_columns_by_correlation(matrix, threshold=0.8):
    """returns groups of correlated features"""
    correlation_matrix = matrix.corr()
    groups = []
    remaining_cols = list(matrix.columns)
    while remaining_cols:
        col = remaining_cols.pop(0)
        group = [col]
        correlated_cols = [col]
        for c in remaining_cols:
            if correlation_matrix.loc[col, c] >= threshold:
                group.append(c)
                correlated_cols.append(c)
        groups.append(group)
        remaining_cols = [c for c in remaining_cols if c not in correlated_cols]
    
    return groups

In [5]:
def drop_corr_keep_best(X):
    nans_df = X[num_cols_base].isna()
    nans_groups={}
    for col in num_cols_base:
        cur_group = nans_df[col].sum()
        try:
            nans_groups[cur_group].append(col)
        except:
            nans_groups[cur_group]=[col]
    
    uses=[]
    for k,v in nans_groups.items():
        if len(v)>1:
                Vs = nans_groups[k]
                #cross_features=list(combinations(Vs, 2))
                #make_corr(Vs)
                grps= group_columns_by_correlation(X[Vs], threshold=0.8)
                use=reduce_group(grps)
                uses=uses+use
                #make_corr(use)
        else:
            uses=uses+v
        print('####### NAN count =',k)
    print(uses)
    print(len(uses))
    uses=uses+cat_cols_base
    print(len(uses))
    return X[uses]

In [5]:
 for col in cat_cols_base:
    if X[col].nunique() == 1:
        print(col)

NameError: name 'cat_cols_base' is not defined

In [None]:
X['bankacctype_710L'].describe()

In [None]:
X.loc[X['bankacctype_710L'].isnull(), 'bankacctype_710L']

In [None]:
cf = CountFrequencyEncoder(encoding_method='count', missing_values='ignore')
X = cf.fit_transform(X)

In [6]:
X['bankacctype_710L']

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
1526654     CA
1526655     CA
1526656     CA
1526657     CA
1526658     CA
Name: bankacctype_710L, Length: 1526659, dtype: category
Categories (1, object): ['CA']

In [7]:
REMOTE_TRACKING_IP = os.getenv("REMOTE_IP", "localhost")
MLFLOW_TRACKING_URI = f"http://{REMOTE_TRACKING_IP}:5000"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

CREDIT_EXPERIMENT_NAME = "credit-score-encode"
EXPERIMENT_NAME = "chosen-models-credit"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(CREDIT_EXPERIMENT_NAME)

MlflowException: API request to http://localhost:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=credit-score-encode (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000147C1CF9D50>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [None]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

In [15]:
X = X.drop(columns=[x for x in num_cols_base if 'num_group' in x])
cat_cols_base = list(X.select_dtypes("category").columns)
num_cols_base = list(X.select_dtypes(exclude="category").columns)

In [16]:
X = drop_corr_keep_best(X)

Use these ['month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationinc_384A', 'max_birth_259D', 'max_personindex_1023L', 'min_personindex_1023L', 'min_persontype_1072L', 'min_persontype_792L', 'first_personindex_1023L', 'first_persontype_1072L', 'first_persontype_792L']
####### NAN count = 0
####### NAN count = 1389663
U

In [17]:
X.shape

(1526659, 395)

# Experiment 1
Count encoded categorical variables

In [18]:
train_data = lgb.Dataset(X, label=y)

In [19]:
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)], return_cvbooster=True)



Training until validation scores don't improve for 10 rounds
[50]	cv_agg's valid auc: 0.781883 + 0.00173996
[100]	cv_agg's valid auc: 0.801726 + 0.00177195
[150]	cv_agg's valid auc: 0.810108 + 0.0016163
[200]	cv_agg's valid auc: 0.814942 + 0.00166265
[250]	cv_agg's valid auc: 0.818165 + 0.00162424
[300]	cv_agg's valid auc: 0.820434 + 0.00157116
[350]	cv_agg's valid auc: 0.822102 + 0.00159375
[400]	cv_agg's valid auc: 0.823473 + 0.00154553
[450]	cv_agg's valid auc: 0.824595 + 0.00152416
[500]	cv_agg's valid auc: 0.825565 + 0.00144061
[550]	cv_agg's valid auc: 0.82644 + 0.00140071
[600]	cv_agg's valid auc: 0.827106 + 0.00143827
[650]	cv_agg's valid auc: 0.827807 + 0.0013451
[700]	cv_agg's valid auc: 0.828372 + 0.00132421
[750]	cv_agg's valid auc: 0.828979 + 0.00138693
[800]	cv_agg's valid auc: 0.829428 + 0.00137722
[850]	cv_agg's valid auc: 0.829783 + 0.00127941
[900]	cv_agg's valid auc: 0.830135 + 0.00131502
[950]	cv_agg's valid auc: 0.830529 + 0.00133683
[1000]	cv_agg's valid auc: 0.83

In [13]:
res

{'valid auc-mean': [0.6783768462509188,
  0.6918957700127772,
  0.7068684052468015,
  0.7110045116635759,
  0.712435085512945,
  0.7184120294756394,
  0.7207062156448399,
  0.7246273642708612,
  0.7256670121938682,
  0.7274982650780168,
  0.7286789127001783,
  0.7308341052377331,
  0.7321998215284199,
  0.73462372032797,
  0.7369497446477673,
  0.738628099138922,
  0.7410971014341392,
  0.7424552187562192,
  0.7451996747472812,
  0.7464279231553814,
  0.7476746663824805,
  0.7495284715276217,
  0.7510882348256611,
  0.7525743782586529,
  0.7538183104758585,
  0.75512327220123,
  0.7567150547316701,
  0.7577701139042776,
  0.7590212676939714,
  0.7601298444555983,
  0.7612642311867803,
  0.7626124594902463,
  0.7639162851992249,
  0.7652382545828746,
  0.7664566340927353,
  0.7677796211721699,
  0.7686172152027337,
  0.7694749305992575,
  0.770353550039848,
  0.7715699760238743,
  0.7723122347783642,
  0.7731874499569489,
  0.7741181220538358,
  0.7752694647602468,
  0.775832803627306,


In [18]:
preds = res['cvbooster'].predict(X)

In [28]:
eval_result['valid']['auc-mean'][-1]

0.8312394807888728

In [23]:
np.array(preds).mean(axis=0)

array([0.03450705, 0.04219264, 0.03817727, ..., 0.03775096, 0.00240081,
       0.00561591])

In [24]:
preds

[array([0.0349863 , 0.04031065, 0.04016298, ..., 0.03244252, 0.00200471,
        0.0051642 ]),
 array([0.03476954, 0.04475904, 0.04148933, ..., 0.04290371, 0.00320878,
        0.00549897]),
 array([0.03508357, 0.03954467, 0.03750932, ..., 0.03672016, 0.00223579,
        0.00525933]),
 array([0.03091966, 0.04353575, 0.03481438, ..., 0.04257432, 0.00197837,
        0.00587881]),
 array([0.03677617, 0.04281308, 0.03691032, ..., 0.03411408, 0.0025764 ,
        0.00627824])]

In [4]:
from feature_engine.selection import SelectByInformationValue
iv = SelectByInformationValue()
iv.fit(X, y)

ValueError: Some of the variables in the dataset contain NaN. Check and remove those before using this transformer.

In [None]:
iv.get_feature_names_out()

# Experiment 2
fillna categorical

In [5]:
base, X, y = dp.load_data()

In [6]:
cat_cols_base = list(X.select_dtypes("category").columns)
num_cols_base = list(X.select_dtypes(exclude="category").columns)

In [7]:
def fill_employedtotal(x):
    if x == 'LESS_ONE':
        return 1
    if x == 'MORE_ONE':
        return 2
    if x == 'MORE_FIVE':
        return 3

In [8]:
for col in [x for x in cat_cols_base if 'employedtotal' in x]:
    X[col] = X[col].apply(fill_employedtotal).astype(float).fillna(0).astype(int)

X = X.drop(columns=['last_contaddr_matchlist_1032L'])
cat_cols_base = list(X.select_dtypes("category").columns)

In [9]:
fill_empty = ['housetype',
'bankacctype',
'credacc_status',
'relationshiptoclient',
'remitter',
'familystate',
'cardtype',
'typesuite',
'empl_industry',
'sex',
'contaddr_smempladdr',
'requesttype',
'incometype',
'credtype',
'inittransactioncode',
'disbursement',
'type_25L',
'role_1084L',
'maritalst',
'description',
'education',
'opencred',
'paytype',
'rejectreason',
'cancelreason',
'postype',
'lastst',
'twobodfilling',
'contaddr_matchlist',
'status_'
]

In [10]:
for pat in fill_empty:
    print(pat)
    for col in [x for x in cat_cols_base if pat in x]:
        if '' not in X[col].cat.categories:
            X[col] = X[col].cat.add_categories('')
        X[col] = X[col].fillna('')

housetype
bankacctype
credacc_status
relationshiptoclient
remitter
familystate
cardtype
typesuite
empl_industry
sex
contaddr_smempladdr
requesttype
incometype
credtype
inittransactioncode
disbursement
type_25L
role_1084L
maritalst
description
education
opencred
paytype
rejectreason
cancelreason
postype
lastst
twobodfilling
contaddr_matchlist
status_


In [11]:
fill_false = ['equality',
'isdebitcard',
'safeguaranty',
'isbidproduct']

In [12]:
for pat in fill_false:
    print(pat)
    for col in [x for x in cat_cols_base if pat in x]:
        X[col] = X[col].fillna(False)

equality
isdebitcard
safeguaranty
isbidproduct


In [18]:
X = drop_corr_keep_best(X)

Use these ['month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationinc_384A', 'max_birth_259D', 'max_num_group1_1_7', 'min_personindex_1023L', 'min_persontype_1072L', 'min_persontype_792L', 'first_personindex_1023L', 'first_persontype_1072L', 'first_persontype_792L', 'min_num_group1_1_7', 'first_num_group1_1_7']
####### 

In [19]:
cat_cols_base = list(X.select_dtypes("category").columns)

(1526659, 429)

In [22]:
train_data = lgb.Dataset(X, label=y, categorical_feature=cat_cols_base)
eval_result = {}
res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)], return_cvbooster=True)



Training until validation scores don't improve for 10 rounds
[50]	cv_agg's valid auc: 0.782462 + 0.00187951
[100]	cv_agg's valid auc: 0.802313 + 0.00170282
[150]	cv_agg's valid auc: 0.810592 + 0.00160996
[200]	cv_agg's valid auc: 0.815482 + 0.00150618
[250]	cv_agg's valid auc: 0.818714 + 0.0015342
[300]	cv_agg's valid auc: 0.821062 + 0.0014905
[350]	cv_agg's valid auc: 0.822843 + 0.00149771
[400]	cv_agg's valid auc: 0.824363 + 0.0015059
[450]	cv_agg's valid auc: 0.825545 + 0.00146628
[500]	cv_agg's valid auc: 0.826529 + 0.0013829
[550]	cv_agg's valid auc: 0.827413 + 0.0013107
[600]	cv_agg's valid auc: 0.828093 + 0.00132087
[650]	cv_agg's valid auc: 0.828754 + 0.00126335
[700]	cv_agg's valid auc: 0.829355 + 0.00120439
[750]	cv_agg's valid auc: 0.829951 + 0.00120908
[800]	cv_agg's valid auc: 0.830423 + 0.00122281
[850]	cv_agg's valid auc: 0.830783 + 0.00117331
[900]	cv_agg's valid auc: 0.831116 + 0.00120808
[950]	cv_agg's valid auc: 0.831506 + 0.00126954
[1000]	cv_agg's valid auc: 0.8318

In [23]:
eval_result['valid']['auc-mean'][-1]

0.8318173740952505

In [25]:
list(X.columns)

['month_decision',
 'weekday_decision',
 'credamount_770A',
 'applicationcnt_361L',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_867L',
 'clientscnt_1022L',
 'clientscnt_100L',
 'clientscnt_1071L',
 'clientscnt_1130L',
 'clientscnt_157L',
 'clientscnt_257L',
 'clientscnt_304L',
 'clientscnt_360L',
 'clientscnt_493L',
 'clientscnt_533L',
 'clientscnt_887L',
 'clientscnt_946L',
 'deferredmnthsnum_166L',
 'disbursedcredamount_1113A',
 'downpmt_116A',
 'homephncnt_628L',
 'mobilephncnt_593L',
 'numactivecreds_622L',
 'numactivecredschannel_414L',
 'numactiverelcontr_750L',
 'numcontrs3months_479L',
 'numnotactivated_1143L',
 'numpmtchanneldd_318L',
 'numrejects9m_859L',
 'sellerplacecnt_915L',
 'max_mainoccupationinc_384A',
 'max_birth_259D',
 'max_num_group1_1_7',
 'min_personindex_1023L',
 'min_persontype_1072L',
 'min_persontype_792L',
 'first_personindex_1023L',
 'first_persontype_1072L',
 'first_persontype_792L',
 'min_num_group1_1_7',


In [27]:
REMOTE_TRACKING_IP = os.getenv("REMOTE_IP", "localhost")
MLFLOW_TRACKING_URI = f"http://{REMOTE_TRACKING_IP}:5000"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
EXPERIMENT_NAME = "credit-score-tuning"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/775714010158325685', creation_time=1713352331682, experiment_id='775714010158325685', last_update_time=1713352331682, lifecycle_stage='active', name='credit-score-tuning', tags={}>

In [28]:
train_data = lgb.Dataset(X, label=y, categorical_feature=cat_cols_base, free_raw_data=False)

In [32]:
def hyperopt_search(train_data):
    def objective(params):
        with mlflow.start_run():
            mlflow.set_tag("model", "lgb-cat-fill")
            mlflow.log_params(params)
            eval_result = {}
            res = lgb.cv(params, train_data, callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)])
            auc = eval_result['valid']['auc-mean'][-1]
            mlflow.log_metric("auc-eval", auc)

        return {"loss": -auc, "status": STATUS_OK}

    search_space = {
        'feature_pre_filter': hp.choice('feature_pre_filter', [False]),
        'objective': hp.choice('objective', ['binary']),
        'metric': hp.choice('metric', ['auc']),
        'feature_fraction': hp.uniform('feature_fraction', 0.5, 0.9),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 0.9),
        'bagging_freq': hp.choice('bagging_freq', [5]),
        'n_estimators': hp.choice('n_estimators', [2000]),
        'verbose': hp.choice('verbose', [-1]),
        'learning_rate': hp.choice('learning_rate', [0.05, 0.03]),
        'num_leaves': hp.randint('num_leaves', 30, 150), 
        'min_child_samples': hp.randint('min_child_samples', 100, 500), 
        'min_child_weight': hp.loguniform('min_child_weight', -5, 4),
        'subsample': hp.uniform('subsample', 0.6, 1), 
        'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 1),
        'reg_alpha': hp.choice('reg_alpha', [0, 1e-1, 1, 2, 5, 7, 10, 100]),
        'reg_lambda': hp.choice('reg_lambda', [10, 20, 100, 200, 500, 1000]),
        'max_depth': hp.choice('max_depth', [7, 11, 13, 15, 20])
    }

    best_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=100,
        trials=Trials(),
    )

    print(best_result)

# hyperopt_search(train_data)

# Experiment 3
Fillna all data