In [1]:
from datetime import datetime
import os
import re

import pandas as pd
import polars as pl
import numpy as np
import matplotlib as plt
import seaborn as sns
import lightgbm as lgb
import sklearn as sk
import mlflow
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)

In [6]:
REMOTE_TRACKING_IP = os.getenv("REMOTE_IP", "localhost")
MLFLOW_TRACKING_URI = f"http://{REMOTE_TRACKING_IP}:5000"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

CREDIT_EXPERIMENT_NAME = "credit-score-train"
EXPERIMENT_NAME = "chosen-models-credit"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(CREDIT_EXPERIMENT_NAME)

2024/04/04 21:52:25 INFO mlflow.tracking.fluent: Experiment with name 'credit-score-train' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/667472224209157606', creation_time=1712260345326, experiment_id='667472224209157606', last_update_time=1712260345326, lifecycle_stage='active', name='credit-score-train', tags={}>

In [7]:
def get_splits(data: pl.DataFrame):
    case_ids = data[["case_id"]].unique()
    case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.7)
    case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5)
    return case_ids_train, case_ids_valid, case_ids_test

In [26]:
def from_polars_to_pandas(data, case_ids: pl.DataFrame, cat_cols=None) -> pd.DataFrame:
    base = data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas()
    X = data.filter(pl.col("case_id").is_in(case_ids))[[x for x in data.columns if x not in ["case_id", "WEEK_NUM", "target"]]].to_pandas()
    y = data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    if cat_cols is None:
        cat_cols = list(X.select_dtypes("object").columns)
    
    X[cat_cols] = X[cat_cols].astype("category")
    return base, X, y

In [27]:
def load_data_splits(data_path: str):
    data = pl.read_parquet(data_path, use_pyarrow = True)
    case_ids_train, case_ids_valid, case_ids_test = get_splits(data)
    return (
        from_polars_to_pandas(data, case_ids_train),
        from_polars_to_pandas(data, case_ids_valid),
        from_polars_to_pandas(data, case_ids_test)
    )

In [28]:
(base_train, X_train, y_train), (base_valid, X_valid, y_valid), (base_test, X_test, y_test) = load_data_splits("data/train_base.parquet")
cat_cols_base = list(X_train.select_dtypes("category").columns)

In [56]:
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols_base)
valid_data = lgb.Dataset(X_valid, label=y_valid, categorical_feature=cat_cols_base)
test_data = lgb.Dataset(X_test, label=y_test, categorical_feature=cat_cols_base)

In [57]:
def train_lgb_model(params, train_data, valid_data) -> lgb.Booster:
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.log_params(params)
        eval_result = {}
        bst = lgb.train(params, train_data, valid_sets=valid_data, valid_names=["eval"], 
                        callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10), lgb.record_evaluation(eval_result)])
        mlflow.log_metric("auc", eval_result['eval']['auc'][-1])
        return bst, eval_result

In [58]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}


bst, eval_results = train_lgb_model(params, train_data, valid_data)

Training until validation scores don't improve for 10 rounds
[50]	eval's auc: 0.777952
[100]	eval's auc: 0.797256
[150]	eval's auc: 0.805721
[200]	eval's auc: 0.811403
[250]	eval's auc: 0.814745
[300]	eval's auc: 0.817435
[350]	eval's auc: 0.819253
[400]	eval's auc: 0.820796
[450]	eval's auc: 0.822087
[500]	eval's auc: 0.823132
[550]	eval's auc: 0.824052
[600]	eval's auc: 0.824725
[650]	eval's auc: 0.825416
[700]	eval's auc: 0.825934
Early stopping, best iteration is:
[736]	eval's auc: 0.826282


In [60]:
eval_results['eval']['auc'][-1]

0.8262716110570111

In [13]:
ypred = bst.predict(test_df[[x for x in test_df.columns if x != 'target']])

In [14]:
roc_auc_score(test_df['target'], ypred)

0.8303897059872279

In [21]:
f1_score(test_df['target'], ypred > 0.13)

0.22561422808947562

In [18]:
train_df['target']

0          0
1          0
2          0
3          0
4          1
          ..
1526654    0
1526655    0
1526656    0
1526657    0
1526658    0
Name: target, Length: 1526659, dtype: int64

In [17]:
np.percentile(ypred, 96)

0.13000668743092647

In [63]:
np.mean(train_df['target'])

0.03143711716845693

In [22]:
def get_lgbm_varimp(model, train_columns, max_vars=50):
    cv_varimp_df = pd.DataFrame([train_columns, model.feature_importance()]).T
    cv_varimp_df.columns = ['feature_name', 'varimp']
    cv_varimp_df.sort_values(by='varimp', ascending=False, inplace=True)
    cv_varimp_df = cv_varimp_df.iloc[0:max_vars]   
    return cv_varimp_df

In [23]:
imps = get_lgbm_varimp(bst, [x for x in train_df.columns if x != 'target'])

In [24]:
imps

Unnamed: 0,feature_name,varimp
0,case_id,117
36,annuity_780A,116
174,pmtnum_254L,114
178,price_1097A,113
290,min_cancelreason_3545846M,106
405,max_birth_259D,99
71,cntpmts24_3658933L,77
133,mobilephncnt_593L,74
434,max_incometype_1044T,62
86,eir_270L,62


In [57]:
test_data.get_label().shape

(305332,)

In [61]:
test_df.head()

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,clientscnt_946L,cntincpaycont9m_3716944L,cntpmts24_3658933L,commnoinclast6m_3546845L,credamount_770A,credtype_322L,currdebt_22A,currdebtcredtyperange_828A,datefirstoffer_1144D,datelastinstal40dpd_247D,datelastunpaid_3546854D,daysoverduetolerancedd_3976961L,deferredmnthsnum_166L,disbursedcredamount_1113A,disbursementtype_67L,downpmt_116A,dtlastpmtallstes_4499206D,eir_270L,equalitydataagreement_891L,firstclxcampaign_1125D,firstdatedue_489D,homephncnt_628L,inittransactionamount_650A,inittransactioncode_186L,interestrate_311L,isbidproduct_1095L,isdebitcard_729L,lastactivateddate_801D,lastapplicationdate_877D,lastapprcommoditycat_1041M,lastapprcredamount_781A,lastapprdate_640D,lastcancelreason_561M,lastdelinqdate_224D,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectdate_50D,lastrejectreason_759M,lastrejectreasonclient_4145040M,lastst_736L,maininc_215A,mastercontrelectronic_519L,mastercontrexist_109L,maxannuity_159A,maxdbddpdlast1m_3658939P,maxdbddpdtollast12m_3658940P,maxdbddpdtollast6m_4187119P,maxdebt4_972A,maxdpdfrom6mto36m_3546853P,maxdpdinstldate_3546855D,maxdpdinstlnum_3546846P,maxdpdlast12m_727P,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,mobilephncnt_593L,monthsannuity_845L,numactivecreds_622L,numactivecredschannel_414L,numactiverelcontr_750L,numcontrs3months_479L,numincomingpmts_3546848L,numinstlallpaidearly3d_817L,numinstls_657L,numinstlsallpaid_934L,numinstlswithdpd10_728L,numinstlswithdpd5_4187116L,numinstlswithoutdpd_562L,numinstmatpaidtearly2d_4499204L,numinstpaid_4499208L,numinstpaidearly3d_3546850L,numinstpaidearly3dest_4493216L,numinstpaidearly5d_1087L,numinstpaidearly5dest_4493211L,numinstpaidearly5dobd_4499205L,numinstpaidearly_338L,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,validfrom_1069D,max_actualdpd_943P,max_annuity_853A,max_credacc_actualbalance_314A,max_credacc_credlmt_575A,max_credacc_maxhisbal_375A,max_credacc_minhisbal_90A,max_credamount_590A,max_currdebt_94A,max_downpmt_134A,max_mainoccupationinc_437A,max_maxdpdtolerance_577P,max_outstandingdebt_522A,max_revolvingaccount_394A,min_actualdpd_943P,min_annuity_853A,min_credacc_actualbalance_314A,min_credacc_credlmt_575A,min_credacc_maxhisbal_375A,min_credacc_minhisbal_90A,min_credamount_590A,min_currdebt_94A,min_downpmt_134A,min_mainoccupationinc_437A,min_maxdpdtolerance_577P,min_outstandingdebt_522A,min_revolvingaccount_394A,first_actualdpd_943P,first_annuity_853A,first_credacc_credlmt_575A,first_credamount_590A,first_currdebt_94A,first_downpmt_134A,first_mainoccupationinc_437A,first_maxdpdtolerance_577P,first_outstandingdebt_522A,first_revolvingaccount_394A,last_actualdpd_943P,last_annuity_853A,last_credacc_actualbalance_314A,last_credacc_credlmt_575A,last_credacc_maxhisbal_375A,last_credacc_minhisbal_90A,last_credamount_590A,last_currdebt_94A,last_downpmt_134A,last_mainoccupationinc_437A,last_maxdpdtolerance_577P,last_outstandingdebt_522A,mean_actualdpd_943P,mean_annuity_853A,mean_credacc_actualbalance_314A,mean_credacc_credlmt_575A,mean_credacc_maxhisbal_375A,mean_credacc_minhisbal_90A,mean_credamount_590A,mean_currdebt_94A,mean_downpmt_134A,mean_mainoccupationinc_437A,mean_maxdpdtolerance_577P,mean_outstandingdebt_522A,mean_revolvingaccount_394A,max_approvaldate_319D,max_creationdate_885D,max_dateactivated_425D,max_dtlastpmt_581D,max_dtlastpmtallstes_3545839D,max_employedfrom_700D,max_firstnonzeroinstldate_307D,min_approvaldate_319D,min_creationdate_885D,min_dateactivated_425D,min_dtlastpmt_581D,min_dtlastpmtallstes_3545839D,min_employedfrom_700D,min_firstnonzeroinstldate_307D,first_approvaldate_319D,first_creationdate_885D,first_dateactivated_425D,first_dtlastpmt_581D,first_dtlastpmtallstes_3545839D,first_employedfrom_700D,first_firstnonzeroinstldate_307D,last_approvaldate_319D,last_creationdate_885D,last_dateactivated_425D,last_dtlastpmt_581D,last_dtlastpmtallstes_3545839D,last_employedfrom_700D,last_firstnonzeroinstldate_307D,mean_approvaldate_319D,mean_creationdate_885D,mean_dateactivated_425D,mean_dtlastpmt_581D,mean_dtlastpmtallstes_3545839D,mean_employedfrom_700D,mean_firstnonzeroinstldate_307D,max_cancelreason_3545846M,max_education_1138M,max_postype_4733339M,max_rejectreason_755M,max_rejectreasonclient_4145042M,min_cancelreason_3545846M,min_education_1138M,min_postype_4733339M,min_rejectreason_755M,min_rejectreasonclient_4145042M,first_cancelreason_3545846M,first_education_1138M,first_postype_4733339M,first_rejectreason_755M,first_rejectreasonclient_4145042M,last_cancelreason_3545846M,last_education_1138M,last_postype_4733339M,last_rejectreason_755M,last_rejectreasonclient_4145042M,mode_cancelreason_3545846M,mode_education_1138M,mode_postype_4733339M,mode_rejectreason_755M,mode_rejectreasonclient_4145042M,max_byoccupationinc_3656910L,max_childnum_21L,max_credacc_status_367L,max_credacc_transactions_402L,max_credtype_587L,max_familystate_726L,max_inittransactioncode_279L,max_isbidproduct_390L,max_isdebitcard_527L,max_pmtnum_8L,max_status_219L,max_tenor_203L,min_byoccupationinc_3656910L,min_childnum_21L,min_credacc_status_367L,min_credacc_transactions_402L,min_credtype_587L,min_familystate_726L,min_inittransactioncode_279L,min_isbidproduct_390L,min_isdebitcard_527L,min_pmtnum_8L,min_status_219L,min_tenor_203L,first_childnum_21L,first_credtype_587L,first_familystate_726L,first_inittransactioncode_279L,first_isbidproduct_390L,first_isdebitcard_527L,first_pmtnum_8L,first_status_219L,first_tenor_203L,last_byoccupationinc_3656910L,last_childnum_21L,last_credacc_status_367L,last_credacc_transactions_402L,last_credtype_587L,last_familystate_726L,last_inittransactioncode_279L,last_isbidproduct_390L,last_pmtnum_8L,last_status_219L,last_tenor_203L,max_num_group1,min_num_group1,first_num_group1,last_num_group1,max_amount_4527230A,min_amount_4527230A,first_amount_4527230A,last_amount_4527230A,mean_amount_4527230A,max_recorddate_4527225D,min_recorddate_4527225D,first_recorddate_4527225D,last_recorddate_4527225D,mean_recorddate_4527225D,max_num_group1_1_3,min_num_group1_1_3,first_num_group1_1_3,last_num_group1_1_3,max_amount_4917619A,min_amount_4917619A,first_amount_4917619A,last_amount_4917619A,mean_amount_4917619A,max_deductiondate_4917603D,min_deductiondate_4917603D,first_deductiondate_4917603D,last_deductiondate_4917603D,mean_deductiondate_4917603D,max_num_group1_1_4,min_num_group1_1_4,first_num_group1_1_4,last_num_group1_1_4,max_pmtamount_36A,min_pmtamount_36A,first_pmtamount_36A,last_pmtamount_36A,mean_pmtamount_36A,max_processingdate_168D,min_processingdate_168D,first_processingdate_168D,last_processingdate_168D,mean_processingdate_168D,max_num_group1_1_5,min_num_group1_1_5,first_num_group1_1_5,last_num_group1_1_5,max_mainoccupationinc_384A,min_mainoccupationinc_384A,first_mainoccupationinc_384A,last_mainoccupationinc_384A,mean_mainoccupationinc_384A,max_birth_259D,max_empl_employedfrom_271D,min_birth_259D,min_empl_employedfrom_271D,first_birth_259D,first_empl_employedfrom_271D,last_birth_259D,mean_birth_259D,mean_empl_employedfrom_271D,max_education_927M,max_empladdr_district_926M,max_empladdr_zipcode_114M,max_language1_981M,min_education_927M,min_language1_981M,first_education_927M,first_language1_981M,last_education_927M,last_empladdr_district_926M,last_empladdr_zipcode_114M,last_language1_981M,mode_education_927M,mode_language1_981M,max_contaddr_matchlist_1032L,max_contaddr_smempladdr_334L,max_empl_employedtotal_800L,max_empl_industry_691L,max_familystate_447L,max_housetype_905L,max_incometype_1044T,max_personindex_1023L,max_persontype_1072L,max_persontype_792L,max_relationshiptoclient_415T,max_relationshiptoclient_642T,max_remitter_829L,max_role_1084L,max_safeguarantyflag_411L,max_sex_738L,max_type_25L,min_contaddr_matchlist_1032L,min_contaddr_smempladdr_334L,min_empl_employedtotal_800L,min_empl_industry_691L,min_familystate_447L,min_housetype_905L,min_incometype_1044T,min_personindex_1023L,min_persontype_1072L,min_persontype_792L,min_relationshiptoclient_415T,min_relationshiptoclient_642T,min_remitter_829L,min_safeguarantyflag_411L,min_sex_738L,min_type_25L,first_contaddr_matchlist_1032L,first_contaddr_smempladdr_334L,first_empl_employedtotal_800L,first_empl_industry_691L,first_familystate_447L,first_housetype_905L,first_incometype_1044T,first_personindex_1023L,first_persontype_1072L,first_persontype_792L,first_role_1084L,first_safeguarantyflag_411L,first_sex_738L,first_type_25L,last_contaddr_matchlist_1032L,last_contaddr_smempladdr_334L,last_incometype_1044T,last_personindex_1023L,last_persontype_1072L,last_persontype_792L,last_relationshiptoclient_415T,last_relationshiptoclient_642T,last_remitter_829L,last_role_1084L,last_safeguarantyflag_411L,last_sex_738L,last_type_25L,max_num_group1_1_7,min_num_group1_1_7,first_num_group1_1_7,last_num_group1_1_7,max_amount_416A,min_amount_416A,first_amount_416A,last_amount_416A,mean_amount_416A,max_openingdate_313D,min_openingdate_313D,first_openingdate_313D,last_openingdate_313D,mean_openingdate_313D,max_num_group1_1_8,min_num_group1_1_8,first_num_group1_1_8,last_num_group1_1_8,max_openingdate_857D,min_openingdate_857D,first_openingdate_857D,last_openingdate_857D,mean_openingdate_857D,max_num_group1_1_9,min_num_group1_1_9,first_num_group1_1_9,last_num_group1_1_9
564688,949221,56,0,2,6,,,,,-17655.0,1.0,1.0,1.0,3.0,1.0,a55475b1,a55475b1,a55475b1,3.0,1.0,3439d993,a55475b1,3.0,,,,,,,DEDUCTION_6,,14.0,,1.0,1.0,,,1872.2001,0.0,0.0,0.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,22464.6,COL,0.0,0.0,,,,,0.0,22464.6,SBA,0.0,,0.0,,,,0.0,,POS,0.0,False,,,,a55475b1,,,a55475b1,,a55475b1,a55475b1,,,a55475b1,a55475b1,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,12.0,0.0,0.0,0.0,22464.6,0.0,0.0,,,0.0,0.0,,FO,AL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,850.0,850.0,850.0,850.0,850.0,14.0,14.0,14.0,14.0,14.0,12.0,0.0,7.0,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,110000.0,110000.0,110000.0,,110000.0,-17655,-9878.0,-17655,-9878.0,-17655,-9878.0,,-17655,-9878.0,a55475b1,a55475b1,a55475b1,a55475b1,P33_146_175,P10_39_147,P33_146_175,P10_39_147,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,True,MORE_FIVE,AGRICULTURE,MARRIED,,SELFEMPLOYED,0.0,1.0,1.0,,,,EM,True,M,PRIMARY_MOBILE,False,True,MORE_FIVE,AGRICULTURE,MARRIED,,SELFEMPLOYED,0.0,1.0,1.0,,,,True,M,PHONE,False,True,MORE_FIVE,AGRICULTURE,MARRIED,,SELFEMPLOYED,0.0,1.0,1.0,CL,True,M,PRIMARY_MOBILE,,,,,1.0,,,,,EM,,,PHONE,1,0,0,1,,,,,,,,,,,,,,,,,,,,,,,
1342676,1936705,90,0,9,3,,,,250533.39,-23460.0,6.0,6.0,2.0,8.0,2.0,2fc785b2,6b2ae0fa,a55475b1,1.0,5.0,b6cabe76,a55475b1,8.0,,,,,,,,,,14.0,6.0,4.0,0.0,0.0,3157.4001,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,14.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31996.0,COL,0.0,0.0,-4556.0,-1757.0,-1721.0,66.0,0.0,31996.0,SBA,0.0,-1629.0,0.0,,,-3099.0,1.0,,POS,0.0,False,,-2359.0,-993.0,a55475b1,42000.0,-2361.0,P94_109_143,-1721.0,P100_96_175,P165_57_169,90000.0,-993.0,P99_56_166,P94_109_143,D,9000.0,0.0,0.0,21000.0,,,,75.4,0.0,-1782.0,17.0,0.0,0.0,0.0,0.0,0.0,60.0,,,,,,,2.0,39.0,0.0,0.0,0.0,0.0,29.0,18.0,0.0,21.0,39.0,15.0,28.0,18.0,39.0,16.0,16.0,6.0,6.0,11.0,11.0,11.0,24.0,20.0,39.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,OTHER,OTHER,0.41026,0.30769,0.51282,0.46154,0.35897,12.0,0.0,0.0,0.0,31996.0,0.0,1.0,0.0,0.0,0.0,128366.805,,FO,,,0.0,9866.8,,0.0,,,90000.0,0.0,0.0,40000.0,60.0,0.0,,0.0,2148.2,,0.0,,,27792.0,0.0,0.0,6200.0,0.0,0.0,,0.0,9866.8,0.0,90000.0,,0.0,40000.0,,,,0.0,2148.2,,0.0,,,27792.0,0.0,0.0,6200.0,,0.0,0.0,4316.400013,,0.0,,,49698.0,0.0,0.0,18725.0,30.0,0.0,,-2361.0,-993.0,-2359.0,-1629.0,-1629.0,-3508.0,-962.0,-3128.0,-3128.0,-3124.0,-1629.0,-1629.0,-3508.0,-3099.0,,-993.0,,,,,-962.0,,-3128.0,,,,,-3099.0,-2745.0,-2360.0,-2742.0,-1629.0,-1629.0,-3508.0,-2330.0,a55475b1,a55475b1,P46_145_78,a55475b1,a55475b1,P94_109_143,P97_36_170,P149_40_170,P99_56_166,P94_109_143,P94_109_143,a55475b1,P149_40_170,P99_56_166,P94_109_143,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,a55475b1,P97_36_170,P46_145_78,a55475b1,a55475b1,54675.0,0.0,,,COL,WIDOWED,POS,False,,36.0,K,36.0,25000.0,0.0,,,CAL,WIDOWED,CASH,False,,12.0,D,12.0,,COL,WIDOWED,POS,False,,12.0,D,12.0,25000.0,0.0,,,COL,WIDOWED,POS,False,18.0,D,18.0,7.0,0.0,0.0,7.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23000.0,23000.0,23000.0,23000.0,23000.0,-23460,,-23460,,-23460,,-23460.0,-23460,,a55475b1,a55475b1,a55475b1,P209_127_106,a55475b1,P209_127_106,a55475b1,P209_127_106,a55475b1,a55475b1,a55475b1,P209_127_106,a55475b1,P209_127_106,False,False,,,,OWNED,RETIRED_PENSIONER,0.0,1.0,1.0,,,,CL,True,F,PRIMARY_MOBILE,False,False,,,,OWNED,RETIRED_PENSIONER,0.0,1.0,1.0,,,,True,F,PRIMARY_MOBILE,False,False,,,,OWNED,RETIRED_PENSIONER,0.0,1.0,1.0,CL,True,F,PRIMARY_MOBILE,False,False,RETIRED_PENSIONER,0.0,1.0,1.0,,,,CL,True,F,PRIMARY_MOBILE,0,0,0,0,2792.086,2792.086,2792.086,2792.086,2792.086,-2359.0,-2359.0,-2359.0,-2359.0,-2359.0,0.0,0.0,0.0,0.0,-2359.0,-2359.0,-2359.0,-2359.0,-2359.0,0.0,0.0,0.0,0.0
740616,1334645,14,0,4,2,,,-9474.0,,-9474.0,2.0,2.0,1.0,4.0,2.0,a55475b1,6b2ae0fa,a55475b1,7.0,8.0,a7fcb6e5,a55475b1,4.0,,,,,6.0,13024.632,,14.0,,,6.0,8.0,0.0,49081.0,2121.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,,,22.0,5631.2,,90.0,5713.8003,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,19.0,0.0,30860.0,COL,0.0,0.0,-524.0,-179.0,-174.0,181.0,0.0,30860.0,SBA,0.0,,0.0,,-525.0,-810.0,0.0,,POS,0.0,False,,-417.0,-355.0,P100_96_175,22800.0,-419.0,P94_109_143,-174.0,P159_130_59,a55475b1,17980.0,-355.0,P94_109_143,P94_109_143,D,70000.0,0.0,0.0,1139200.0,,163.0,163.0,49250.06,92.0,-264.0,17.0,163.0,163.0,0.0,163.0,163.0,163.0,13393.601,,43541.4,,-7.0,,2.0,22.0,0.0,0.0,0.0,0.0,39.0,3.0,0.0,20.0,37.0,11.0,23.0,,,3.0,,0.0,,,2.0,,,19.0,8.0,,0.0,,0.0,,0.0,0.0,0.0,False,OTHER,OTHER,0.08333,0.33333,0.52778,0.41667,0.41667,18.0,0.0,0.0,0.0,30860.0,0.0,0.0,0.0,,0.0,116141.4,,FO,,,0.0,10491.2,,0.0,,,100000.0,0.0,0.0,100000.0,163.0,0.0,,0.0,2003.2001,,0.0,,,8398.0,0.0,0.0,30000.0,0.0,0.0,,0.0,2003.2001,0.0,17980.0,,0.0,100000.0,,,,0.0,2334.8,,0.0,,,41994.0,0.0,0.0,30000.0,112.0,0.0,0.0,4557.866683,,0.0,,,35528.333333,0.0,0.0,59333.333333,68.75,0.0,,-419.0,-355.0,-417.0,-90.0,-90.0,-875.0,-325.0,-841.0,-841.0,-839.0,-607.0,-607.0,-875.0,-810.0,,-355.0,,,,,-325.0,-841.0,-841.0,-839.0,-137.0,-137.0,-875.0,-810.0,-617.0,-544.0,-615.0,-314.0,-314.0,-875.0,-514.0,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,P94_109_143,P33_146_175,a55475b1,P94_109_143,P94_109_143,P94_109_143,a55475b1,a55475b1,P94_109_143,P94_109_143,a55475b1,P33_146_175,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,,0.0,,,COL,SINGLE,POS,False,,24.0,K,24.0,,0.0,,,CAL,SINGLE,CASH,False,,3.0,D,3.0,,COL,,POS,False,,11.0,D,11.0,,0.0,,,COL,SINGLE,POS,False,24.0,K,24.0,5.0,0.0,0.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2432.5,1663.22,2412.872,2432.5,2170.772033,8.0,-139.0,-19.0,-139.0,-66.0,5.0,0.0,1.0,5.0,70000.0,70000.0,70000.0,70000.0,70000.0,-9474,,-9474,,-9474,,-9474.0,-9474,,a55475b1,a55475b1,a55475b1,P209_127_106,a55475b1,P209_127_106,a55475b1,P209_127_106,a55475b1,a55475b1,a55475b1,P209_127_106,a55475b1,P209_127_106,False,False,,,,,PRIVATE_SECTOR_EMPLOYEE,0.0,1.0,1.0,,,,CL,True,F,PRIMARY_MOBILE,False,False,,,,,PRIVATE_SECTOR_EMPLOYEE,0.0,1.0,1.0,,,,True,F,PRIMARY_MOBILE,False,False,,,,,PRIVATE_SECTOR_EMPLOYEE,0.0,1.0,1.0,CL,True,F,PRIMARY_MOBILE,False,False,PRIVATE_SECTOR_EMPLOYEE,0.0,1.0,1.0,,,,CL,True,F,PRIMARY_MOBILE,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,
995276,1589305,41,0,10,5,,,,,-15265.0,2.0,2.0,0.0,5.0,2.0,a55475b1,6b2ae0fa,a55475b1,5.0,3.0,3439d993,a55475b1,5.0,,,,,,,DEDUCTION_6,,14.0,,2.0,3.0,0.0,77008.95,1247.8,0.0,0.0,0.0,0.0,2.0,1.0,1.0,-4.0,,-4.0,0.0,6417.4,,0.0,0.0,7489.4,,INSTANT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0,38198.0,REL,0.0,0.0,,,,1.0,0.0,20798.0,SBA,0.0,-180.0,,,,-515.0,1.0,20798.0,POS,,False,False,-539.0,-546.0,P53_45_92,58956.0,-546.0,a55475b1,,a55475b1,a55475b1,,,a55475b1,a55475b1,K,40000.0,0.0,0.0,27809.748,,0.0,,58956.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,22926.4,58956.0,47349.746,,-8.0,-8.0,1.0,12.0,0.0,0.0,1.0,0.0,12.0,10.0,0.0,13.0,0.0,0.0,14.0,12.0,13.0,10.0,10.0,4.0,4.0,4.0,4.0,4.0,13.0,0.0,13.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,OTHER,OTHER,0.76923,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,20798.0,0.0,0.0,0.0,0.0,0.0,77009.75,,FO,AL,,0.0,4916.6,,0.0,,,58956.0,0.0,0.0,40000.0,0.0,0.0,,0.0,4916.6,,0.0,,,58956.0,0.0,0.0,40000.0,0.0,0.0,,0.0,4916.6,0.0,58956.0,0.0,0.0,40000.0,0.0,0.0,,0.0,4916.6,,0.0,,,58956.0,0.0,0.0,40000.0,0.0,0.0,0.0,4916.6,,0.0,,,58956.0,0.0,0.0,40000.0,0.0,0.0,,-546.0,-546.0,-539.0,-180.0,-180.0,-1372.0,-515.0,-546.0,-546.0,-539.0,-180.0,-180.0,-1372.0,-515.0,-546.0,-546.0,-539.0,-180.0,-180.0,-1372.0,-515.0,-546.0,-546.0,-539.0,-180.0,-180.0,-1372.0,-515.0,-546.0,-546.0,-539.0,-180.0,-180.0,-1372.0,-515.0,a55475b1,P33_146_175,a55475b1,a55475b1,a55475b1,a55475b1,P33_146_175,a55475b1,a55475b1,a55475b1,a55475b1,P33_146_175,a55475b1,a55475b1,a55475b1,a55475b1,P33_146_175,a55475b1,a55475b1,a55475b1,a55475b1,P33_146_175,a55475b1,a55475b1,a55475b1,,5.0,,,COL,MARRIED,POS,False,,16.0,K,16.0,,5.0,,,COL,MARRIED,POS,False,,16.0,K,16.0,5.0,COL,MARRIED,POS,False,,16.0,K,16.0,,5.0,,,COL,MARRIED,POS,False,16.0,K,16.0,0.0,0.0,0.0,0.0,2415.4001,222.8,1098.0,222.8,1615.700062,14.0,14.0,14.0,14.0,14.0,7.0,0.0,0.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16000.0,16000.0,16000.0,,16000.0,-15265,-1372.0,-15265,-1372.0,-15265,-1372.0,,-15265,-1372.0,a55475b1,a55475b1,a55475b1,a55475b1,P33_146_175,P10_39_147,P33_146_175,P10_39_147,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,False,False,MORE_FIVE,OTHER,MARRIED,,SALARIED_GOVT,1.0,4.0,4.0,SPOUSE,SPOUSE,False,PE,True,F,PRIMARY_MOBILE,False,False,MORE_FIVE,OTHER,MARRIED,,SALARIED_GOVT,0.0,1.0,1.0,SPOUSE,SPOUSE,False,True,F,PHONE,False,False,MORE_FIVE,OTHER,MARRIED,,SALARIED_GOVT,0.0,1.0,1.0,CL,True,F,PRIMARY_MOBILE,,,,,4.0,,,SPOUSE,,PE,,,PHONE,2,0,0,2,,,,,,,,,,,,,,,,,,,,,,,
1309958,1903987,86,0,8,2,,,,0.0,-13051.0,0.0,0.0,0.0,2.0,0.0,2fc785b2,6b2ae0fa,a55475b1,1.0,3.0,a7fcb6e5,a55475b1,2.0,,,,,,,,,,14.0,3.0,4.0,0.0,0.0,5631.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63598.0,COL,0.0,0.0,-4527.0,,,0.0,0.0,63598.0,SBA,0.0,-3059.0,0.0,,-1668.0,-3125.0,1.0,,POS,0.0,False,,-3136.0,-3156.0,P52_56_90,3838.0,-3156.0,a55475b1,,a55475b1,a55475b1,,,a55475b1,a55475b1,K,10000.0,0.0,0.0,5476.4,,,,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,2.0,6.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,6.0,0.0,0.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,OTHER,OTHER,1.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,63598.0,0.0,0.0,0.0,0.0,0.0,4695.2,,FO,,,0.0,782.8,,0.0,,,3838.0,0.0,960.0,10000.0,0.0,0.0,,0.0,782.8,,0.0,,,3838.0,0.0,960.0,10000.0,0.0,0.0,,0.0,782.8,0.0,3838.0,0.0,960.0,10000.0,0.0,0.0,,0.0,782.8,,0.0,,,3838.0,0.0,960.0,10000.0,0.0,0.0,0.0,782.8,,0.0,,,3838.0,0.0,960.0,10000.0,0.0,0.0,,-3156.0,-3156.0,-3136.0,,,-5155.0,-3125.0,-3156.0,-3156.0,-3136.0,,,-5155.0,-3125.0,-3156.0,-3156.0,-3136.0,,,-5155.0,-3125.0,-3156.0,-3156.0,-3136.0,,,-5155.0,-3125.0,-3156.0,-3156.0,-3136.0,,,-5155.0,-3125.0,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,a55475b1,P97_36_170,P177_117_192,a55475b1,a55475b1,,0.0,,,COL,SINGLE,POS,False,,6.0,K,6.0,,0.0,,,COL,SINGLE,POS,False,,6.0,K,6.0,0.0,COL,SINGLE,POS,False,,6.0,K,6.0,,0.0,,,COL,SINGLE,POS,False,6.0,K,6.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,51292.402,644.2,29893.8,5697.6,17594.216833,8.0,-146.0,-146.0,-138.0,-80.0,11.0,0.0,0.0,1.0,,,,,,,,,,,,,,,50000.0,50000.0,50000.0,50000.0,50000.0,-13051,-5155.0,-13051,-5155.0,-13051,-5155.0,-13051.0,-13051,-5155.0,a55475b1,a55475b1,a55475b1,P10_39_147,a55475b1,P10_39_147,a55475b1,P10_39_147,a55475b1,a55475b1,a55475b1,P10_39_147,a55475b1,P10_39_147,False,False,,,,PARENTAL,SALARIED_GOVT,0.0,1.0,1.0,,,,CL,True,F,PRIMARY_MOBILE,False,False,,,,PARENTAL,SALARIED_GOVT,0.0,1.0,1.0,,,,True,F,PRIMARY_MOBILE,False,False,,,,PARENTAL,SALARIED_GOVT,0.0,1.0,1.0,CL,True,F,PRIMARY_MOBILE,False,False,SALARIED_GOVT,0.0,1.0,1.0,,,,CL,True,F,PRIMARY_MOBILE,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,


In [66]:
test_df['target'].head(10), ypred[:10]

(564688     0
 1342676    0
 740616     0
 995276     0
 1309958    0
 194279     0
 874456     0
 1221594    0
 450319     0
 736633     0
 Name: target, dtype: int64,
 array([0.01155312, 0.01155312, 0.01155312, 0.01155312, 0.01155312,
        0.01155312, 0.01155312, 0.01155312, 0.01155312, 0.01155312]))

In [73]:
test_df['target'].sum()

9599

In [72]:
np.unique(ypred, return_counts=True)

(array([0.01155312, 0.7784853 ]), array([295733,   9599]))

In [170]:
xdf = train_df[['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 
                 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 
                 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 
                 'inittransactionamount_650A','lastapprcredamount_781A', 'lastrejectcredamount_222A', 'maininc_215A', 'maxannuity_159A',
                 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A',
                 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'price_1097A', 
                 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 
                 'totinstallast1m_4525188A', 'description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 
                 'maritalst_893M', 'pmtaverage_3A', 'pmtaverage_4527227A', 'pmtssum_45A', 'target']]

In [171]:
tr_df, te_df = train_test_split(xdf, stratify=xdf['target'], test_size=0.2)
tr_df, va_df = train_test_split(tr_df, stratify=tr_df['target'], test_size=0.1)
trd = lgb.Dataset(tr_df[[x for x in tr_df.columns if x != 'target']], 
                  label=tr_df['target'], 
                  categorical_feature=['description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 'maritalst_893M']
                 )
ted = lgb.Dataset(te_df[[x for x in te_df.columns if x != 'target']], 
                  label=te_df['target'], 
                  categorical_feature=['description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 'maritalst_893M']
                 )
vad = lgb.Dataset(va_df[[x for x in va_df.columns if x != 'target']], 
                  label=va_df['target'], 
                  categorical_feature=['description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 'maritalst_893M']
                 )

In [172]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

chck = lgb.train(params, trd, valid_sets=vad, callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)])



Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.693382
[100]	valid_0's auc: 0.707502
[150]	valid_0's auc: 0.713174
[200]	valid_0's auc: 0.716931
[250]	valid_0's auc: 0.720201
[300]	valid_0's auc: 0.72282
[350]	valid_0's auc: 0.724252
[400]	valid_0's auc: 0.726079
[450]	valid_0's auc: 0.727473
[500]	valid_0's auc: 0.728876
[550]	valid_0's auc: 0.729794
[600]	valid_0's auc: 0.730611
[650]	valid_0's auc: 0.731657
[700]	valid_0's auc: 0.732023
[750]	valid_0's auc: 0.732569
Early stopping, best iteration is:
[764]	valid_0's auc: 0.73284


In [173]:
ypred = chck.predict(te_df[[x for x in te_df.columns if x != 'target']])

In [174]:
roc_auc_score(te_df['target'], ypred)

0.735416318264719

In [86]:
tr_df.iloc[:, :-1]

Unnamed: 0,dateofbirth_337D,education_1103M,credamount_770A,annuity_780A,price_1097A
1462243,-26397.0,a55475b1,70000.0,7953.4000,0.0
312003,-17372.0,a55475b1,27278.0,2273.2000,27278.0
978938,-15562.0,a55475b1,28898.0,2408.2000,38898.0
933185,-14129.0,a55475b1,27452.4,2287.8000,27452.4
631232,-24263.0,6b2ae0fa,5980.0,1184.6000,5980.0
...,...,...,...,...,...
929628,-17691.0,a55475b1,27650.0,1536.2001,27650.0
265955,,a55475b1,17978.0,1665.8000,17978.0
38092,,a55475b1,30000.0,3536.2000,
1376314,-24366.0,6b2ae0fa,60000.0,11766.2000,0.0


In [84]:
from sklearn.linear_model import SGDClassifier

In [114]:
sgd = SGDClassifier(max_iter=1000, loss="log_loss", class_weight="balanced")

In [119]:
sgd.fit(tr_df[[x for x in tr_df.columns if x not in ['description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 
                 'maritalst_893M', 'target']]].fillna(0), tr_df['target'])

In [120]:
pred = sgd.predict(te_df[[x for x in tr_df.columns if x not in ['description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 
                 'maritalst_893M', 'target']]].fillna(0))

In [121]:
roc_auc_score(te_df['target'], pred)

0.5231452060506899

In [146]:
f1_score(te_df['target'], ypred)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [124]:
accuracy_score(te_df['target'], pred)

0.8189924099137825