In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [2]:
class Pipeline:

    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
        return df

    def handle_dates(df):
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))  #!!?
                df = df.with_columns(pl.col(col).dt.total_days()) # t - t-1
        df = df.drop("date_decision", "MONTH")
        return df

    def filter_cols(df):
        for col in df.columns:
            if col not in ["target", "case_id", "WEEK_NUM"]:
                isnull = df[col].is_null().mean()
                if isnull > 0.7:
                    df = df.drop(col)
        
        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.String):
                freq = df[col].n_unique()
                if (freq == 1) | (freq > 200):
                    df = df.drop(col)
        
        return df


class Aggregator:
    @staticmethod
    def num_expr(df):
        # 筛选出列名以"P"或"A"结尾的数值列
        cols = [col for col in df.columns if col[-1] in ("P", "A")]

        # 为每个数值列构建一个计算最大值的聚合表达式
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]

        #得到的结果是一个列表，这个列表中的每个元素都是一个Polars表达式。
        #这些表达式定义了对原始数据框架中的某些列进行聚合操作（在这个例子中是计算最大值）并为结果赋予新的列名。
        #列表中的每个表达式对应于原数据中的一个列，
        #执行聚合操作后，每个表达式会生成一个新的列，列名为原列名前加上"max_"前缀。

        return expr_max

    @staticmethod
    def date_expr(df):
        # 筛选出列名以"D"结尾的日期列
        cols = [col for col in df.columns if col[-1] in ("D",)]

        # 为每个日期列构建一个计算最大值的聚合表达式
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def str_expr(df):
        # 筛选出列名以"M"结尾的字符串列
        cols = [col for col in df.columns if col[-1] in ("M",)]

        # 为每个字符串列构建一个计算最大值的聚合表达式
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def other_expr(df):
        # 筛选出列名以"T"或"L"结尾的其他类型列
        cols = [col for col in df.columns if col[-1] in ("T", "L")]

        # 为这些其他类型列构建一个计算最大值的聚合表达式
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def count_expr(df):
        # 筛选出列名中包含"num_group"的列，这些列可能涉及分组计数或索引
        cols = [col for col in df.columns if "num_group" in col]
        # 为这些分组计数或索引列构建一个计算最大值的聚合表达式
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def get_exprs(df):
        # 组合上述所有类型的聚合表达式
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)
        # 返回包含所有聚合表达式的列表
        return exprs

def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    if depth in [1,2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df)) 
    return df

def read_files(regex_path, depth=None):
    chunks = []
    
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        chunks.append(df)
    
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base

def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")

TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

In [4]:
%%time
data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2)
    ]
}

CPU times: user 4min 38s, sys: 1min 39s, total: 6min 17s
Wall time: 2min 14s


In [5]:
%%time
df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)
del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols)
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
print("train data shape:\t", df_train.shape)
nums=df_train.select_dtypes(exclude='category').columns

from itertools import combinations, permutations
#df_train=df_train[nums]
nans_df = df_train[nums].isna()
nans_groups={}
for col in nums:
    cur_group = nans_df[col].sum()
    try:
        nans_groups[cur_group].append(col)
    except:
        nans_groups[cur_group]=[col]
del nans_df; x=gc.collect()

def reduce_group(grps):
    use = []
    for g in grps:
        mx = 0; vx = g[0]
        for gg in g:
            n = df_train[gg].nunique()
            if n>mx:
                mx = n
                vx = gg
            #print(str(gg)+'-'+str(n),', ',end='')
        use.append(vx)
        #print()
    print('Use these',use)
    return use

def group_columns_by_correlation(matrix, threshold=0.8):
    # 计算列之间的相关性
    correlation_matrix = matrix.corr()

    # 分组列
    groups = []
    remaining_cols = list(matrix.columns)
    while remaining_cols:
        col = remaining_cols.pop(0)
        group = [col]
        correlated_cols = [col]
        for c in remaining_cols:
            if correlation_matrix.loc[col, c] >= threshold:
                group.append(c)
                correlated_cols.append(c)
        groups.append(group)
        remaining_cols = [c for c in remaining_cols if c not in correlated_cols]
    
    return groups

uses=[]
for k,v in nans_groups.items():
    if len(v)>1:
            Vs = nans_groups[k]
            #cross_features=list(combinations(Vs, 2))
            #make_corr(Vs)
            grps= group_columns_by_correlation(df_train[Vs], threshold=0.8)
            use=reduce_group(grps)
            uses=uses+use
            #make_corr(use)
    else:
        uses=uses+v
    print('####### NAN count =',k)
print(uses)
print(len(uses))
uses=uses+list(df_train.select_dtypes(include='category').columns)
print(len(uses))
df_train=df_train[uses]

train data shape:	 (1526659, 487)
Memory usage of dataframe is 3079.35 MB
Memory usage after optimization is: 1043.95 MB
Decreased by 66.1%
train data shape:	 (1526659, 327)
Use these ['case_id', 'WEEK_NUM', 'target', 'month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'isbidproduct_1095L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationinc_384A', 'max_birth_259D

In [6]:
sample = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv")
device='gpu'
#n_samples=200000
n_est=6000
DRY_RUN = True if sample.shape[0] == 10 else False   
if DRY_RUN:
    device='cpu'
    df_train = df_train.iloc[:60000]
    #n_samples=10000
    n_est=600
print(device)

cpu


In [7]:
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        read_file(TEST_DIR / "test_applprev_2.parquet", 2),
        read_file(TEST_DIR / "test_person_2.parquet", 2)
    ]
}

In [8]:
df_test = feature_eng(**data_store)
print("test data shape:\t", df_test.shape)
del data_store
gc.collect()
df_test = df_test.select([col for col in df_train.columns if col != "target"])
print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)

gc.collect()

test data shape:	 (10, 486)
train data shape:	 (60000, 291)
test data shape:	 (10, 290)
Memory usage of dataframe is 0.03 MB
Memory usage after optimization is: 0.01 MB
Decreased by 43.8%


0

### Feature Selection

In [9]:
y = df_train["target"]
weeks = df_train["WEEK_NUM"]
df_train= df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)


In [10]:
df_train[cat_cols] = df_train[cat_cols].astype(str)
df_test[cat_cols] = df_test[cat_cols].astype(str)

In [11]:
df_train

Unnamed: 0,month_decision,weekday_decision,credamount_770A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_867L,clientscnt_1022L,clientscnt_100L,...,max_collater_typofvalofguarant_407M,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_subjectroles_name_541M,max_subjectroles_name_838M,max_cacccardblochreas_147M,max_conts_type_509L,max_conts_role_79M,max_empls_economicalst_849M,max_empls_employer_name_740M
0,1,4,30000.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1,4,19999.800781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,1,5,78000.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,PRIMARY_MOBILE,,,
3,1,4,40000.000000,0.0,1.0,0.0,2.0,1.0,0.0,0.0,...,,,,,,,PRIMARY_MOBILE,,,
4,1,5,44000.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,PRIMARY_MOBILE,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,1,6,60000.000000,0.0,1.0,0.0,0.0,8.0,0.0,0.0,...,a55475b1,a55475b1,c7a5ad39,a55475b1,ab3c25cf,a55475b1,PRIMARY_MOBILE,,,
59996,1,4,150000.000000,0.0,2.0,0.0,182.0,17.0,0.0,0.0,...,a55475b1,a55475b1,c7a5ad39,a55475b1,ab3c25cf,a55475b1,PRIMARY_MOBILE,a55475b1,a55475b1,a55475b1
59997,1,4,51400.000000,0.0,0.0,0.0,96.0,1.0,0.0,0.0,...,a55475b1,a55475b1,c7a5ad39,a55475b1,ab3c25cf,a55475b1,PRIMARY_MOBILE,a55475b1,a55475b1,a55475b1
59998,1,4,100000.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,a55475b1,a55475b1,c7a5ad39,a55475b1,ab3c25cf,a55475b1,PRIMARY_MOBILE,a55475b1,a55475b1,a55475b1


In [12]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 2000,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device, 
    "verbose": -1,
}

params2 = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "alpha": 0.1,  
    "lambda": 10,  
    "tree_method": 'gpu_hist' if device == 'gpu' else 'auto',
    "random_state": 42,
    "verbosity": 0,
    "enable_categorical":True,
}


In [13]:
%time
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

fitted_models_cat = []
fitted_models_lgb = []
fitted_models_xgb = []

cv_scores_cat = []
cv_scores_lgb = []
cv_scores_xgb = []


for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):#
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]# 
    X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]
    train_pool = Pool(X_train, y_train,cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid,cat_features=cat_cols)
    clf = CatBoostClassifier(
    eval_metric='AUC',
    task_type='GPU',
    learning_rate=0.03,
    iterations=n_est)
    random_seed=3107
    clf.fit(train_pool, eval_set=val_pool,verbose=300)
    fitted_models_cat.append(clf)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)
    
    
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")
    
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)] )
    
    fitted_models_lgb.append(model)
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)
    
    
    model2 = xgb.XGBClassifier(**params2)
    model2.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100, verbose=False)
    
    fitted_models_xgb.append(model2)
    
    y_pred_valid = model2.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_xgb.append(auc_score)
    
    del clf, model, model2
    gc.collect()
    
    
print("CV AUC scores: ", cv_scores_cat)
print("Maximum CV AUC score: ", max(cv_scores_cat))


print("CV AUC scores: ", cv_scores_lgb)
print("Maximum CV AUC score: ", max(cv_scores_lgb))

print("CV AUC scores: ", cv_scores_xgb)
print("Maximum CV AUC score: ", max(cv_scores_xgb))

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.11 µs


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5871537	best: 0.5871537 (0)	total: 282ms	remaining: 2m 49s
300:	test: 0.7340600	best: 0.7340600 (300)	total: 25.8s	remaining: 25.6s
599:	test: 0.7395892	best: 0.7396709 (585)	total: 51s	remaining: 0us
bestTest = 0.7396708727
bestIteration = 585
Shrink model to first 586 iterations.
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.73317
Early stopping, best iteration is:
[118]	valid_0's auc: 0.737427


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5744945	best: 0.5744945 (0)	total: 132ms	remaining: 1m 19s
300:	test: 0.7314073	best: 0.7314073 (300)	total: 25.5s	remaining: 25.3s
599:	test: 0.7377592	best: 0.7377592 (599)	total: 51s	remaining: 0us
bestTest = 0.7377591729
bestIteration = 599
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.754726
Early stopping, best iteration is:
[210]	valid_0's auc: 0.755974


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6124578	best: 0.6124578 (0)	total: 132ms	remaining: 1m 19s
300:	test: 0.7458181	best: 0.7458532 (295)	total: 25.6s	remaining: 25.4s
599:	test: 0.7528136	best: 0.7528136 (599)	total: 51s	remaining: 0us
bestTest = 0.7528135777
bestIteration = 599
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.76223
Early stopping, best iteration is:
[188]	valid_0's auc: 0.763261


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5845017	best: 0.5845017 (0)	total: 139ms	remaining: 1m 23s
300:	test: 0.7488940	best: 0.7489061 (290)	total: 25.5s	remaining: 25.3s
599:	test: 0.7534959	best: 0.7534959 (599)	total: 50.8s	remaining: 0us
bestTest = 0.7534959316
bestIteration = 599
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's auc: 0.756719


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5605085	best: 0.5605085 (0)	total: 135ms	remaining: 1m 21s
300:	test: 0.7424276	best: 0.7424276 (300)	total: 24.9s	remaining: 24.7s
599:	test: 0.7444045	best: 0.7452073 (470)	total: 49.9s	remaining: 0us
bestTest = 0.7452073395
bestIteration = 470
Shrink model to first 471 iterations.
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.754071
Early stopping, best iteration is:
[176]	valid_0's auc: 0.755354
CV AUC scores:  [0.7396711182370999, 0.7377587681779298, 0.7528133766271113, 0.7534956973761854, 0.7452077741111286]
Maximum CV AUC score:  0.7534956973761854
CV AUC scores:  [0.7374266490254993, 0.7559741637881561, 0.7632610071032122, 0.7567190304333101, 0.7553542319229034]
Maximum CV AUC score:  0.7632610071032122
CV AUC scores:  [0.6978475386009164, 0.7454186744052582, 0.7237435610526801, 0.7406745146361797, 0.7320110034405949]
Maximum CV AUC score:  0.7454186744052582


In [14]:
#del y 
#del weeks
#del df_train

#gc.collect()

In [15]:
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators[:5]]
        
        X[cat_cols] = X[cat_cols].astype("category")
        y_preds += [estimator.predict_proba(X) for estimator in self.estimators[5:10]]
        y_preds+=y_preds #tang trong so
        y_preds += [estimator.predict_proba(X) for estimator in self.estimators[10:]]
        print(len(y_preds))
        return np.mean(y_preds, axis=0)

model = VotingModel(fitted_models_cat+fitted_models_lgb+fitted_models_xgb)

In [16]:
from sklearn.model_selection import cross_val_predict
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd

# 第一层模型
models = [
    ('CatBoost', CatBoostClassifier(eval_metric='AUC', task_type='GPU', learning_rate=0.03, iterations=n_est, random_seed=3107)),
    ('LightGBM', LGBMClassifier(**params)),
    ('XGBoost', XGBClassifier(**params2))
]

# 第二层模型
from sklearn.ensemble import GradientBoostingClassifier

params = {
    'n_estimators': 12,
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_samples_split': 3,
    'min_samples_leaf': 1
}

meta_model = GradientBoostingClassifier(**params)



# 存储第一层模型和相应的AUC分数
fitted_models_cb = []
fitted_models_lgb = []
fitted_models_xgb = []
cv_scores_cb = []
cv_scores_lgb = []
cv_scores_xgb = []

# 存储第一层模型的预测结果
meta_features = pd.DataFrame(index=df_train.index, columns=['CatBoost', 'LightGBM', 'XGBoost'])

for name, model in models:
    for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):
        X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]

        if name == 'CatBoost':
            X_train[cat_cols] = X_train[cat_cols].astype(str)
            X_valid[cat_cols] = X_valid[cat_cols].astype(str)
            train_pool = Pool(X_train, y_train, cat_features=cat_cols)
            val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)
            model.fit(train_pool, eval_set=val_pool, verbose=False)
            y_pred_valid = model.predict_proba(val_pool)[:, 1]
            fitted_models_cb.append(model)
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_cb.append(auc_score)
        elif name == 'LightGBM':
            X_train[cat_cols] = X_train[cat_cols].astype('category')
            X_valid[cat_cols] = X_valid[cat_cols].astype('category')
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)])
            fitted_models_lgb.append(model)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_lgb.append(auc_score)
        else:  # XGBoost
            X_train[cat_cols] = X_train[cat_cols].astype('category')
            X_valid[cat_cols] = X_valid[cat_cols].astype('category')
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=False)
            fitted_models_xgb.append(model)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_xgb.append(auc_score)

        meta_features.loc[X_valid.index, name] = y_pred_valid

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.73317
Early stopping, best iteration is:
[118]	valid_0's auc: 0.737427
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.754726
Early stopping, best iteration is:
[210]	valid_0's auc: 0.755974
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.76223
Early stopping, best iteration is:
[188]	valid_0's auc: 0.763261
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[66]	valid_0's auc: 0.756719
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.754071
Early stopping, best iteration is:
[176]	valid_0's auc: 0.755354


In [17]:
meta_model.fit(meta_features, y)

# Submision

In [18]:
df_test = df_test.drop(columns=["WEEK_NUM"])
df_test = df_test.set_index("case_id")

In [19]:
test_meta_features = pd.DataFrame(index=df_test.index, columns=['CatBoost', 'LightGBM', 'XGBoost'])

In [20]:
# CatBoost


for model in fitted_models_cat:
    df_test[cat_cols] = df_test[cat_cols].astype(str)
    y_pred_test = model.predict_proba(df_test)[:, 1]
    test_meta_features['CatBoost'] = test_meta_features['CatBoost'].add(y_pred_test, fill_value=0)

test_meta_features['CatBoost'] /= len(fitted_models_cat)

# LightGBM
for model in fitted_models_lgb:
    df_test[cat_cols] = df_test[cat_cols].astype("category")
    y_pred_test = model.predict_proba(df_test)[:, 1]
    test_meta_features['LightGBM'] = test_meta_features['LightGBM'].add(y_pred_test, fill_value=0)

test_meta_features['LightGBM'] /= len(fitted_models_lgb)

# XGBoost
for model in fitted_models_xgb:
    df_test[cat_cols] = df_test[cat_cols].astype("category")
    y_pred_test = model.predict_proba(df_test)[:, 1]
    test_meta_features['XGBoost'] = test_meta_features['XGBoost'].add(y_pred_test, fill_value=0)

test_meta_features['XGBoost'] /= len(fitted_models_xgb)

In [21]:
test_meta_features

Unnamed: 0_level_0,CatBoost,LightGBM,XGBoost
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
57543,0.017658,0.010798,0.022443
57549,0.044541,0.013334,0.044943
57551,0.011901,0.008432,0.026412
57552,0.033014,0.022246,0.032758
57569,0.031007,0.019545,0.056897
57630,0.023569,0.010488,0.032659
57631,0.040524,0.032941,0.077145
57632,0.037841,0.01999,0.081421
57633,0.02575,0.022329,0.057393
57634,0.042159,0.027278,0.091284


In [22]:
y_pred = pd.Series(meta_model.predict_proba(test_meta_features)[:, 1], index=df_test.index)

In [23]:
df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")

df_subm["score"] = y_pred
df_subm.to_csv("submission.csv")
df_subm

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.015812
57549,0.024676
57551,0.015351
57552,0.031057
57569,0.024327
57630,0.019005
57631,0.036704
57632,0.032093
57633,0.028421
57634,0.032093
