In [None]:
# Home Credit Ensemble Model

In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pickle as pkl
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier, DMatrix
from xgboost.callback import EarlyStopping
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings('ignore')

# Dataset root. Kept as a Path (not a plain string) because later cells build
# file locations with the `/` operator, e.g. ROOT / "sample_submission.csv".
ROOT = Path('/kaggle/input/home-credit-credit-risk-model-stability')

ModuleNotFoundError: No module named 'imblearn'

In [2]:
class Pipeline:
    """Stateless polars helpers: dtype normalisation, date handling, column pruning.

    All methods are static so they can be used both as ``Pipeline.f(df)`` and
    through ``df.pipe(Pipeline.f)``.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype chosen from the column name.

        * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
        * ``date_decision`` and names ending in ``D`` -> Date
        * names ending in ``P`` or ``A`` -> Float64
        * names ending in ``M`` -> String

        Columns matching none of the rules are left untouched.
        Returns a new DataFrame.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.String))
            elif col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
        # One with_columns call instead of one per column: same result, but the
        # frame is rebuilt only once.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Replace every ``*D`` column by its offset in days from ``date_decision``.

        After the conversion the ``date_decision`` and ``MONTH`` columns are
        dropped; both must be present in ``df``.
        """
        date_cols = [col for col in df.columns if col.endswith("D")]
        if date_cols:
            df = df.with_columns(
                [
                    # (date - decision date) is a duration; keep it as whole days.
                    (pl.col(col) - pl.col("date_decision")).dt.total_days().alias(col)
                    for col in date_cols
                ]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df):
        """Drop columns that are mostly null or uninformative strings.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. Any other
        column is removed when more than 95% of its values are null, and any
        remaining String column is removed when it has exactly one distinct
        value or more than 200 distinct values.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        mostly_null = [
            col
            for col in df.columns
            if col not in protected and df[col].is_null().mean() > 0.95
        ]
        if mostly_null:
            df = df.drop(mostly_null)

        # Cardinality is checked only on the columns that survived the null filter.
        bad_strings = []
        for col in df.columns:
            if col not in protected and df[col].dtype == pl.String:
                freq = df[col].n_unique()
                if freq == 1 or freq > 200:
                    bad_strings.append(col)
        if bad_strings:
            df = df.drop(bad_strings)

        return df



class Aggregator:
    """Builds polars aggregation expressions, selected by column-name suffix.

    The expressions are meant to be passed to ``group_by(...).agg(...)``.
    Each output column is named ``<stat>_<original column>``. Add or remove
    statistics as needed, but note that every extra statistic multiplies the
    number of resulting feature columns.
    """

    @staticmethod
    def num_expr(df):
        """max / last / mean for columns whose name ends in ``P`` or ``A``."""
        cols = [col for col in df.columns if col.endswith(("P", "A"))]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max + expr_last + expr_mean

    @staticmethod
    def date_expr(df):
        """max / last / mean for columns whose name ends in ``D``."""
        cols = [col for col in df.columns if col.endswith("D")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max + expr_last + expr_mean

    @staticmethod
    def str_expr(df):
        """max / last for columns whose name ends in ``M``."""
        cols = [col for col in df.columns if col.endswith("M")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        return expr_max + expr_last

    @staticmethod
    def other_expr(df):
        """max / last for columns whose name ends in ``T`` or ``L``."""
        cols = [col for col in df.columns if col.endswith(("T", "L"))]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        return expr_max + expr_last

    @staticmethod
    def count_expr(df):
        """max / last for columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        return expr_max + expr_last

    @staticmethod
    def get_exprs(df):
        """Return all expressions: numeric, date, string, other, then count."""
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

In [3]:
def read_file(path, depth=None):
    """Read one parquet file and normalise its dtypes.

    When ``depth`` is 1 or 2 the table is additionally collapsed to one row
    per ``case_id`` using the expressions from ``Aggregator.get_exprs``.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return table
    return table.group_by("case_id").agg(Aggregator.get_exprs(table))

def read_files(regex_path, depth=None):
    """Read every parquet file matching a glob pattern into one frame.

    Each matching file is loaded with ``read_file`` (same dtype handling and
    optional depth 1/2 aggregation), the pieces are concatenated with
    ``how="vertical_relaxed"`` and rows are de-duplicated on ``case_id``.

    Parameters
    ----------
    regex_path : str or Path
        Glob pattern (despite the name it is passed to ``glob``, not ``re``).
    depth : int, optional
        Forwarded to ``read_file``.

    Raises
    ------
    FileNotFoundError
        If the pattern matches no file. Without this check the failure
        surfaced later as an error from concatenating an empty list.
    """
    # Sorted so the concatenation order does not depend on filesystem listing order.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise FileNotFoundError(f"No parquet files match pattern: {regex_path}")

    chunks = [read_file(path, depth) for path in paths]

    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df


def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the modelling frame from the base table and the side tables.

    Adds ``month_decision`` and ``weekday_decision`` derived from
    ``date_decision``, left-joins every table of ``depth_0``, ``depth_1`` and
    ``depth_2`` (in that order) on ``case_id``, and finally applies
    ``Pipeline.handle_dates``. Clashing column names from the i-th joined
    table receive the suffix ``_<i>``.
    """
    merged = df_base.with_columns(
        pl.col("date_decision").dt.month().alias("month_decision"),
        pl.col("date_decision").dt.weekday().alias("weekday_decision"),
    )
    side_tables = [*depth_0, *depth_1, *depth_2]
    for idx, table in enumerate(side_tables):
        merged = merged.join(table, on="case_id", how="left", suffix=f"_{idx}")
    return Pipeline.handle_dates(merged)


def to_pandas(df_data, cat_cols=None):
    """Convert to pandas and cast the categorical columns to ``category``.

    ``df_data`` must expose ``.to_pandas()``. When ``cat_cols`` is omitted,
    every ``object`` column of the converted frame is treated as categorical.
    Returns ``(frame, cat_cols)`` so the same column list can be reused for
    another frame.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols


def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns to the smallest dtype that holds their range.

    The frame is modified in place and also returned. Memory usage before and
    after is printed.

    Only columns with a plain NumPy signed-integer, unsigned-integer or float
    dtype are touched. Everything else (category, object, bool, datetime,
    pandas extension dtypes, ...) is left as is. Earlier, any non-``int``
    dtype fell through to the float branch, which turned unsigned and boolean
    columns into float16 and raised on datetime columns.

    Parameters
    ----------
    df : pandas.DataFrame
    use_float16 : bool, default True
        When False, float columns are never reduced below float32. float16
        keeps only about three significant decimal digits.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_targets = (np.int8, np.int16, np.int32, np.int64)
    unsigned_targets = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "u", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "i":
            # Bounds are strict (value must lie strictly inside the range),
            # matching the historical behaviour of this helper.
            for target in signed_targets:
                info = np.iinfo(target)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type.kind == "u":
            # Unsigned values are never below zero, so only the upper bound matters.
            for target in unsigned_targets:
                if c_max < np.iinfo(target).max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                info = np.finfo(target)
                if c_min > info.min and c_max < info.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Also reached for all-NaN columns, where both comparisons are False.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

# Training tables, keyed the way feature_eng expects its keyword arguments.
# The second argument to read_file/read_files is the table depth; depth 1 and 2
# tables are aggregated to one row per case_id while being read.
data_store = {}
data_store["df_base"] = read_file(TRAIN_DIR / "train_base.parquet")
data_store["depth_0"] = [
    read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
    read_files(TRAIN_DIR / "train_static_0_*.parquet"),
]
data_store["depth_1"] = [
    read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
    read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
    read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
    read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
    read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
    read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
    read_file(TRAIN_DIR / "train_other_1.parquet", 1),
    read_file(TRAIN_DIR / "train_person_1.parquet", 1),
    read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
    read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
]
data_store["depth_2"] = [
    read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
    read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
    read_file(TRAIN_DIR / "train_person_2.parquet", 2),
]

In [5]:
# Join all tables onto the base frame, then release the per-table frames.
df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)
del data_store

# Drop mostly-null columns and uninformative string columns.
df_train = Pipeline.filter_cols(df_train)
print("train data shape:\t", df_train.shape)
gc.collect()


train data shape:	 (1526659, 861)
train data shape:	 (1526659, 472)


0

In [6]:
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
print("train data shape:\t", df_train.shape)
nums = df_train.select_dtypes(exclude='category').columns
from itertools import combinations, permutations

# Bucket the non-categorical columns by their number of missing values:
# nans_groups maps a NaN count to the list of columns having exactly that count.
nans_df = df_train[nums].isna()
nans_groups = {}
for col in nums:
    cur_group = nans_df[col].sum()
    # setdefault replaces a bare `except:` that was used to create missing
    # keys and would also have hidden any unrelated error.
    nans_groups.setdefault(cur_group, []).append(col)
del nans_df
x = gc.collect()


def reduce_group(grps, df=None):
    """Pick one representative column from each group of column names.

    For every group the column with the most distinct values (``nunique``)
    is chosen; on a tie the column listed first wins. The chosen names are
    printed and returned in group order.

    Parameters
    ----------
    grps : list of list of str
        Groups of column names; every group must be non-empty.
    df : pandas.DataFrame, optional
        Frame holding the columns. Defaults to the notebook-level
        ``df_train`` so existing calls keep working.
    """
    if df is None:
        df = df_train

    # max() returns the first maximal element, which gives the same tie-break
    # as scanning the group with a strict ">" comparison.
    use = [max(g, key=lambda name: df[name].nunique()) for g in grps]
    print('Use these', use)
    return use


def group_columns_by_correlation(matrix, threshold=0.8):
    """Greedily partition the columns of ``matrix`` into correlation groups.

    Columns are visited in their original order. The first unassigned column
    starts a new group and pulls in every later unassigned column whose
    correlation with it is at least ``threshold``. Returns a list of groups,
    each a list of column names.
    """
    # Pairwise correlation between all columns.
    corr = matrix.corr()

    groups = []
    pending = list(matrix.columns)
    while pending:
        anchor, *rest = pending
        members = [anchor] + [c for c in rest if corr.loc[anchor, c] >= threshold]
        groups.append(members)
        # Whatever joined this group is no longer available for later anchors.
        pending = [c for c in rest if c not in members]

    return groups


# For each NaN-count bucket keep one representative per correlation group.
uses = []
for k, v in nans_groups.items():
    if len(v) > 1:
        grps = group_columns_by_correlation(df_train[v], threshold=0.8)
        use = reduce_group(grps)
        uses = uses + use
    else:
        uses = uses + v
    print('####### NAN count =', k)

# The identifier, week and label columns take part in the correlation grouping
# above, so they could be discarded if they correlate with another column.
# Put back any that were lost; nothing changes when they all survived.
lost_keys = [
    c for c in ("case_id", "WEEK_NUM", "target")
    if c in df_train.columns and c not in uses
]
uses = lost_keys + uses

print(uses)
print(len(uses))
uses = uses + list(df_train.select_dtypes(include='category').columns)
print(len(uses))
df_train = df_train[uses]

Memory usage of dataframe is 4322.75 MB
Memory usage after optimization is: 1528.81 MB
Decreased by 64.6%
train data shape:	 (1526659, 472)
Use these ['case_id', 'WEEK_NUM', 'target', 'month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'isbidproduct_1095L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationinc_384A', 'max_birth_259D', 'max_num_group1_9']
####### NAN

In [7]:
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TEST_DIR = ROOT / "parquet_files" / "test"

# Test tables, read with the same layout and depths as the training tables.
data_store = {}
data_store["df_base"] = read_file(TEST_DIR / "test_base.parquet")
data_store["depth_0"] = [
    read_file(TEST_DIR / "test_static_cb_0.parquet"),
    read_files(TEST_DIR / "test_static_0_*.parquet"),
]
data_store["depth_1"] = [
    read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
    read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
    read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
    read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
    read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
    read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
    read_file(TEST_DIR / "test_other_1.parquet", 1),
    read_file(TEST_DIR / "test_person_1.parquet", 1),
    read_file(TEST_DIR / "test_deposit_1.parquet", 1),
    read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
]
data_store["depth_2"] = [
    read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
    read_file(TEST_DIR / "test_applprev_2.parquet", 2),
    read_file(TEST_DIR / "test_person_2.parquet", 2),
]

In [8]:
# Join all test tables onto the test base frame, then release them.
df_test = feature_eng(**data_store)
print("test data shape:\t", df_test.shape)
del data_store
gc.collect()

# Keep exactly the columns selected for training (minus the label), in the same order.
df_test = df_test.select([col for col in df_train.columns if col != "target"])
print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

# Reuse the categorical column list determined on the training frame.
df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)

gc.collect()

test data shape:	 (10, 860)
train data shape:	 (1526659, 389)
test data shape:	 (10, 388)
Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.02 MB
Decreased by 40.3%


0

In [9]:
#df_train.to_parquet('/kaggle/working/train_v2.parquet')

# Checkpoint

In [1]:
import joblib
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import pickle as pkl
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier, DMatrix
from xgboost.callback import EarlyStopping
import data_proc as dp
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the prepared training set and widen it with the v4 feature table.
base, X, y = dp.load_data('data/train_v3_filled_woe.parquet')
X4 = pd.read_parquet('data/train_v4.parquet')

# Index-aligned left join; afterwards every column is cast to float.
X = pd.merge(X, X4, how='left', left_index=True, right_index=True)
X = X.astype(float)

del X4
gc.collect()

0

In [4]:
# Any column still stored as object is converted to pandas' category dtype.
obj_cols = X.select_dtypes("object").columns.tolist()
X[obj_cols] = X[obj_cols].astype("category")

In [5]:
# Names of the categorical feature columns, reused by the training loop.
cat_cols = X.select_dtypes("category").columns.tolist()

In [6]:
# Sanity check: categorical + numeric column counts should equal the total column count.
(len(cat_cols) + len(X.select_dtypes(include=np.number).columns), len(X.columns))

(969, 969)

In [7]:
# Turn +/-inf into NaN first so that a single fillna zeroes every non-finite entry.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

In [9]:
#X = df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
#y = df_train["target"]
# Folds are grouped by WEEK_NUM so one week never appears in both train and validation.
weeks = base["WEEK_NUM"]
state = 42
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

lgb_params = {
    'num_leaves': 266,
    'min_data_in_leaf': 731,
    'learning_rate': 0.005901835737658873,
    'lambda_l1': 0.0010634148733296234,
    'lambda_l2': 5.835448638038832,
    'feature_fraction': 0.5106655665557869,
    'bagging_fraction': 0.7534811540507564,
    'bagging_freq': 4,
    'objective': 'binary',
    "metric": "auc",  # early stopping below monitors validation AUC
    'verbosity': -1,
    'device': 'gpu',
    'n_jobs': -1,
    'max_bin': 255,
    'n_estimators': 5000
}

fitted_models_xgb = []
fitted_models_lgb = []
fitted_models_cat = []
cv_scores_xgb = []
cv_scores_lgb = []
cv_scores_cat = []

cat_params = {
    'learning_rate': 0.07746171181807705,
    'l2_leaf_reg': 0.16856620630061742,
    'iterations': 5000,
    'bagging_temperature': 0.4939947354720309,
    'random_strength': 0.10365883602086806,
    'depth': 5,
    'min_data_in_leaf': 90,
    'objective': 'Logloss',
    'eval_metric': 'Logloss',  # auc and custom functions are not working on gpu
    'verbose': 50,
    'task_type': 'GPU',
    'thread_count': -1
}

# Only used by the disabled XGBoost step inside the loop.
xgb_params = {
    'n_estimators': 5000,
    'learning_rate': 0.02563139404535397,
    'max_depth': 17,
    'min_child_weight': 1446,
    'max_delta_step': 0,  # key was misspelled 'max_delta_setp' before
    'subsample': 0.8564298564731834,
    'colsample_bytree': 0.9235898601649664,
    'reg_lambda': 1.1289608466365559e-08,
    'reg_alpha': 4.911518931389214e-07,
    'gamma': 0.00035869889925144383,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',
    'enable_categorical': True,
    'verbosity': 0,
    'device': 'cuda',
    'n_jobs': -1,
}

# Create the output folder up front so joblib.dump cannot fail after a full fit.
os.makedirs('models', exist_ok=True)

for i, (idx_train, idx_valid) in enumerate(cv.split(X, y, groups=weeks)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    # LightGBM consumes the categorical columns as pandas 'category'.
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    # --- XGBoost step (disabled) -------------------------------------------
    # es = EarlyStopping(rounds=100,
    #                    maximize=True,
    #                    save_best=True,)
    # model = XGBClassifier(**xgb_params)
    # model.fit(X_train, y_train,
    #           eval_set=[(X_valid, y_valid)],
    #           eval_metric='auc',
    #           callbacks=[es],
    # )
    #
    # fitted_models_xgb.append(model)
    # joblib.dump(model, f'models/base95_2pipe_5cv_xgb_{i}.pkl')
    #
    # y_pred_valid = model.predict_proba(X_valid)[:,1]
    # auc_score = roc_auc_score(y_valid, y_pred_valid)
    # cv_scores_xgb.append(auc_score)
    # -----------------------------------------------------------------------

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(400), lgb.early_stopping(10)])
    fitted_models_lgb.append(model)
    joblib.dump(model, f'models/base95_2pipe_5cv_lgb_{i}.pkl')

    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)

    # CatBoost is given the categorical columns as plain strings.
    X_train[cat_cols] = X_train[cat_cols].astype(str)
    X_valid[cat_cols] = X_valid[cat_cols].astype(str)

    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf = CatBoostClassifier(**cat_params)

    clf.fit(train_pool, eval_set=val_pool, verbose=300)
    fitted_models_cat.append(clf)
    joblib.dump(clf, f'models/base95_2pipe_cat_{i}.pkl')

    y_pred_valid = clf.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)


print("CV AUC scores lgb: ", cv_scores_lgb)
print("Maximum CV AUC score lgb: ", max(cv_scores_lgb))
print("CV AUC scores cat: ", cv_scores_cat)
print("Maximum CV AUC score cat: ", max(cv_scores_cat))


Training until validation scores don't improve for 10 rounds
[400]	valid_0's auc: 0.843786
[800]	valid_0's auc: 0.851905
[1200]	valid_0's auc: 0.856075
[1600]	valid_0's auc: 0.858399
[2000]	valid_0's auc: 0.859612
[2400]	valid_0's auc: 0.860363
Early stopping, best iteration is:
[2503]	valid_0's auc: 0.860549
0:	learn: 0.5616642	test: 0.5610688	best: 0.5610688 (0)	total: 41.6ms	remaining: 3m 28s
300:	learn: 0.1115924	test: 0.1095894	best: 0.1095894 (300)	total: 8.01s	remaining: 2m 5s
600:	learn: 0.1089637	test: 0.1084868	best: 0.1084868 (600)	total: 15.4s	remaining: 1m 52s
900:	learn: 0.1070825	test: 0.1080491	best: 0.1080491 (900)	total: 22.6s	remaining: 1m 42s
1200:	learn: 0.1054461	test: 0.1077539	best: 0.1077535 (1199)	total: 29.7s	remaining: 1m 33s
1500:	learn: 0.1039738	test: 0.1075930	best: 0.1075923 (1499)	total: 36.8s	remaining: 1m 25s
1800:	learn: 0.1025994	test: 0.1074804	best: 0.1074774 (1794)	total: 44s	remaining: 1m 18s
2100:	learn: 0.1013278	test: 0.1074279	best: 0.10742

In [None]:
#/kaggle/input/catlgb/other/slug/1/archive/cat_models.joblib
#lgb_models = []
# TODO load trained models
#n_fold = 5
#for i in range(n_fold):
#    with open(f'/kaggle/input/home-credit-lgb/home-credit-lgb/model_{i}.pkl', 'rb') as fin:
#        lgb_models.append(pkl.load(fin))
#print('load lgb done.')

In [None]:
#cat_models = []

# TODO load trained models
#n_fold = 5
#for i in range(n_fold):
#    with open(f'/kaggle/input/home-credit-cab/home-credit-cab/model_{i}.pkl', 'rb') as fin:
#        cat_models.append(pkl.load(fin))
#print('load cat done.')

In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Averages the outputs of a list of already-fitted estimators.

    ``fit`` is a no-op; the wrapped estimators are used as they are.
    """

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Nothing to train; returns ``self``."""
        return self

    def predict(self, X):
        """Element-wise mean of every estimator's ``predict`` output."""
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        """Element-wise mean of every estimator's ``predict_proba`` output.

        CatBoost estimators receive the categorical columns (the
        notebook-level ``cat_cols``) as strings; every other estimator
        receives them as pandas ``category``. Estimators are routed by type.
        The previous fixed split after the third estimator sent CatBoost
        models past position three down the ``category`` path.

        The conversions are done on a copy, so the caller's frame is not modified.
        """
        catboost_members = [
            est for est in self.estimators if isinstance(est, CatBoostClassifier)
        ]
        other_members = [
            est for est in self.estimators if not isinstance(est, CatBoostClassifier)
        ]

        X = X.copy()

        X[cat_cols] = X[cat_cols].astype(str)
        y_preds = [est.predict_proba(X) for est in catboost_members]

        X[cat_cols] = X[cat_cols].astype("category")
        y_preds += [est.predict_proba(X) for est in other_members]

        return np.mean(y_preds, axis=0)


model = VotingModel(fitted_models_cat + fitted_models_lgb)

In [None]:
# Use case_id as the index and drop WEEK_NUM before predicting.
df_test = df_test.drop(columns=["WEEK_NUM"]).set_index("case_id")

# Column 1 of predict_proba is used as the submission score.
y_pred = pd.Series(model.predict_proba(df_test)[:, 1], index=df_test.index)

# Assigning a Series aligns on the index, so scores are matched by case_id.
df_subm = pd.read_csv(ROOT / "sample_submission.csv").set_index("case_id")
df_subm["score"] = y_pred
df_subm.to_csv("submission.csv")
df_subm