In [1]:
import json
import pickle
from typing import Dict

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import log_loss, auc, roc_auc_score

pd.set_option('future.no_silent_downcasting', True)


class BaseModel:
    """Shared base for the gradient-boosting wrappers in this notebook.

    Stores the per-feature transformation spec and the fitted estimator, and
    implements the feature preprocessing applied before training and
    inference. ``fit``/``predict``/``save``/``load`` are left to subclasses.
    """

    def __init__(self,
                 transformations_by_feature: Dict[str, object] = None):
        # Mapping of column name -> {'type': <str>, 'properties': {...}}.
        # Stays None until a spec is passed in or ``load`` populates it.
        self.transformations_by_feature = transformations_by_feature
        self.model = None

    def _preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply every configured per-column transformation to ``df``.

        Supported ``type`` values:

        * ``onehot``          - replace the column by one 0/1 float column per
          entry of ``properties['vocab']`` (new columns are labelled 0..k-1).
        * ``target_encoding`` - map ``properties['value']`` to
          ``properties['encoded']``; unseen values become 0.0.
        * ``binning``         - replace each value by the index of the interval
          it falls into, where the intervals are delimited by
          ``properties['boundaries']`` (bin 0 is below the first boundary).
          Missing values stay missing.
        * ``standardization`` - ``(x - mean) / stddev``.
        * ``categorical``     - cast to pandas ``category`` dtype.

        Any other type leaves the column untouched. Column assignments are made
        on the frame that is passed in, so the caller's DataFrame can be
        modified.

        Returns:
            The transformed DataFrame.
        """
        # A missing spec means "no transformations" rather than a crash.
        for col, transformation in (self.transformations_by_feature or {}).items():
            kind = transformation['type']
            prop = transformation.get('properties') or {}

            if kind == 'onehot':
                source = df[col]
                # Build the indicator frame on df's own index so that concat
                # lines rows up even when the index is not 0..n-1.
                onehot = pd.DataFrame(
                    {i: (source == vocab).to_numpy(dtype='float64')
                     for i, vocab in enumerate(prop['vocab'])},
                    index=df.index,
                )
                df = pd.concat([df.drop(columns=[col]), onehot], axis=1)
            elif kind == 'target_encoding':
                encoding_dict = dict(zip(prop['value'], prop['encoded']))
                encoded = df[col].map(encoding_dict.get).astype('float64').fillna(0.0)
                df[col] = encoded
            elif kind == 'binning':
                # NOTE(review): the previous implementation nested the boundary
                # list one level too deep, so its loop never ran and values were
                # passed through unbinned. Models fitted with that behaviour saw
                # raw values for these columns - confirm before reusing them.
                # Assumes boundaries are sorted ascending - TODO confirm.
                edges = np.asarray(prop['boundaries'], dtype='float64')
                values = df[col].astype('float64')
                # side='right' counts the boundaries <= value, i.e. the index
                # of the half-open interval [edge[k-1], edge[k]).
                bins = np.searchsorted(edges, values.to_numpy(), side='right').astype('float64')
                bins[values.isna().to_numpy()] = np.nan
                df[col] = bins
            elif kind == 'standardization':
                df[col] = (df[col] - prop['mean']) / prop['stddev']
            elif kind == 'categorical':
                df[col] = df[col].astype('category')
            # Unknown transformation types are ignored on purpose.

        return df

    def fit(self, df: pd.DataFrame, label_array: np.ndarray,
            val_df: pd.DataFrame, val_label_array: np.ndarray):
        """Train the model; must be implemented by subclasses."""
        raise NotImplementedError("Please Implement this method")

    def predict(self, df_without_label: pd.DataFrame, label_array: np.ndarray = None):
        """Score ``df_without_label``; must be implemented by subclasses."""
        raise NotImplementedError("Please Implement this method")

    def save(self, output_model_path: str, output_transformation_path: str):
        """Persist model and transformation spec; must be implemented by subclasses."""
        raise NotImplementedError("Please Implement this method")

    def load(self, input_model_path: str, input_transformation_path: str):
        """Restore model and transformation spec; must be implemented by subclasses."""
        raise NotImplementedError("Please Implement this method")



In [2]:
import json
import pickle
from typing import Dict

from lightgbm import LGBMClassifier
import lightgbm
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import log_loss, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV

pd.set_option('future.no_silent_downcasting', True)


class LightGBM(BaseModel):
    """LightGBM binary-classifier wrapper built on ``BaseModel`` preprocessing."""

    def __init__(self,
                 transformations_by_feature: Dict[str, object] = None):
        super().__init__(transformations_by_feature)

    def fit(self, df: pd.DataFrame, label_array: np.ndarray,
            val_df: pd.DataFrame, val_label_array: np.ndarray):
        """Preprocess both frames and fit an ``LGBMClassifier`` with early stopping.

        Args:
            df: Training features.
            label_array: Training labels.
            val_df: Validation features used for the AUC eval set.
            val_label_array: Validation labels.
        """
        print('Preprocessing...')
        df = self._preprocess(df)
        val_df = self._preprocess(val_df)

        # Fixed hyper-parameters. A GridSearchCV (cv=3) over
        #   max_depth in [5, 7], n_estimators in [100, 500, 1000],
        #   colsample_bytree in [0.5, 0.75]
        # used to run here and is currently disabled; these values are
        # presumably its result - TODO confirm.
        best_params = {'max_depth': 5, 'n_estimators': 1000, 'colsample_bytree': 0.75}

        print('Fitting...')
        self.model = LGBMClassifier(boosting_type='gbdt', random_state=42, max_depth=best_params['max_depth'],
                                    n_estimators=best_params['n_estimators'], colsample_bytree=best_params['colsample_bytree'],
                                    num_leaves=24)
        self.model.fit(df, label_array, eval_set=[(val_df, val_label_array)], eval_metric='auc',
                       callbacks=[lightgbm.log_evaluation(period=5),
                                  lightgbm.early_stopping(stopping_rounds=50)])

    def predict(self, df_without_label: pd.DataFrame, label_array: np.ndarray = None):
        """Predict positive-class probabilities in batches of 4096 rows.

        Args:
            df_without_label: Feature frame (not preprocessed yet).
            label_array: Optional ground truth; when given (and there is at
                least one row) log-loss and ROC-AUC are computed.

        Returns:
            Tuple ``(pred, loss, auroc)``; ``loss`` and ``auroc`` are ``None``
            when no labels are supplied or the input is empty.
        """
        batch_size = 4096

        preds = []
        # Slice lazily so only one chunk copy is alive at a time.
        for start in range(0, len(df_without_label), batch_size):
            chunk = df_without_label[start:start + batch_size].reset_index(drop=True)
            chunk = self._preprocess(chunk)
            preds.append(self.model.predict_proba(chunk)[:, 1])

        # np.concatenate rejects an empty list, so handle empty input explicitly.
        pred = np.concatenate(preds) if preds else np.empty(0, dtype='float64')

        loss, auroc = None, None
        if label_array is not None and len(pred) > 0:
            loss = log_loss(label_array, pred)
            auroc = roc_auc_score(label_array, pred)

        return pred, loss, auroc

    def save(self, output_model_path: str, output_transformation_path: str):
        """Pickle the fitted model and dump the transformation spec as JSON."""
        with open(output_model_path, "wb") as fd:
            pickle.dump(self.model, fd)
        with open(output_transformation_path, 'w') as fd:
            json.dump(self.transformations_by_feature, fd)

    def load(self, input_model_path: str, input_transformation_path: str):
        """Restore the pickled model and the JSON transformation spec."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load model files from a trusted source.
        with open(input_model_path, "rb") as fd:
            self.model = pickle.load(fd)

        with open(input_transformation_path, 'r') as fd:
            self.transformations_by_feature = json.load(fd)



In [3]:
import json
import pickle
from typing import Dict

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import log_loss, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV

pd.set_option('future.no_silent_downcasting', True)


class XGBoost(BaseModel):
    """XGBoost binary-classifier wrapper built on ``BaseModel`` preprocessing."""

    def __init__(self,
                 transformations_by_feature: Dict[str, object] = None):
        super().__init__(transformations_by_feature)

    def fit(self, df: pd.DataFrame, label_array: np.ndarray,
            val_df: pd.DataFrame, val_label_array: np.ndarray):
        """Preprocess both frames and fit an ``XGBClassifier`` with early stopping.

        Args:
            df: Training features.
            label_array: Training labels.
            val_df: Validation features used for the AUC eval set.
            val_label_array: Validation labels.
        """
        print('Preprocessing...')
        df = self._preprocess(df)
        val_df = self._preprocess(val_df)

        # Fixed hyper-parameters. A GridSearchCV (cv=3) over
        #   max_depth in [5, 7], min_child_weight in [1, 3],
        #   colsample_bytree in [0.5, 0.75]
        # used to run here and is currently disabled; these values are
        # presumably its result - TODO confirm.
        best_params = {'max_depth': 5, 'min_child_weight': 3, 'colsample_bytree': 0.75}

        print('Fitting...')
        # eval_metric / early_stopping_rounds are estimator parameters: passing
        # them to fit() was deprecated in XGBoost 1.6 and removed in 2.0.
        self.model = xgb.XGBClassifier(tree_method='hist', enable_categorical=True, max_depth=best_params['max_depth'], n_estimators=1000,
                                       min_child_weight=best_params['min_child_weight'],
                                       colsample_bytree=best_params['colsample_bytree'], colsample_bylevel=0.8, random_state=42,
                                       learning_rate=0.05, reg_alpha=0.05, scale_pos_weight=30,
                                       eval_metric='auc', early_stopping_rounds=50)
        self.model.fit(df, label_array, eval_set=[(val_df, val_label_array)], verbose=5)

    def predict(self, df_without_label: pd.DataFrame, label_array: np.ndarray = None):
        """Predict positive-class probabilities in batches of 4096 rows.

        Args:
            df_without_label: Feature frame (not preprocessed yet).
            label_array: Optional ground truth; when given (and there is at
                least one row) log-loss and ROC-AUC are computed.

        Returns:
            Tuple ``(pred, loss, auroc)``; ``loss`` and ``auroc`` are ``None``
            when no labels are supplied or the input is empty.
        """
        batch_size = 4096

        preds = []
        # NOTE(review): 'categorical' columns are re-cast with
        # astype('category') for every chunk, so the category set is derived
        # from the values present in that chunk - confirm this matches the
        # encoding the model saw during training.
        for start in range(0, len(df_without_label), batch_size):
            chunk = df_without_label[start:start + batch_size].reset_index(drop=True)
            chunk = self._preprocess(chunk)
            preds.append(self.model.predict_proba(chunk)[:, 1])

        # np.concatenate rejects an empty list, so handle empty input explicitly.
        pred = np.concatenate(preds) if preds else np.empty(0, dtype='float64')

        loss, auroc = None, None
        if label_array is not None and len(pred) > 0:
            loss = log_loss(label_array, pred)
            auroc = roc_auc_score(label_array, pred)

        return pred, loss, auroc

    def save(self, output_model_path: str, output_transformation_path: str):
        """Pickle the fitted model and dump the transformation spec as JSON."""
        with open(output_model_path, "wb") as fd:
            pickle.dump(self.model, fd)
        with open(output_transformation_path, 'w') as fd:
            json.dump(self.transformations_by_feature, fd)

    def load(self, input_model_path: str, input_transformation_path: str):
        """Restore the pickled model and the JSON transformation spec."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load model files from a trusted source.
        with open(input_model_path, "rb") as fd:
            self.model = pickle.load(fd)

        with open(input_transformation_path, 'r') as fd:
            self.transformations_by_feature = json.load(fd)



In [4]:
import json
import pickle
from typing import Dict

from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
import lightgbm
import numpy as np
import pandas as pd

from sklearn.metrics import log_loss, auc, roc_auc_score
from sklearn.model_selection import GridSearchCV

pd.set_option('future.no_silent_downcasting', True)


class CatBoost(BaseModel):
    """CatBoost binary-classifier wrapper.

    Overrides ``_preprocess`` because categorical columns are handed to
    CatBoost as strings instead of pandas ``category`` dtype.
    """

    def __init__(self,
                 transformations_by_feature: Dict[str, object] = None):
        super().__init__(transformations_by_feature)

    def _preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply every configured per-column transformation to ``df``.

        Supported ``type`` values:

        * ``onehot``          - replace the column by one 0/1 float column per
          entry of ``properties['vocab']`` (new columns are labelled 0..k-1).
        * ``target_encoding`` - map ``properties['value']`` to
          ``properties['encoded']``; unseen values become 0.0.
        * ``binning``         - replace each value by the index of the interval
          delimited by ``properties['boundaries']``; missing values stay missing.
        * ``standardization`` - ``(x - mean) / stddev``.
        * ``categorical``     - cast to ``str``.

        Any other type leaves the column untouched. Column assignments are made
        on the frame that is passed in, so the caller's DataFrame can be
        modified.

        Returns:
            The transformed DataFrame.
        """
        # A missing spec means "no transformations" rather than a crash.
        for col, transformation in (self.transformations_by_feature or {}).items():
            kind = transformation['type']
            prop = transformation.get('properties') or {}

            if kind == 'onehot':
                source = df[col]
                # Build the indicator frame on df's own index so that concat
                # lines rows up even when the index is not 0..n-1.
                onehot = pd.DataFrame(
                    {i: (source == vocab).to_numpy(dtype='float64')
                     for i, vocab in enumerate(prop['vocab'])},
                    index=df.index,
                )
                df = pd.concat([df.drop(columns=[col]), onehot], axis=1)
            elif kind == 'target_encoding':
                encoding_dict = dict(zip(prop['value'], prop['encoded']))
                encoded = df[col].map(encoding_dict.get).astype('float64').fillna(0.0)
                df[col] = encoded
            elif kind == 'binning':
                # NOTE(review): the previous implementation nested the boundary
                # list one level too deep, so its loop never ran and values were
                # passed through unbinned. Models fitted with that behaviour saw
                # raw values for these columns - confirm before reusing them.
                # Assumes boundaries are sorted ascending - TODO confirm.
                edges = np.asarray(prop['boundaries'], dtype='float64')
                values = df[col].astype('float64')
                # side='right' counts the boundaries <= value, i.e. the index
                # of the half-open interval [edge[k-1], edge[k]).
                bins = np.searchsorted(edges, values.to_numpy(), side='right').astype('float64')
                bins[values.isna().to_numpy()] = np.nan
                df[col] = bins
            elif kind == 'standardization':
                df[col] = (df[col] - prop['mean']) / prop['stddev']
            elif kind == 'categorical':
                df[col] = df[col].astype('str')
            # Unknown transformation types are ignored on purpose.

        return df

    def fit(self, df: pd.DataFrame, label_array: np.ndarray,
            val_df: pd.DataFrame, val_label_array: np.ndarray):
        """Preprocess both frames and fit a ``CatBoostClassifier`` with early stopping.

        Args:
            df: Training features.
            label_array: Training labels.
            val_df: Validation features used as the eval set.
            val_label_array: Validation labels.
        """
        print('Preprocessing...')
        df = self._preprocess(df)
        val_df = self._preprocess(val_df)

        # Fixed hyper-parameters; no search is performed in this method.
        best_params = {'max_depth': 5, 'iterations': 1000, 'colsample_bylevel': 0.75}

        # Columns declared 'categorical' in the spec are passed to CatBoost as
        # categorical features.
        cat_cols = [col for col, transformation in (self.transformations_by_feature or {}).items()
                    if transformation['type'] == 'categorical']

        train_pool = Pool(df, label_array, cat_features=cat_cols)
        val_pool = Pool(val_df, val_label_array, cat_features=cat_cols)

        print('Fitting...')
        self.model = CatBoostClassifier(
            eval_metric='AUC',
            learning_rate=0.07,
            iterations=best_params['iterations'],
            colsample_bylevel=best_params['colsample_bylevel'],
            max_depth=best_params['max_depth'])
        self.model.fit(train_pool, eval_set=val_pool, verbose=5, early_stopping_rounds=50)

    def predict(self, df_without_label: pd.DataFrame, label_array: np.ndarray = None):
        """Predict positive-class probabilities in batches of 4096 rows.

        Args:
            df_without_label: Feature frame (not preprocessed yet).
            label_array: Optional ground truth; when given (and there is at
                least one row) log-loss and ROC-AUC are computed.

        Returns:
            Tuple ``(pred, loss, auroc)``; ``loss`` and ``auroc`` are ``None``
            when no labels are supplied or the input is empty.
        """
        batch_size = 4096

        preds = []
        # Slice lazily so only one chunk copy is alive at a time.
        for start in range(0, len(df_without_label), batch_size):
            chunk = df_without_label[start:start + batch_size].reset_index(drop=True)
            chunk = self._preprocess(chunk)
            preds.append(self.model.predict_proba(chunk)[:, 1])

        # np.concatenate rejects an empty list, so handle empty input explicitly.
        pred = np.concatenate(preds) if preds else np.empty(0, dtype='float64')

        loss, auroc = None, None
        if label_array is not None and len(pred) > 0:
            loss = log_loss(label_array, pred)
            auroc = roc_auc_score(label_array, pred)

        return pred, loss, auroc

    def save(self, output_model_path: str, output_transformation_path: str):
        """Pickle the fitted model and dump the transformation spec as JSON."""
        with open(output_model_path, "wb") as fd:
            pickle.dump(self.model, fd)
        with open(output_transformation_path, 'w') as fd:
            json.dump(self.transformations_by_feature, fd)

    def load(self, input_model_path: str, input_transformation_path: str):
        """Restore the pickled model and the JSON transformation spec."""
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load model files from a trusted source.
        with open(input_model_path, "rb") as fd:
            self.model = pickle.load(fd)

        with open(input_transformation_path, 'r') as fd:
            self.transformations_by_feature = json.load(fd)



In [5]:
import polars as pl
import pyarrow.parquet as pq
from glob import glob

def read_file(path, depth=None, columns=None):
    """Load one parquet file, normalise its dtypes and optionally aggregate it.

    When ``depth`` is 1 or 2 the table is grouped by ``case_id`` and reduced
    with the expressions from ``Aggregator.get_exprs``; for any other depth
    the typed table is returned as is.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return table
    return table.group_by("case_id").agg(Aggregator.get_exprs(table, columns))


def read_files(regex_path, depth=None, columns=None):
    """Load every parquet file matching a glob pattern into one frame.

    Each matching file is read through ``read_file`` (same dtype handling and
    optional per-``case_id`` aggregation), the pieces are stacked, and rows
    are de-duplicated on ``case_id``.

    Args:
        regex_path: Glob pattern (string or path-like) selecting the files.
        depth: Passed to ``read_file``; 1 or 2 triggers aggregation.
        columns: Passed to ``read_file`` to restrict the aggregations.

    Raises:
        ValueError: If no file matches ``regex_path``.
    """
    # glob returns files in arbitrary order; sort for reproducible runs.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # Fail with the offending pattern instead of an opaque concat error.
        raise ValueError(f"No parquet files match pattern: {regex_path}")

    chunks = [read_file(path, depth, columns) for path in paths]

    df = pl.concat(chunks, how="vertical_relaxed")
    return df.unique(subset=["case_id"])

def to_pandas(df_data):
    """Return ``df_data`` converted via its own ``to_pandas()`` method."""
    converted = df_data.to_pandas()
    return converted

def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the feature table for all cases.

    Adds month/weekday of ``date_decision`` to the base table, left-joins every
    depth-0/1/2 table on ``case_id`` (clashing column names get the suffix
    ``_<position>``), then hands the result to ``Pipeline.handle_dates``.
    """
    decision_date = pl.col("date_decision")
    merged = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    side_tables = depth_0 + depth_1 + depth_2
    for position, side_table in enumerate(side_tables):
        merged = merged.join(side_table, how="left", on="case_id", suffix=f"_{position}")

    return Pipeline.handle_dates(merged)

def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    * Signed integer columns are cast to the smallest of int8/16/32/64 whose
      range strictly contains the column's min and max.
    * Float columns are cast to float32 when min and max lie strictly inside
      the float32 range, otherwise to float64.
    * Every other dtype (object, category, bool, unsigned int, datetime,
      pandas extension dtypes) is left untouched.

    Prints the memory usage before and after.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain numpy signed-int / float columns are downcast. Routing
        # everything else through the float branch would turn bool and
        # unsigned columns into float32 (losing integer precision).
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # First (smallest) type that fits wins; an empty column yields NaN
            # for min/max, matches nothing and keeps its dtype.
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float16 is deliberately not used as a target.
            if c_min > float32_limits.min and c_max < float32_limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [6]:
import numpy as np

class Pipeline:
    """Stateless Polars helpers: dtype normalisation, date handling, column pruning.

    All methods are static and are meant to be called as ``Pipeline.<name>(df)``
    or through ``df.pipe(Pipeline.<name>)``.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to canonical dtypes based on their name.

        Key columns (``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2``)
        become Int64 and ``date_decision`` becomes Date. Otherwise the last
        character of the name decides: ``P``/``A`` -> Float64, ``M`` -> String,
        ``D`` -> Date. Other columns are left unchanged.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col[-1] == "M":
                casts.append(pl.col(col).cast(pl.String))
            elif col[-1] == "D":
                casts.append(pl.col(col).cast(pl.Date))
        # One with_columns call for all casts instead of one per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Turn every ``*D`` column into a day offset from ``date_decision``.

        Each column whose name ends in ``D`` is replaced by
        ``(column - date_decision)`` expressed in whole days; afterwards
        ``date_decision`` and ``MONTH`` are dropped.
        """
        date_cols = [col for col in df.columns if col[-1] == "D"]
        if date_cols:
            df = df.with_columns(
                [(pl.col(col) - pl.col("date_decision")).dt.total_days() for col in date_cols]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df):
        """Drop sparse and uninformative columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are always kept. Of the rest:

        1. columns with more than 70% nulls are dropped;
        2. String columns with exactly one distinct value or more than 200
           distinct values are dropped.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        for col in df.columns:
            if col in protected:
                continue
            null_ratio = df[col].is_null().mean()
            # The ratio can be None (e.g. for a frame without rows); such
            # columns are kept rather than compared against the threshold.
            if null_ratio is not None and null_ratio > 0.7:
                df = df.drop(col)

        for col in df.columns:
            if col in protected or df[col].dtype != pl.String:
                continue
            freq = df[col].n_unique()
            if freq == 1 or freq > 200:
                df = df.drop(col)

        return df


class Aggregator:
    """Builds Polars aggregation expressions, selected by column-name suffix.

    All methods are static. ``get_exprs`` is the entry point; its result is
    meant to be passed to ``group_by(...).agg(...)``.
    """
    # Please add or subtract features yourself, be aware that too many features will take up too much space.

    @staticmethod
    def num_expr(df, columns=None):
        """last/max/min/mean/median expressions for columns ending in ``P`` or ``A``."""
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        return (
            Aggregator.get_last_cols(df, cols, columns)
            + Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
            + Aggregator.get_mean_cols(df, cols, columns)
            + Aggregator.get_median_cols(df, cols, columns)
        )

    @staticmethod
    def date_expr(df, columns=None):
        """last/max/min/mean/median expressions for columns ending in ``D``."""
        cols = [col for col in df.columns if col[-1] in ("D",)]
        return (
            Aggregator.get_last_cols(df, cols, columns)
            + Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
            + Aggregator.get_mean_cols(df, cols, columns)
            + Aggregator.get_median_cols(df, cols, columns)
        )

    @staticmethod
    def str_expr(df, columns=None):
        """last/max/min expressions for columns ending in ``M``."""
        cols = [col for col in df.columns if col[-1] in ("M",)]
        return (
            Aggregator.get_last_cols(df, cols, columns)
            + Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
        )

    @staticmethod
    def other_expr(df, columns=None):
        """last/max/min expressions for columns ending in ``T`` or ``L``."""
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        return (
            Aggregator.get_last_cols(df, cols, columns)
            + Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
        )

    @staticmethod
    def count_expr(df, columns=None):
        """last/max/min expressions for columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return (
            Aggregator.get_last_cols(df, cols, columns)
            + Aggregator.get_max_cols(df, cols, columns)
            + Aggregator.get_min_cols(df, cols, columns)
        )

    @staticmethod
    def filter_columns(dfs, columns):
        """Keep only the expressions that are needed for ``columns``.

        Args:
            dfs: List of expressions (despite the name, not DataFrames).
            columns: Wanted output column names; ``None`` or empty disables
                filtering and returns ``dfs`` unchanged.

        An expression is kept when its output name is in ``columns`` or is a
        prefix of one of them (prefix matching covers names that later received
        a join suffix).
        """
        if columns is None or len(columns) == 0:
            return dfs
        kept = []
        for expr in dfs:
            name = expr.meta.output_name()
            if name in columns or any(col.startswith(name) for col in columns):
                kept.append(expr)
        return kept

    @staticmethod
    def get_exprs(df, columns):
        """Return all aggregation expressions for ``df``, filtered by ``columns``."""
        return (
            Aggregator.num_expr(df, columns)
            + Aggregator.date_expr(df, columns)
            + Aggregator.str_expr(df, columns)
            + Aggregator.other_expr(df, columns)
            + Aggregator.count_expr(df, columns)
        )

    @staticmethod
    def get_max_cols(df, target_cols, cols):
        """``max_<col>`` expressions for ``target_cols``."""
        return Aggregator.filter_columns([pl.max(col).alias(f"max_{col}") for col in target_cols], cols)

    @staticmethod
    def get_min_cols(df, target_cols, cols):
        """``min_<col>`` expressions for ``target_cols``."""
        return Aggregator.filter_columns([pl.min(col).alias(f"min_{col}") for col in target_cols], cols)

    @staticmethod
    def get_last_cols(df, target_cols, cols):
        """``last_<col>`` expressions for ``target_cols``."""
        return Aggregator.filter_columns([pl.last(col).alias(f"last_{col}") for col in target_cols], cols)

    @staticmethod
    def get_sum_cols(df, target_cols, cols):
        """``sum_<col>`` expressions for ``target_cols``."""
        return Aggregator.filter_columns([pl.sum(col).alias(f"sum_{col}") for col in target_cols], cols)

    @staticmethod
    def get_count_cols(df, target_cols, cols):
        """``count_<col>`` expressions for ``target_cols``."""
        return Aggregator.filter_columns([pl.count(col).alias(f"count_{col}") for col in target_cols], cols)

    @staticmethod
    def get_first_cols(df, target_cols, cols):
        """``first_<col>`` expressions for ``target_cols``."""
        return Aggregator.filter_columns([pl.first(col).alias(f"first_{col}") for col in target_cols], cols)

    @staticmethod
    def get_mean_cols(df, target_cols, cols):
        """``mean_<col>`` expressions for ``target_cols``."""
        return Aggregator.filter_columns([pl.mean(col).alias(f"mean_{col}") for col in target_cols], cols)

    @staticmethod
    def get_mode_cols(df, target_cols, cols):
        """``mode_<col>`` expressions (first mode) for ``target_cols``."""
        return Aggregator.filter_columns([pl.col(col).mode().first().alias(f"mode_{col}") for col in target_cols], cols)

    @staticmethod
    def get_median_cols(df, target_cols, cols):
        """``median<col>`` expressions for ``target_cols``.

        NOTE: unlike the other helpers the alias has no underscore. It is kept
        as is because the inference cell selects columns by the feature names
        stored in the trained model, which presumably contain this spelling -
        verify before renaming.
        """
        return Aggregator.filter_columns([pl.median(col).alias(f"median{col}") for col in target_cols], cols)

In [18]:
import gc
from pathlib import Path

import os
import kagglehub

import time

# kagglehub.model_download("josh9191/homecredit/other/lightgbm")
# kagglehub.model_download("josh9191/homecredit/other/catboost")

# --- Pre-trained ensemble members ------------------------------------------
# Each member is restored from a pickled model plus its JSON preprocessing spec.
len_lightgbm_models = 3
lightgbm_model_paths = [
    f'/kaggle/input/homecredit/other/lightgbm/3/best_model_lightgbm{i}'
    for i in range(1, len_lightgbm_models + 1)
]
lightgbm_preprocess_json_paths = [
    f'/kaggle/input/homecredit/other/lightgbm/3/preprocess_lightgbm{i}'
    for i in range(1, len_lightgbm_models + 1)
]
lightgbm_models = []
for model_path, json_path in zip(lightgbm_model_paths, lightgbm_preprocess_json_paths):
    lightgbm_model = LightGBM()
    lightgbm_model.load(model_path, json_path)
    lightgbm_models.append(lightgbm_model)

len_catboost_models = 3
catboost_model_paths = [
    f'/kaggle/input/homecredit/other/catboost/1/best_model_catboost{i}'
    for i in range(1, len_catboost_models + 1)
]
catboost_preprocess_json_paths = [
    f'/kaggle/input/homecredit/other/catboost/1/preprocess_catboost{i}'
    for i in range(1, len_catboost_models + 1)
]
catboost_models = []
for model_path, json_path in zip(catboost_model_paths, catboost_preprocess_json_paths):
    catboost_model = CatBoost()
    catboost_model.load(model_path, json_path)
    catboost_models.append(catboost_model)

# Feature names stored in the first LightGBM booster; they drive both the
# aggregation filter below and the column selection at scoring time.
columns = list(lightgbm_models[0].model.booster_.feature_name())

# --- Input locations ---------------------------------------------------------
ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

# --- Read the test tables (base first, then depth 0, 1, 2) ------------------
data_store = {"df_base": read_file(TEST_DIR / "test_base.parquet", None, columns)}
data_store["depth_0"] = [
    read_file(TEST_DIR / "test_static_cb_0.parquet", None, columns),
    read_files(TEST_DIR / "test_static_0_*.parquet", None, columns),
]
data_store["depth_1"] = [
    read_files(TEST_DIR / "test_applprev_1_*.parquet", 1, columns),
    read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1, columns),
    read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1, columns),
    read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1, columns),
    read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1, columns),
    read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1, columns),
    read_file(TEST_DIR / "test_other_1.parquet", 1, columns),
    read_file(TEST_DIR / "test_person_1.parquet", 1, columns),
    read_file(TEST_DIR / "test_deposit_1.parquet", 1, columns),
    read_file(TEST_DIR / "test_debitcard_1.parquet", 1, columns),
]
data_store["depth_2"] = [
    read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2, columns),
    read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2, columns),
    read_file(TEST_DIR / "test_applprev_2.parquet", 2, columns),
    read_file(TEST_DIR / "test_person_2.parquet", 2, columns),
]

df_test = feature_eng(**data_store)

print("test data shape:\t", df_test.shape)
# The individual tables are no longer needed once they are joined.
del data_store
gc.collect()

# Convert to pandas, turn +/-inf into NaN, then downcast numeric columns.
df_test = reduce_mem_usage(
    to_pandas(df_test)
    .reset_index(drop=False)
    .replace([np.inf, -np.inf], np.nan)
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


test data shape:	 (10, 505)
Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.03 MB
Decreased by 31.0%


In [19]:
# Keep only the id plus the features the models were trained on.
selected_columns = ['case_id'] + columns
df_test_selected = df_test[selected_columns].copy()
del df_test
gc.collect()

case_id = df_test_selected['case_id']
test_input_df = df_test_selected.drop(columns=['case_id'])

# Unweighted average over all ensemble members (LightGBM first, then CatBoost).
ensemble_members = lightgbm_models[:len_lightgbm_models] + catboost_models[:len_catboost_models]
preds = pd.Series(np.zeros(len(test_input_df)))
for member in ensemble_members:
    p, _, _ = member.predict(test_input_df)
    preds = preds + p
preds = preds / (len_lightgbm_models + len_catboost_models)

# NaN predictions are replaced by 0.3 and scores are clipped to [0, 1];
# duplicate case_ids are averaged.
scores = np.clip(np.nan_to_num(preds, nan=0.3), 0, 1)
submission = (
    pd.DataFrame({"case_id": case_id, "score": scores})
    .groupby('case_id')
    .mean()
)
# print(submission)
submission.to_csv("./submission.csv")