*Vlad 3.11.2024*

In [None]:
import pandas as pd

import numpy as np

from catboost import CatBoostClassifier, CatBoostClassifier, Pool, cv as catboostCV
import xgboost as xgb
from lightgbm import LGBMClassifier, cv as lgbmCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from tqdm import tqdm
import torch

import glob
import os
import time

# SETTINGS
# Let pandas display up to 600 columns so wide feature frames are not truncated in output.
pd.set_option("display.max_columns", 600)
# Directory with the test CSV files; used as the default argument of read_test / predict_test.
test_path = "/kaggle/input/train-one/test"

## All functions

In [None]:
# FUNCTIONS


def fast_auc(y_true, y_prob):
    """
    Compute binary ROC AUC with a single sort (vectorized).

    Based on https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013

    Samples are ordered by predicted score; for every positive sample the
    number of negatives ranked at or before it is accumulated, and the total is
    normalized by ``n_negative * n_positive``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array-like of scores, same length as ``y_true``.

    Returns
    -------
    float
        The AUC, or ``nan`` when it is undefined (empty input or only one
        class present) instead of dividing by zero.
    """
    y_sorted = np.asarray(y_true, dtype=np.float64)
    n = y_sorted.size
    if n == 0:
        return float("nan")

    y_sorted = y_sorted[np.argsort(y_prob)]

    # Running count of negatives seen so far, in ascending score order.
    negatives_seen = np.cumsum(1.0 - y_sorted)
    nfalse = negatives_seen[-1]

    denominator = nfalse * (n - nfalse)
    if denominator == 0:
        # Only one class present: AUC is not defined.
        return float("nan")

    return float(np.dot(y_sorted, negatives_seen) / denominator)


def eval_auc_lgb(y_true, y_pred):
    """
    Evaluation callback for LightGBM built on top of ``fast_auc``.

    Returns the triple ``(metric_name, metric_value, is_higher_better)``.
    """
    metric_name = "auc"
    higher_is_better = True
    score = fast_auc(y_true, y_pred)
    return metric_name, score, higher_is_better


class CustomAUC(object):
    """
    User-defined AUC metric object for CatBoost (greater is better).

    Implements the methods CatBoost calls on a custom metric:
    ``is_max_optimal``, ``evaluate`` and ``get_final_error``.
    Sample weights are ignored.
    """

    @staticmethod
    def get_auc(y_true, y_prob):
        """
        Compute binary ROC AUC with a single sort (vectorized).

        Based on https://www.kaggle.com/c/microsoft-malware-prediction/discussion/76013

        Returns ``nan`` when the AUC is undefined (empty input or a single
        class) instead of dividing by zero.
        """
        y_sorted = np.asarray(y_true, dtype=np.float64)
        n = y_sorted.size
        if n == 0:
            return float("nan")

        y_sorted = y_sorted[np.argsort(y_prob)]
        # Running count of negatives seen so far, in ascending score order.
        negatives_seen = np.cumsum(1.0 - y_sorted)
        nfalse = negatives_seen[-1]

        denominator = nfalse * (n - nfalse)
        if denominator == 0:
            return float("nan")
        return float(np.dot(y_sorted, negatives_seen) / denominator)

    def is_max_optimal(self):
        return True  # greater is better

    def evaluate(self, approxes, target, weight):
        """
        Score one batch of predictions.

        ``approxes`` is normally a sequence with one container of raw scores
        per prediction dimension (a single container for binary
        classification). AUC depends only on the ranking, so raw scores can be
        used directly. A 2-D ``ndarray`` shaped ``(n_samples, n_classes)`` is
        also accepted, in which case column 1 is used as before.

        Returns ``(score, 1)``: the metric value and a unit weight.
        """
        y_true = np.asarray(target).astype(int)

        if (
            isinstance(approxes, np.ndarray)
            and approxes.ndim == 2
            and approxes.shape[0] == len(y_true)
            and approxes.shape[1] > 1
        ):
            # Legacy layout: a predict_proba-style matrix.
            approx = approxes[:, 1]
        else:
            # One container per dimension; binary tasks have exactly one.
            approx = np.asarray(approxes[0], dtype=np.float64)

        score = self.get_auc(y_true, approx)
        return score, 1

    def get_final_error(self, error, weight):
        # `evaluate` already returns the final value with a unit weight.
        return error


def get_time():
    """Return the current ``time.ctime()`` stamp with spaces and colons turned into underscores."""
    stamp = time.ctime()
    # Order matters: spaces first, then collapse the doubled underscore, then colons.
    for old, new in ((" ", "_"), ("__", "_"), (":", "_")):
        stamp = stamp.replace(old, new)
    return stamp


def preprocess(data, is_test=False):
    """
    Select the modelling features from a raw frame and add ratio features.

    Steps:
      1. drop the highly correlated columns (all of them must be present);
      2. add the ratio features that are part of the final selection;
      3. keep only the selected columns, plus ``target`` when the frame has it
         and ``id`` when ``is_test`` is true.

    The caller's frame is not modified; a new frame is returned. Every other
    column (``smpl`` included) is absent from the result.

    NOTE: a zero denominator yields ``inf``/``nan`` in the ratio columns; they
    are passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame with the raw ``feature_*`` columns.
    is_test : bool, keep the ``id`` column in the output when true.

    Returns
    -------
    pandas.DataFrame
    """
    # Highly correlated features (listed once each).
    cors_high = [
        f"feature_{i}"
        for i in (424, 285, 141, 48, 178, 381, 239, 158, 103, 208, 9, 428, 323, 410, 98, 379, 495)
    ]
    data = data.drop(columns=cors_high)

    # Only the ratios that survive the final selection are computed.
    ratio_features = (
        ("feature_47", "feature_133"),
        ("feature_47", "feature_253"),
        ("feature_173", "feature_467"),
        ("feature_201", "feature_47"),
    )
    for numerator, denominator in ratio_features:
        data[f"{numerator}_div_{denominator}"] = data[numerator] / data[denominator]

    need_take = [
        f"feature_{i}"
        for i in (
            5, 11, 12, 14, 18, 29, 31, 35, 37, 42,
            45, 47, 50, 53, 66, 68, 80, 81, 86, 87,
            97, 100, 106, 114, 119, 131, 133, 138, 139, 147,
            151, 153, 154, 157, 161, 164, 167, 168, 172, 174,
            186, 194, 195, 197, 205, 230, 234, 244, 246, 247,
            251, 253, 259, 262, 264, 265, 270, 272, 318, 325,
            331, 332, 336, 353, 364, 365, 371, 383, 386, 388,
            390, 412, 421, 439, 449, 451, 452, 454, 462, 467,
            470, 490, 498,
        )
    ]
    need_take.extend(f"{numerator}_div_{denominator}" for numerator, denominator in ratio_features)

    if "target" in data.columns:
        need_take.append("target")
    if is_test:
        need_take.append("id")

    return data[need_take]


def train_model(X_train, X_val, X_test, y_train, y_val, model_type, model_params):
    """
    Train one base model and return its validation and test probabilities.

    Parameters
    ----------
    X_train, y_train : training features / labels.
    X_val, y_val : validation features / labels, used as the evaluation set.
    X_test : test features.
    model_type : ``"cat"``, ``"lgbm"`` or ``"xgb"``.
    model_params : dict of parameters for the chosen library. For ``"xgb"`` the
        keys ``early_stopping_rounds`` and ``verbose_eval`` are taken out of
        the dict and passed to ``xgb.train`` as training options.

    Returns
    -------
    (oof_prediction, test_prediction, model)
        Both predictions are 2-D ``(n_samples, 2)`` arrays with the positive
        class probability in column 1.

    Raises
    ------
    RuntimeError
        If ``model_type`` is not one of the supported values.
    """
    if model_type == "cat":
        # The pools must exist before fitting.
        train_set = Pool(data=X_train, label=y_train)
        eval_set = Pool(data=X_val, label=y_val)

        # AUC is the default eval metric; model_params may override it.
        model = CatBoostClassifier(**{"eval_metric": "AUC", **model_params})
        model.fit(train_set, eval_set=eval_set, use_best_model=True, verbose=200)

        oof_prediction = model.predict_proba(X_val)
        test_prediction = model.predict_proba(X_test)

    elif model_type == "lgbm":
        model = LGBMClassifier(**{"n_jobs": -1, **model_params})
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric=eval_auc_lgb)

        # Use the same iteration for validation and test predictions.
        best_iteration = model.best_iteration_
        oof_prediction = model.predict_proba(X_val, num_iteration=best_iteration)
        test_prediction = model.predict_proba(X_test, num_iteration=best_iteration)

    elif model_type == "xgb":
        # Work on a copy so the caller's dict is left untouched.
        booster_params = dict(model_params)
        early_stopping_rounds = booster_params.pop("early_stopping_rounds", None)
        verbose_eval = booster_params.pop("verbose_eval", True)

        feature_names = list(X_train.columns)
        train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=feature_names)
        valid_data = xgb.DMatrix(data=X_val, label=y_val, feature_names=feature_names)

        # Early stopping monitors the last entry of the watchlist.
        watchlist = [(train_data, "train"), (valid_data, "valid_data")]
        model = xgb.train(
            params=booster_params,
            dtrain=train_data,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
        )

        # (0, 0) means "use every tree" when no best iteration was recorded.
        best_iteration = getattr(model, "best_iteration", None)
        iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)

        def _predict_proba(frame):
            # Booster.predict returns the positive-class probability only
            # (assumes a binary:logistic objective); expand it to two columns
            # so the result matches predict_proba of the other libraries.
            positive = model.predict(
                xgb.DMatrix(frame, feature_names=feature_names), iteration_range=iteration_range
            )
            return np.column_stack((1.0 - positive, positive))

        oof_prediction = _predict_proba(X_val)
        test_prediction = _predict_proba(X_test)

    else:
        raise RuntimeError("Wrong model_type is provided.\n")

    return oof_prediction, test_prediction, model


def save_model(model, model_type, file_idx, model_idx):
    """
    Best-effort save of a trained model into the current working directory.

    The file name is ``<PREFIX>_<file_idx>_<model_idx>_<timestamp>``. Failures
    are reported on stdout and never propagated, so a failed save does not
    abort training.

    Parameters
    ----------
    model : trained model object returned by ``train_model``.
    model_type : ``"cat"``, ``"lgbm"`` or ``"xgb"``.
    file_idx, model_idx : indices embedded in the file name.
    """
    timestamp = get_time()
    try:
        if model_type == "cat":
            save_path = f"CATBOOST_{file_idx}_{model_idx}_{timestamp}"
            model.save_model(save_path, format="cbm")
        elif model_type == "lgbm":
            # File prefix spelling is kept as-is so existing files keep matching.
            save_path = f"LGMB_{file_idx}_{model_idx}_{timestamp}"
            # The sklearn wrapper has no save_model; use its underlying Booster.
            booster = getattr(model, "booster_", model)
            booster.save_model(save_path, num_iteration=booster.best_iteration)
        elif model_type == "xgb":
            save_path = f"XGBOOST_{file_idx}_{model_idx}_{timestamp}"
            model.save_model(save_path)
        else:
            raise ValueError(f"unknown model_type {model_type!r}")
    except Exception as exc:
        print(f"no save completed for model {model_type}_{model_idx}\tfile{file_idx}: {exc!r}")
    else:
        print(f"{get_time()} SAVE_MODEL COMPLETED")


def get_metadata_from_files(X_val, X_test, y_val, models_list, types_of_models=3, train_files=None):
    """
    Build stacking meta-features by training every model on every train file.

    For each file in ``train_files`` and each ``(model_type, model_params)``
    pair in ``models_list`` a model is trained and its positive-class
    probabilities on ``X_val`` / ``X_test`` are accumulated. The result has one
    column per model type, holding the mean probability over all models of
    that type and all files.

    ``models_list`` is expected to be a flat list of pairs, ordered by type in
    ``types_of_models`` consecutive groups of equal size (e.g. 5 cat, 5 lgbm,
    5 xgb).

    Parameters
    ----------
    X_val, y_val : validation features / labels.
    X_test : test features.
    models_list : list of ``(model_type, model_params)`` pairs.
    types_of_models : number of model-type groups in ``models_list``.
    train_files : paths of the train CSV files; each must contain ``smpl``,
        ``id`` and ``target`` columns.

    Returns
    -------
    (train_metadata, test_metadata)
        Arrays shaped ``(len(X_val), types_of_models)`` and
        ``(len(X_test), types_of_models)``.

    Raises
    ------
    RuntimeError
        If ``train_files`` is empty, contains a missing file, or a file has no
        ``target`` column.
    ValueError
        If ``models_list`` cannot be split into ``types_of_models`` equal groups.
    """
    if not train_files or any(not os.path.isfile(train_file) for train_file in train_files):
        raise RuntimeError("Provide correct train_files. " + "Current is {}".format(train_files))

    if types_of_models <= 0 or not models_list or len(models_list) % types_of_models:
        raise ValueError(
            "models_list must be a non-empty flat list whose length is a multiple of "
            "types_of_models ({}); got {} entries".format(types_of_models, len(models_list))
        )

    models_per_type = len(models_list) // types_of_models

    train_metadata = np.zeros((len(X_val), types_of_models))
    test_metadata = np.zeros((len(X_test), types_of_models))

    train_files = sorted(train_files)
    for file_idx, filename in enumerate(train_files):
        print("File: {}".format(filename))
        print("Started at: {}".format(time.ctime()))

        read_started = time.time()
        X_train = pd.read_csv(filename)
        read_finished = time.time()
        print("{} to read a file".format(read_finished - read_started))

        X_train = X_train.drop(columns=["smpl", "id"])
        X_train = preprocess(X_train)
        if "target" not in X_train.columns:
            raise RuntimeError("No 'target' column in train file {}".format(filename))
        # Labels come from the train file itself.
        y_train = X_train.pop("target")

        # Feed every model the same columns in the same order; this also keeps
        # stray columns such as 'target' or 'id' out of the validation/test features.
        feature_columns = list(X_train.columns)
        X_val_features = X_val[feature_columns]
        X_test_features = X_test[feature_columns]

        for model_idx, (model_type, model_params) in enumerate(models_list):
            # IT MUST BE PROBAS
            train_prediction, test_prediction, model = train_model(
                X_train, X_val_features, X_test_features, y_train, y_val, model_type, model_params
            )

            # Consecutive groups of `models_per_type` models share one column.
            model_type_column = model_idx // models_per_type

            train_metadata[:, model_type_column] += train_prediction[:, 1]
            test_metadata[:, model_type_column] += test_prediction[:, 1]

            try:
                save_model(model, model_type, file_idx, model_idx)
            except Exception as exc:
                print("Something went wrong with saving")
                print(exc)
            finally:
                print("\tModel: {}_{}".format(models_list[model_idx][0], model_idx))

        del X_train

    # Turn the accumulated sums into mean probabilities.
    n_predictions_per_column = models_per_type * len(train_files)
    train_metadata /= n_predictions_per_column
    test_metadata /= n_predictions_per_column

    return train_metadata, test_metadata


def predict_test(X_test, ids, model, test_path=test_path, name=None):
    """
    Write a submission CSV with the positive-class probabilities of ``model``.

    The file is written to the current working directory as
    ``submission_<name>_<timestamp>.csv`` with the columns ``id`` and ``target``.

    Parameters
    ----------
    X_test : feature matrix accepted by ``model.predict_proba``.
    ids : identifiers, one per row of ``X_test``.
    model : fitted classifier exposing ``predict_proba``.
    test_path : unused; kept for backward compatibility.
    name : label embedded in the file name (``"UNTITLED"`` when omitted).
    """
    if name is None:
        name = "UNTITLED"

    dt = pd.DataFrame({"id": ids, "target": model.predict_proba(X_test)[:, 1]})

    # get_time must be called, otherwise the function repr ends up in the file name.
    dt.to_csv(f"submission_{name}_{get_time()}.csv", index=False)

## Params

In [None]:
SEED = 42

# One seed per model configuration of each library.
SEEDS = [42, 1337, 407, 666, 1111]

# CatBoost configurations.
# - Only `random_state` is set: it is a synonym of `random_seed`, and CatBoost
#   rejects parameter sets that contain both.
# - `custom_metric` takes built-in metric names (string or list of strings),
#   so the built-in "AUC" is used here.
# - Poisson bootstrap is GPU-only, hence the Bernoulli fallback on CPU.
catboost_params_0 = {
    "iterations": 2000,
    "verbose": False,
    "max_depth": 10,
    "loss_function": "Logloss",
    "custom_metric": "AUC",
    "early_stopping_rounds": 100,
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
    "random_state": SEEDS[0],
    "metric_period": 50,
    "bootstrap_type": "Poisson" if torch.cuda.is_available() else "Bernoulli",
    "subsample": 0.8,
}
catboost_params_1 = {
    "iterations": 1000,
    "verbose": False,
    "max_depth": 10,
    "loss_function": "Logloss",
    "custom_metric": "AUC",
    "early_stopping_rounds": 100,
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
    "random_state": SEEDS[1],
    "metric_period": 50,
    "bootstrap_type": "Poisson" if torch.cuda.is_available() else "Bernoulli",
    "subsample": 0.8,
}
catboost_params_2 = {
    "iterations": 2000,
    "verbose": False,
    "max_depth": 15,
    "loss_function": "Logloss",
    "custom_metric": "AUC",
    "early_stopping_rounds": 100,
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
    "random_state": SEEDS[2],
    "metric_period": 50,
    "bootstrap_type": "Poisson" if torch.cuda.is_available() else "Bernoulli",
    "subsample": 0.66,
}
catboost_params_3 = {
    "iterations": 1000,
    "verbose": False,
    "max_depth": 5,
    "loss_function": "Logloss",
    "custom_metric": "AUC",
    "early_stopping_rounds": 100,
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
    "random_state": SEEDS[3],
    "metric_period": 50,
    # "bootstrap_type":"Poisson" if torch.cuda.is_available() else "Bernoulli",
    # "subsample": 0.8
}
catboost_params_4 = {
    "iterations": 1000,
    "verbose": False,
    # 16 is the largest tree depth CatBoost accepts (was 20).
    "max_depth": 16,
    "loss_function": "Logloss",
    "custom_metric": "AUC",
    "early_stopping_rounds": 100,
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
    "random_state": SEEDS[4],
    # bagging_temperature only applies to the Bayesian bootstrap, so request it explicitly.
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 1.0,
    "metric_period": 50,
    # "subsample": 0.8
}

# Settings shared by every LightGBM configuration below; each configuration
# only adds its own seed, iteration count, leaf size and bagging frequency.
_lgbm_shared = {
    "device_type": "gpu" if torch.cuda.is_available() else "cpu",
    "objective": "binary",
    "metric": "auc",
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
}

lgbm_params_0 = {
    **_lgbm_shared,
    "seed": SEEDS[0],
    "num_iterations": 500,
    "min_data_in_leaf": 50,
    "bagging_freq": 50,
}

lgbm_params_1 = {
    **_lgbm_shared,
    "seed": SEEDS[1],
    "num_iterations": 500,
    "min_data_in_leaf": 10,
    "bagging_freq": 50,
}

lgbm_params_2 = {
    **_lgbm_shared,
    "seed": SEEDS[2],
    "num_iterations": 1000,
    "min_data_in_leaf": 100,
    "bagging_freq": 30,
}

lgbm_params_3 = {
    **_lgbm_shared,
    "seed": SEEDS[3],
    "num_iterations": 500,
    "min_data_in_leaf": 50,
    "bagging_freq": 50,
}

lgbm_params_4 = {
    **_lgbm_shared,
    "seed": SEEDS[4],
    "num_iterations": 1000,
    "min_data_in_leaf": 50,
    "bagging_freq": 20,
}

# XGBoost configurations.
# `early_stopping_rounds` and `verbose_eval` are training options rather than
# booster parameters; they are kept here so one dict describes the whole run.
# Gradient-based sampling is documented as CUDA-only, so fall back to uniform
# sampling when no GPU is available.
_xgb_sampling_method = "gradient_based" if torch.cuda.is_available() else "uniform"

xgb_params_0 = {
    "early_stopping_rounds": 100,
    "verbose_eval": 200,
    "device": "gpu" if torch.cuda.is_available() else "cpu",
    "max_depth": 5,
    "subsample": 0.8,
    "sampling_method": _xgb_sampling_method,
    "lambda": 5,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "seed": SEEDS[0],
}

xgb_params_1 = {
    "early_stopping_rounds": 100,
    "verbose_eval": 200,
    "device": "gpu" if torch.cuda.is_available() else "cpu",
    "max_depth": 5,
    "subsample": 0.8,
    "sampling_method": _xgb_sampling_method,
    "lambda": 5,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "seed": SEEDS[1],
}

xgb_params_2 = {
    "early_stopping_rounds": 100,
    "verbose_eval": 200,
    "device": "gpu" if torch.cuda.is_available() else "cpu",
    "max_depth": 5,
    "subsample": 0.8,
    "sampling_method": _xgb_sampling_method,
    "lambda": 5,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "seed": SEEDS[2],
}

xgb_params_3 = {
    "early_stopping_rounds": 100,
    "verbose_eval": 200,
    "device": "gpu" if torch.cuda.is_available() else "cpu",
    "max_depth": 5,
    "subsample": 0.8,
    "sampling_method": _xgb_sampling_method,
    "lambda": 5,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "seed": SEEDS[3],
}

# This configuration leaves `sampling_method` at the library default.
xgb_params_4 = {
    "early_stopping_rounds": 100,
    "verbose_eval": 200,
    "device": "gpu" if torch.cuda.is_available() else "cpu",
    "max_depth": 5,
    "subsample": 0.8,
    "lambda": 5,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "seed": SEEDS[4],
}

## Are params correct?

In [None]:
# Smoke-test: make sure every parameter set is at least accepted by the
# constructor of its estimator. The failing set is reported together with the
# original error instead of being hidden behind a bare assertion.
# NOTE(review): constructors presumably validate little beyond argument names;
# most value errors only surface at fit time.
_param_sets_to_check = (
    (
        CatBoostClassifier,
        (catboost_params_0, catboost_params_1, catboost_params_2, catboost_params_3, catboost_params_4),
    ),
    (
        LGBMClassifier,
        (lgbm_params_0, lgbm_params_1, lgbm_params_2, lgbm_params_3, lgbm_params_4),
    ),
    (
        xgb.XGBClassifier,
        (xgb_params_0, xgb_params_1, xgb_params_2, xgb_params_3, xgb_params_4),
    ),
)

for _estimator_cls, _param_sets in _param_sets_to_check:
    for _set_idx, _param_set in enumerate(_param_sets):
        try:
            m = _estimator_cls(**_param_set)
        except Exception as exc:
            raise ValueError(
                "wrong params for {} (set #{}): {}".format(_estimator_cls.__name__, _set_idx, _param_set)
            ) from exc

## Models to train: CatBoost, LGBM, XGB

In [None]:
# Flat list of (model_type, params) pairs, grouped by type: 5 CatBoost,
# 5 LightGBM, 5 XGBoost. get_metadata_from_files unpacks every entry as a
# two-element pair, so the entries must not be wrapped in another list.
models_list = [
    ["cat", catboost_params_0],
    ["cat", catboost_params_1],
    ["cat", catboost_params_2],
    ["cat", catboost_params_3],
    ["cat", catboost_params_4],

    ["lgbm", lgbm_params_0],
    ["lgbm", lgbm_params_1],
    ["lgbm", lgbm_params_2],
    ["lgbm", lgbm_params_3],
    ["lgbm", lgbm_params_4],

    ["xgb", xgb_params_0],
    ["xgb", xgb_params_1],
    ["xgb", xgb_params_2],
    ["xgb", xgb_params_3],
    ["xgb", xgb_params_4],
]

## Validation: train_10

## Test: concatenated test files

In [None]:

val_filename = "/kaggle/input/train-one/trainset/train_10.csv"

s = time.time()
X_val = pd.read_csv(val_filename)

# Drop the service columns before preprocessing (preprocess keeps only the
# selected features, so they cannot be dropped afterwards).
X_val = X_val.drop(columns=['smpl', 'id'])
X_val = preprocess(X_val)
# Take the label out of the feature frame so it cannot leak into the models.
y_val = X_val.pop('target')
e = time.time()

print("{} to preprocess a file {}".format(e - s, val_filename))


def read_test(test_path=test_path):
    """
    Read every CSV file in ``test_path`` and return one preprocessed test frame.

    Files are read in sorted order and concatenated with a fresh index. The
    result is ``preprocess(..., is_test=True)`` of the concatenated data, so it
    still contains the ``id`` column.

    Raises
    ------
    FileNotFoundError
        If ``test_path`` contains no ``.csv`` files.
    """
    # os.listdir returns bare names; join them with the directory.
    csv_paths = sorted(
        os.path.join(test_path, file_name)
        for file_name in os.listdir(test_path)
        if file_name.endswith(".csv")
    )
    if not csv_paths:
        raise FileNotFoundError("No .csv files found in {}".format(test_path))

    # Concatenate once instead of growing the frame file by file.
    X_test = pd.concat((pd.read_csv(csv_path) for csv_path in csv_paths), ignore_index=True)

    # preprocess already leaves 'smpl' out; errors="ignore" keeps this a no-op.
    return preprocess(X_test, is_test=True).drop(columns=['smpl'], errors="ignore")

# Time the loading and preprocessing of the test files.
s = time.time()
X_test = read_test(test_path)
# Keep the identifiers aside so they are not used as a feature.
test_id = X_test.pop("id")
e = time.time()

print(f"{e - s} to preprocess test files")

In [None]:
# NOTE(review): train_1.csv originally pointed at the dataset root while every
# other file lives under trainset/; aligned with the others here. Confirm the
# actual location.
train_files = [
    "/kaggle/input/train-one/trainset/train_1.csv",
    "/kaggle/input/train-one/trainset/train_2.csv",
    "/kaggle/input/train-one/trainset/train_3.csv",
    "/kaggle/input/train-one/trainset/train_4.csv",
    "/kaggle/input/train-one/trainset/train_5.csv",
    "/kaggle/input/train-one/trainset/train_6.csv",
    "/kaggle/input/train-one/trainset/train_7.csv",
    "/kaggle/input/train-one/trainset/train_8.csv",
    "/kaggle/input/train-one/trainset/train_9.csv",
]

# X_test is a required argument of get_metadata_from_files.
train_metadata, test_metadata = get_metadata_from_files(
    X_val=X_val,
    X_test=X_test,
    y_val=y_val,
    models_list=models_list,
    train_files=train_files,
)

In [None]:
# Display the shapes of the validation and test meta-feature matrices.
train_metadata.shape, test_metadata.shape

In [None]:
# Compare the raw feature frames with the meta-feature matrices built from them.
print(X_val.shape, X_test.shape)
print(train_metadata.shape, test_metadata.shape)

In [None]:
# 'id' may already have been taken out when the test set was loaded; popping it
# a second time would raise KeyError, so only do it when the column is present.
if 'id' in X_test.columns:
    test_id = X_test.pop('id')

# Predictions on metadata

## Catboost

In [None]:
from sklearn.metrics import roc_auc_score

# Fixed settings of the meta-model; they belong to the estimator, not the grid.
# The built-in "AUC" metric is used so the same settings work on CPU and GPU.
# `use_best_model` is left at its default.
catboost_meta_init = {
    "verbose": 100,
    "eval_metric": "AUC",
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
    "early_stopping_rounds": 100,
}

# Search space: only the parameters that are actually tuned.
catboost_cv = {
    "max_depth": [3, 5],
    'l2_leaf_reg': [5, 10, 50]
}

catboost_meta = CatBoostClassifier(**catboost_meta_init)

# randomized_search takes the search space as its first argument and returns a
# dict with the keys 'params' and 'cv_results'.
catboost_meta_search = catboost_meta.randomized_search(
    catboost_cv,
    X=train_metadata,
    y=y_val,
    cv=5,
    # The grid is tiny, so sample every combination.
    n_iter=len(catboost_cv["max_depth"]) * len(catboost_cv["l2_leaf_reg"]),
    verbose=100,
    plot=True,
)
catboost_meta_params = catboost_meta_search["params"]
cv_results = catboost_meta_search["cv_results"]

cv_results

In [None]:
print("Best params for catboost metamodel\n")
print(catboost_meta_params)

# The meta-model was fitted on train_metadata, so it must predict on the
# matching test meta-features rather than on the raw test features.
predict_test(test_metadata, test_id, catboost_meta, name="catboost_metamodel")

## LogisticRegression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Grid values must be lists, and only valid solver/penalty pairs are listed:
# liblinear supports l1/l2 only, and elasticnet (saga only) needs an l1_ratio.
logreg_params = [
    {"solver": ["saga"], "penalty": ["l2", "l1", None]},
    {"solver": ["saga"], "penalty": ["elasticnet"], "l1_ratio": [0.5]},
    {"solver": ["liblinear"], "penalty": ["l2", "l1"]},
]

# Fixed settings go to the estimator instance; GridSearchCV needs an instance
# (not the class) and the scorer name "roc_auc".
logreg_meta = GridSearchCV(
    estimator=LogisticRegression(max_iter=3000, tol=1e-4, warm_start=True),
    param_grid=logreg_params,
    verbose=1,
    scoring="roc_auc",
)

logreg_meta.fit(train_metadata, y_val)

logreg_metamodel = logreg_meta.best_estimator_

In [None]:
print("Best params for LogReg metamodel\n")
# Call get_params so the parameters are printed, not the bound method.
print(logreg_metamodel.get_params())

print("Logreg metamodel best score: {}".format(logreg_meta.best_score_))

# The meta-model was fitted on train_metadata, so predict on the test meta-features.
predict_test(test_metadata, test_id, logreg_metamodel, name="logreg_metamodel")

# Initial data + model predictions


In [None]:
print(X_val.shape, X_test.shape)
# Append the meta-features to the original features, column-wise.
X_OOF_train = np.hstack((X_val, train_metadata))
X_OOF_test = np.hstack((X_test, test_metadata))
# Each shape is reported once.
print(X_OOF_train.shape, X_OOF_test.shape)

## Catboost

In [None]:
from sklearn.metrics import roc_auc_score

# Fixed settings of the model; they belong to the estimator, not the grid.
# The built-in "AUC" metric is used so the same settings work on CPU and GPU.
# `use_best_model` is left at its default.
catboost_oof_init = {
    "verbose": 100,
    "eval_metric": "AUC",
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
    "early_stopping_rounds": 100,
}

# Search space: only the parameters that are actually tuned.
catboost_cv = {
    "max_depth": [3, 5],
    'l2_leaf_reg': [1, 10],
}

catboost_oof = CatBoostClassifier(**catboost_oof_init)

# randomized_search takes the search space as its first argument and returns a
# dict with the keys 'params' and 'cv_results'; unpacking the dict itself would
# only yield those key names.
catboost_oof_search = catboost_oof.randomized_search(
    catboost_cv,
    X=X_OOF_train,
    y=y_val,
    cv=5,
    # The grid is tiny, so sample every combination.
    n_iter=len(catboost_cv["max_depth"]) * len(catboost_cv["l2_leaf_reg"]),
    verbose=100,
    plot=True,
)
catboost_oof_best = catboost_oof_search["params"]
cv_results = catboost_oof_search["cv_results"]

cv_results

# Submission

In [None]:
# # Path to the folder with the training data

# # path_train = '../train'



# # Path to the folder with the test data

# path_test = '../test'



# # Combine the test data into a single dataset `test`



# # Get the list of file paths in the test folder

# filenames_test = glob.glob(path_test + "/*.csv")



# # Create a list to collect the loaded test files

# data_files_test = []



# def compression(filename, features=['target', 'smpl', 'id']):
#   # Move the variables into the global scope
#   global data, base_info, transformed_data, result
#   # Read the data file
#   data = pd.read_csv(filename)
#   # Separate the base data
#   base_info = data[features]
#   # Return the processing result
#   return result



# # Read and process all test files, then add them to the list

# for filename in filenames_test:

#   data_files_test.append(compression(filename, features=['smpl', 'id']))



# # Combine the test data into a single dataset

# test_data = pd.concat(data_files_test, ignore_index=True)



# # Show the first 10 rows of the training data

# display(test_data.head(10))



# # Make sure we only have data from the test sample

# display(test_data['smpl'].value_counts())

In [None]:
# y_test_pred = gbdt_clf.predict_proba(test_data.drop('smpl', axis=1))



# # Convert the prediction to a Series

# y_test_pred = pd.Series(y_test_pred[:, 1])



# # Add the predictions to the dataset

# test_data['target'] = y_test_pred



# # Save the final ids and predictions in csv format

# test_data[['id', 'target']].to_csv('baseline_submission_case2.csv', index=False)
