# In [None]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

from pathlib import Path
import os
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import sklearn.metrics
from sklearn.model_selection import train_test_split
from argparse import ArgumentParser, Namespace
import numpy as np
import pandas as pd
import warnings
import optuna

from pyarrow.dataset import dataset

warnings.filterwarnings("ignore")
BASE_PATH = Path(os.getcwd())
DATA_PATH = "/kaggle/input/home-credit-credit-risk-model-stability/"
SEED = 617

import os
import pandas as pd
from pathlib import Path
from argparse import Namespace
from dataclasses import dataclass
from pyarrow.parquet import ParquetFile

# Maps the last character of a column name to a human-readable label.
# ColInfo.describe() appends the label to the column description when the
# column name ends with one of these characters.
POSTFIXES = {
    "P": "Transform DPD (Days Past Due)",
    "M": "Masking Categories",
    "A": "Transform Amount",
    "D": "Transform Date",
    "T": "Unspecified Transform",
    "L": "Unspecified Transform",
}


class RawFile:
    """A raw data file name split into its underscore-separated components.

    The stem (file name without its last extension) is split on ``_``. The
    first token is the *type*; when the second-to-last token is all digits it
    is the *depth* and the last token is the *index*; otherwise, when only the
    last token is all digits, it is the *depth*. The tokens in between form
    the *name*. For example ``train_static_0_1.parquet`` yields type
    ``train``, name ``static``, depth ``0``, index ``1``, format ``parquet``.

    Raises:
        ValueError: if ``file_name`` converts to an empty string.
    """

    def __init__(self, file_name: str = "") -> None:
        self.file_name = str(file_name)

        if not self.file_name:
            raise ValueError(f"file_name should be a non-empty string. Not {file_name}.")

        (
            self._type,
            self._name,
            self._depth,
            self._index,
            self._file_format,
        ) = self._parse_file_name()

    def __repr__(self) -> str:
        return f"{self.file_name}"

    def __str__(self) -> str:
        return self.file_name

    def __lt__(self, other) -> bool:
        # Ordering is by raw file name; let Python handle foreign operands.
        if not isinstance(other, RawFile):
            return NotImplemented
        return self.file_name < other.file_name

    @property
    def type(self) -> str:
        return self._type

    @property
    def depth(self) -> str:
        return self._depth

    @property
    def index(self) -> str:
        return self._index

    @property
    def format(self) -> str:
        return self._file_format

    @property
    def name(self) -> str:
        return self._name

    @property
    def fullname(self) -> str:
        """File name without its last extension."""
        return self.file_name.rsplit(".", 1)[0]

    def _parse_file_name(self) -> tuple[str, str, str, str, str]:
        """Return ``(type, name, depth, index, file_format)`` for the file name.

        Missing components are returned as empty strings, including the
        format when the file name has no extension.
        """
        fullname = self.fullname
        pieces = self.file_name.rsplit(".", 1)
        # A name without "." has no extension; report an empty format
        # instead of failing with IndexError.
        file_format = pieces[1] if len(pieces) == 2 else ""

        names = fullname.split("_")
        # The length guard keeps a stem without any "_" from raising
        # IndexError on names[-2].
        if len(names) >= 2 and names[-2].isdigit():
            return names[0], "_".join(names[1:-2]), names[-2], names[-1], file_format
        elif names[-1].isdigit():
            return names[0], "_".join(names[1:-1]), names[-1], "", file_format
        else:
            return names[0], "_".join(names[1:]), "", "", file_format

    def get_path(self, data_dir: Path = None) -> Path:
        """Return ``<data_dir>/<format>_files/<type>/<file_name>``.

        ``data_dir`` defaults to the module-level ``DATA_PATH``.
        """
        if data_dir is None:
            data_dir = DATA_PATH
        return Path(data_dir) / f"{self.format}_files" / self.type / self.file_name

    def startswith(self, keyword: str) -> bool:
        return self.file_name.startswith(keyword)


@dataclass
class ColInfo:
    """A column name that can look up its own human-readable description."""

    name: str

    def __repr__(self) -> str:
        return self.name

    def __str__(self) -> str:
        return self.name

    @property
    def desc(self) -> str:
        """Shortcut for :meth:`describe` with the default description file."""
        return self.describe()

    def describe(
            self,
            description_file: str = "feature_definitions.csv"
    ) -> str:
        """Return ``"<name>: <description>"`` for this column.

        The description is read from the ``Variable``/``Description`` columns
        of ``description_file`` under ``DATA_PATH``. When the column is not
        listed, its own name is used as the description. When the name ends
        with a character found in ``POSTFIXES``, the matching label is
        appended in parentheses.

        Args:
            description_file: CSV file name, relative to ``DATA_PATH``.
        """
        # DATA_PATH is a plain string; wrap it in Path so "/" joins path
        # segments instead of raising TypeError (str / str).
        description_df = pd.read_csv(
            Path(DATA_PATH) / description_file, usecols=["Variable", "Description"])

        matches = description_df.loc[
            description_df["Variable"] == self.name, "Description"
        ].values
        result_description = matches[0] if len(matches) else self.name

        # Slicing (rather than indexing) keeps an empty name from raising.
        postfix = self.name[-1:]
        if postfix in POSTFIXES:
            return f"{self.name}: {result_description} ({POSTFIXES[postfix]})"
        return f"{self.name}: {result_description}"


class RawInfo:
    """Locate and load the raw data files under ``<data_path>/<format>_files``.

    Files are listed per split directory (``type_``, e.g. ``"train"``) and
    parsed into :class:`RawFile` objects so they can be filtered by name and
    depth.
    """

    VALID_TYPES = ["", "train", "test"]
    VALID_DEPTHS = ["", "0", "1", "2"]

    def __init__(self, config: Namespace = None) -> None:
        """Set up paths and the file reader.

        Args:
            config: object exposing ``data_path`` and ``raw_format``
                attributes. Defaults to ``DATA_PATH`` and ``"parquet"``.

        Raises:
            FileNotFoundError: if ``<data_path>/<raw_format>_files`` is missing.
        """
        self.config = config
        if self.config is None:
            self.config = Namespace(**{
                "data_path": DATA_PATH,
                "raw_format": "parquet",
            })

        self.format = self.config.raw_format
        self.data_dir_path = Path(self.config.data_path)
        self.file_dir_path = self.data_dir_path / f"{self.format}_files"

        if not self.file_dir_path.exists():
            raise FileNotFoundError(f"{self.file_dir_path} does not exist.")

        self.reader = RawReader(self.format)

    def show_files(self, type_: str = "train") -> list[RawFile]:
        """Return every file of the given split, sorted by file name."""
        return sorted(RawFile(f) for f in os.listdir(self.file_dir_path / type_))

    def get_files(self, filename: str, *, depth: int = None, type_: str = "train") -> list[RawFile]:
        """Return the files whose parsed name is ``filename``, sorted by file name.

        When ``depth`` is given, only files at that depth are returned.
        """
        # show_files() is already sorted and filtering keeps that order, so
        # there is no need to sort again.
        matches = [f for f in self.show_files(type_) if f.name == filename]
        if depth is None:
            return matches
        return [f for f in matches if f.depth == str(depth)]

    def get_depths_by_name(self, file_name: str, type_: str = "train") -> list[int]:
        """Return the sorted distinct depths available for ``file_name``.

        Files without a depth token (empty ``depth``) are skipped; converting
        the empty string with ``int`` would raise ValueError.
        """
        return sorted({
            int(f.depth)
            for f in self.get_files(file_name, type_=type_)
            if f.depth
        })

    def get_files_by_depth(self, depth: int, type_: str = "train") -> list[RawFile]:
        """Return every file of the given split at ``depth``."""
        return [f for f in self.show_files(type_) if f.depth == str(depth)]

    def read_raw(
        self,
        file_name: str,
        *,
        depth: int = None,
        type_: str = "train",
    ) -> pd.DataFrame:
        """Read all files matching ``file_name``/``depth`` and concatenate them.

        Raises:
            FileNotFoundError: if no file matches.
        """
        raw_files = self.get_files(file_name, depth=depth, type_=type_)

        if not raw_files:
            raise FileNotFoundError(f"{file_name} (depth: {depth}) does not exist in {type_} files.")

        return pd.concat([self.reader(rf.get_path(self.data_dir_path)) for rf in raw_files])


class RawReader:
    """Read raw files, or only their column names, in parquet or csv format.

    Raises:
        ValueError: if ``format_`` is neither ``"parquet"`` nor ``"csv"``.
    """

    def __init__(self, format_: str = "parquet") -> None:
        self.format = format_

        if format_ == "parquet":
            self.reader = pd.read_parquet
            self.column_getter = self._get_parquet_columns
        elif format_ == "csv":
            self.reader = pd.read_csv
            self.column_getter = self._get_csv_columns
        else:
            raise ValueError(f"format_ should be either 'parquet' or 'csv'. Not {format_}.")

    def read(self, file_path: Path) -> pd.DataFrame:
        """Load the whole file into a DataFrame."""
        return self.reader(file_path)

    def columns(self, file_path: Path) -> "list[ColInfo]":
        """Return the file's columns wrapped in ``ColInfo`` objects."""
        return [ColInfo(c) for c in self.column_getter(file_path)]

    def _get_csv_columns(self, file_path: Path) -> list[str]:
        """Return the csv header names without loading any data rows."""
        return list(self.reader(file_path, nrows=0).columns)

    def _get_parquet_columns(self, file_path: Path) -> list[str]:
        """Return the parquet column names from the file schema."""
        # pyarrow's ParquetFile exposes no ``columns`` attribute; the column
        # names live on the Arrow schema.
        return list(ParquetFile(file_path).schema_arrow.names)

    def __call__(self, file_path) -> pd.DataFrame:
        return self.read(file_path)
def get_config():
    """Parse the command line into a namespace of path and format settings.

    Options (all strings):
        --base_path   defaults to the current working directory
        --data_path   defaults to ``<cwd>/home-credit-credit-risk-model-stability``
        --raw_format  defaults to ``"parquet"``
    """
    cwd = os.getcwd()
    option_defaults = (
        ("--base_path", cwd),
        ("--data_path", os.path.join(cwd, "home-credit-credit-risk-model-stability")),
        ("--raw_format", "parquet"),
    )

    cli = ArgumentParser()
    for flag, default in option_defaults:
        cli.add_argument(flag, type=str, default=default)

    return cli.parse_args()


def prepare_base_data(conf: Namespace = None, type_: str = "train"):
    """Load the base table and left-join the depth-0 static tables onto it.

    Reads ``base``, ``static`` (depth 0) and ``static_cb`` (depth 0) for the
    requested split through ``RawInfo`` and joins them on ``case_id``.
    Clashing columns get the suffixes ``_base``/``_static`` in the first join
    and ``_static_cb`` (right side only) in the second.

    Args:
        conf: configuration passed to ``RawInfo``; ``None`` uses its defaults.
        type_: split directory to read from.

    Returns:
        The joined DataFrame.
    """
    print("prepare_base_data ...")
    raw = RawInfo(conf)

    base_df = raw.read_raw("base", type_=type_)
    static_df = raw.read_raw("static", depth=0, type_=type_)
    static_cb_df = raw.read_raw("static_cb", depth=0, type_=type_)

    joined_df = (
        base_df
        .merge(static_df, on="case_id", how="left", suffixes=("_base", "_static"))
        .merge(static_cb_df, on="case_id", how="left", suffixes=("", "_static_cb"))
    )
    print(f"base shape: {base_df.shape} & static shape: {static_df.shape} & static_cb shape: {static_cb_df.shape} & joined shape: {joined_df.shape}")

    return joined_df


def devval(df):
    """Add a ``devval`` split label column to *df* in place.

    Rows are labelled from their ``MONTH`` value (bounds inclusive):
    0 for 201909-202008, 1 for 201901-201908, and 2 for everything else.
    Nothing is returned.
    """
    month = df["MONTH"]
    is_dev = month.between(201909, 202008)
    is_val = month.between(201901, 201908)
    # The outer test wins when both match, so the first condition has priority.
    df["devval"] = np.where(is_dev, 0, np.where(is_val, 1, 2))


def get_tree_selector(
        df: pd.DataFrame,
        target: str,
        n_estimators: int = 10,
        max_features: int = None,
) -> SelectFromModel:
    """Fit an extra-trees classifier and wrap it in a feature selector.

    Args:
        df: training frame containing the feature columns and ``target``.
        target: name of the label column in ``df``.
        n_estimators: number of trees in the ``ExtraTreesClassifier``.
        max_features: forwarded to ``SelectFromModel``.

    Returns:
        A ``SelectFromModel`` built on the already fitted classifier
        (``prefit=True``).
    """
    print("select_features ...")
    X = df.drop(target, axis=1)
    y = df[target]

    # Seed the forest with the module-level SEED so the fitted importances,
    # and therefore the selected feature set, are the same on every run.
    clf = ExtraTreesClassifier(n_estimators=n_estimators, random_state=SEED)
    clf = clf.fit(X, y)
    return SelectFromModel(clf, prefit=True, max_features=max_features)


def exclude_object_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without its ``object``-dtype columns, printing a progress line."""
    print("exclude_object_columns ...")
    non_object = df.select_dtypes(exclude=["object"])
    return non_object


def objective(trial, X, y):
    """Optuna objective: train LightGBM on a 75/25 split and score the hold-out.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: feature matrix.
        y: binary labels aligned with ``X``.

    Returns:
        ROC AUC of the trained booster on the held-out 25 %.
    """
    # Fix the split with the module-level SEED so every trial is scored on
    # the same hold-out rows. With a fresh random split per trial, differences
    # between trial values mix hyper-parameter effects with split noise.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.25, random_state=SEED)
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
    }

    # Result recorded from an earlier run, kept for reference:
    #   Trial 40 finished with value: 0.7972759296627593 and parameters:
    #   {'lambda_l1': 1.2241636481438304e-05,
    #    'lambda_l2': 9.985408160774956,
    #    'num_leaves': 211,
    #    'feature_fraction': 0.5443119666214574,
    #    'bagging_fraction': 0.8414338881950802,
    #    'bagging_freq': 5,
    #    'min_child_samples': 70}. Best is trial 40 with value: 0.7972759296627593.

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(valid_x)
    return sklearn.metrics.roc_auc_score(valid_y, preds)


def inference(selector, model, X):
    """Score *model* on *X* and return the ROC AUC against ``X["target"]``.

    *X* is reduced to its non-object columns, missing values are filled with
    -99999999, ``target`` is dropped, and the result is passed through
    ``selector.transform`` before calling ``model.predict``.
    """
    numeric = exclude_object_columns(X)
    features = numeric.fillna(-99999999).drop(columns="target")
    scores = model.predict(selector.transform(features))
    return sklearn.metrics.roc_auc_score(X["target"], scores)

# ---------------------------------------------------------------------------
# Load the joined base/static tables for the train and test splits.
# ---------------------------------------------------------------------------
train_base_static = prepare_base_data()
test_base_static = prepare_base_data(type_="test")

# devval() adds a "devval" column in place: 0 = dev, 1 = val, 2 = the rest.
devval(train_base_static)
dev, val, test = (
    train_base_static[train_base_static["devval"] == label].drop("devval", axis=1)
    for label in (0, 1, 2)
)

# ---------------------------------------------------------------------------
# Fit the tree-based feature selector on dev and apply it to every partition.
# ---------------------------------------------------------------------------
selector = get_tree_selector(
    exclude_object_columns(dev).fillna(-99999999), "target")

dev_t, val_t, test_t = (
    selector.transform(
        exclude_object_columns(part).fillna(-99999999).drop("target", axis=1))
    for part in (dev, val, test)
)
print(dev_t.shape, val_t.shape, test_t.shape)

# The submission features use the same non-object columns as dev (minus target).
eval_t = selector.transform(
    test_base_static[
        exclude_object_columns(dev).drop("target", axis=1).columns
    ].fillna(-99999999)
)
print(eval_t.shape)

# ---------------------------------------------------------------------------
# Hyper-parameter search on the dev partition.
# ---------------------------------------------------------------------------
study = optuna.create_study(direction="maximize")
study.optimize(
    lambda t: objective(t, dev_t, dev["target"]),
    n_trials=200,
    show_progress_bar=True,
)

print("Best trial:")
trial = study.best_trial

# Retrain on all of dev with the best parameters layered over the fixed ones.
best_params = trial.params
basic_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    **best_params,
}

dtrain = lgb.Dataset(dev_t, label=dev["target"])
best_model = lgb.train(basic_params, dtrain)

# ---------------------------------------------------------------------------
# Report AUROC on each labelled partition.
# ---------------------------------------------------------------------------
dev_auroc = inference(selector, best_model, dev)
print(f"dev auroc: {dev_auroc}")

val_auroc = inference(selector, best_model, val)
print(f"val auroc: {val_auroc}")

test_auroc = inference(selector, best_model, test)
print(f"test auroc: {test_auroc}")


pre_selector_columns = exclude_object_columns(dev).drop("target", axis=1).columns

# ---------------------------------------------------------------------------
# Score the submission rows and write the submission file.
# ---------------------------------------------------------------------------
eval_pred = best_model.predict(eval_t)
submission = pd.DataFrame({
    "case_id": test_base_static["case_id"].to_numpy(),
    "score": eval_pred,
})
submission = submission.set_index('case_id')
submission.to_csv("./submission.csv")