Обучить алгоритмы LightGBM и XGBoost, получить OOF прогнозы, оценить корреляцию прогнозов на обучающей выборке. Применить модели на тестовую выборку и оценить корреляцию.

Усреднить прогнозы с помощью арифметического среднего, геометрического среднего и усреднить ранги, сделать выводы о качестве отдельных моделей и о качестве комбинации.

Обучить CatBoost, получить OOF прогнозы и выполнить задание 1 для трех моделей.
Выполнить задание 2 для трех моделей.

(опция) Объединить OOF-прогнозы для трех моделей и обучить алгоритм Логистической регрессии (и любой другой, на ваше усмотрение). Сделать выводы о достигаемом качестве, сравнить достигаемое качество с качеством отдельных моделей и моделей, полученных в п.2 и п.4.

(опция) Обучить алгоритм RandomForest (желательно подтюнить параметры) и добавить к построенным ранее моделям. Выполнить задание 5.

In [58]:
import time
import numpy as np
import pandas as pd
import catboost as cb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

pd.set_option("display.max_columns", 30)

## Useful Functions

In [6]:
def get_input(
    data_path: str,
    base_path: str = "H:/PyProgects/gb_Competitive_Data_Analysis",
) -> pd.DataFrame:
    """
    Read a CSV file and print basic information about the dataset.

    Parameters
    ----------
    data_path: str
        File name, relative to ``base_path``.

    base_path: str, optional
        Directory that contains the data files. The default is the
        location that used to be hard-coded in this function, so existing
        calls keep working; pass another directory to run elsewhere.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Loaded dataset; all column names are converted to lower case.

    """
    data = pd.read_csv(f"{base_path}/{data_path}")
    # Lower-case the header so the rest of the notebook can use one spelling.
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data


def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Cross-validation for a CatBoost classifier.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training.

    y: pandas.core.frame.Series
        Target vector used for training.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list of str, optional, default = None
        Names of the categorical features.
        ``None`` (the default) means no categorical features are used.

    Returns
    -------
    estimators: list
        Fitted models, one per fold.

    oof_preds: np.array
        Out-of-fold predictions (probability of the positive class),
        aligned positionally with the rows of ``X``.

    """
    # Normalise the optional argument: the original code indexed X with None
    # and crashed when the documented default was used.
    if categorical is None:
        categorical = []
    elif isinstance(categorical, str):
        categorical = [categorical]
    else:
        categorical = list(categorical)

    # Work on a copy so the string conversion below does not silently
    # change the caller's DataFrame.
    X = X.copy()
    if categorical:
        X[categorical] = X[categorical].astype(str)
    y = pd.Series(y)

    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        # cv.split yields positional indices, so select with iloc; label
        # based selection only worked by accident for a 0..n-1 index.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, cat_features=categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


def catboost_hold_out_validation(params, X, y, split_params = (0.7, 0.2, 0.1), categorical = None):
    """
    Hold-out validation for a CatBoost classifier.

    Parameters
    ----------
    params: dict
        Model hyperparameters, forwarded to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training.

    y: pandas.core.frame.Series
        Target vector used for training.

    split_params: sequence of float, optional, default = (0.7, 0.2, 0.1)
        Split shares. The first value is the train share; if a third value
        is present, that share of all rows is carved out of the validation
        part as a test set.

    categorical: list of str, optional, default = None
        Names of the categorical features.
        Optional, not used by default.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Fitted CatBoost classifier.

    test_prediction: np.array, optional
        Predictions for the test part.
        Returned only when ``split_params`` contains 3 values.

    """
    # Split X and y in one call so both always get the same partition.
    x_train, x_valid, y_train, y_valid = train_test_split(
        X, y, train_size=split_params[0], random_state=27
    )

    with_test = len(split_params) == 3
    if with_test:
        # The test size is an absolute row count derived from the full sample.
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test, y_valid, y_test = train_test_split(
            x_valid, y_valid, test_size=test_size, random_state=72
        )

    model = cb.CatBoostClassifier(**params)
    model.fit(
        x_train, y_train, cat_features=categorical,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if with_test:
        test_prediction = model.predict_proba(x_test)[:, 1]
        test_score = roc_auc_score(y_test, test_prediction)
        print(f"Test Score = {round(test_score, 4)}")

        # The original returned the undefined name `estimator` here.
        return model, test_prediction

    return model

In [7]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Build additional features from the client profile.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Feature matrix with the raw client profile.

    copy: bool, optional, default = True
        Whether to work on a copy of ``X``. With ``False`` the new columns
        are added to the passed DataFrame itself.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Client profile extended with the engineered features.

    """
    if copy:
        X = X.copy()

    # 365243 is replaced by NaN before the column is used as a divisor below.
    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    scoring_columns = [
        "external_scoring_rating_1",
        "external_scoring_rating_2",
        "external_scoring_rating_3",
    ]
    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X["external_scoring_rating_1"] * 2 + X["external_scoring_rating_2"] * 1 + X["external_scoring_rating_3"] * 3

    # Row-wise aggregates of the three external scores. The numpy function
    # is looked up by name with getattr instead of eval().
    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = getattr(np, function_name)(X[scoring_columns], axis=1)

    # Ratios between the main financial indicators
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity'] / X['total_salary']
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']

    # Financial indicators relative to age and time-based features
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Ratios between time-based features
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    # NOTE(review): despite the "ratio_" name this is a product; kept as is
    # because changing it would alter the feature values - confirm intent.
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Credit amounts multiplied by each external score; the original author
    # describes this as the expected default loss.
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    return X

## Base Tables

In [8]:
# Load the labelled and unlabelled application lists, then stack them into
# one frame with a fresh 0..n-1 index.
train, test = get_input("train.csv"), get_input("test.csv")

data = pd.concat([train, test], axis=0).reset_index(drop=True)
data.head(n=2)

train.csv: shape = 110093 rows, 3 cols
test.csv: shape = 165141 rows, 2 cols


Unnamed: 0,application_number,target,name_contract_type
0,123687442,0.0,Cash
1,123597908,1.0,Cash


## client_profile

In [9]:
# Read the raw client profile and extend it with the engineered features.
client_profile = create_client_profile_features(get_input("client_profile.csv"))
client_profile.head(n=2)

client_profile.csv: shape = 250000 rows, 24 cols


  overwrite_input=overwrite_input)


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,...,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,...,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,...,23.155971,11.644456,,,,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462


In [10]:
# Attach profile features to every application; applications without a
# profile keep NaN in the profile columns (left join).
data = pd.merge(data, client_profile, how="left", on="application_number")

## baseline

In [11]:
# Rows without a target value form the test part.
mask = data["target"].isnull()
features_to_drop = ["application_number", "target"]

train, test = data.loc[~mask], data.loc[mask]

target, test_id = train["target"], test["application_number"]
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

categorial = train.dtypes[train.dtypes == "object"].index
# Keep the original column order so the list is the same on every run
# (a set difference has no defined order).
categorial_names = set(categorial)
numerical = [col for col in train.columns if col not in categorial_names]

# The ratio features can contain +/-inf after a division by zero.
# Clean both parts the same way; the original only cleaned train.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [59]:
class TargetEncoding(BaseEstimator, TransformerMixin):
    """
    Smoothed mean-target encoding of a single categorical feature.

    ``fit_transform`` encodes every row with statistics computed on the
    other folds only (out-of-fold encoding). ``transform`` encodes new data
    with statistics computed on the whole fitted sample.

    Parameters
    ----------
    alpha: float, optional, default = 0
        Smoothing strength: the category mean is blended with the global
        target mean as ``(mean * size + global_mean * alpha) / (size + alpha)``.

    folds: int, optional, default = 5
        Number of KFold splits used for the out-of-fold encoding.

    Notes
    -----
    ``X`` is expected to be a one-dimensional feature (pandas Series).
    Categories that were not seen in the fitting data, and missing values,
    are encoded as NaN.

    """

    def __init__(self,
                 alpha: float = 0,
                 folds: int = 5,
                ):
        self.folds = folds
        self.alpha = alpha
        self.features = None
        self.cv = None
        # Mapping built on the whole fitted sample; used by transform().
        self.full_encoding = None

    def _smoothed_means(self, feature, target, global_mean):
        """Return a {category: smoothed target mean} dict for the given rows."""
        # Built from raw values so the two inputs are paired by position.
        stats = pd.DataFrame(
            {"feature": feature.to_numpy(), "target": target.to_numpy()}
        )
        stats = stats.groupby("feature")["target"].agg(["mean", "size"])
        score = stats["mean"] * stats["size"] + global_mean * self.alpha
        score = score / (stats["size"] + self.alpha)
        return score.to_dict()

    def fit(self, X, y=None):
        """Compute per-fold and whole-sample encodings of ``X`` against ``y``."""
        if y is None:
            raise ValueError("TargetEncoding.fit requires the target vector y.")

        feature = pd.Series(X)
        target = pd.Series(np.asarray(y), index=feature.index)

        self.features = {}
        self.cv = KFold(
            n_splits=self.folds, shuffle=True, random_state=27
        )
        global_mean = target.mean()

        for fold_number, (train_idx, _) in enumerate(self.cv.split(feature, target), start=1):
            # KFold yields positional indices, hence iloc.
            self.features[f"fold_{fold_number}"] = self._smoothed_means(
                feature.iloc[train_idx], target.iloc[train_idx], global_mean
            )

        self.full_encoding = self._smoothed_means(feature, target, global_mean)
        return self

    def transform(self, X, y=None):
        """Encode new data with the whole-sample encoding learned in ``fit``."""
        if self.features is None or getattr(self, "full_encoding", None) is None:
            # `features` always exists after __init__, so an attribute
            # existence check cannot detect an unfitted encoder.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This TargetEncoding instance is not fitted yet. "
                "Call 'fit' or 'fit_transform' first."
            )
        return pd.Series(X).map(self.full_encoding).astype("float64")

    def fit_transform(self, X, y=None):
        """Fit on ``X``/``y`` and return the out-of-fold encoding of ``X``."""
        self.fit(X, y)
        feature = pd.Series(X)
        x_transformed = pd.Series(
            np.nan, index=feature.index, name=feature.name, dtype="float64"
        )

        # The splitter has a fixed random_state, so it reproduces the folds
        # used in fit(); each fold is encoded with the mapping learned on
        # the remaining folds.
        for fold_number, (_, valid_idx) in enumerate(self.cv.split(feature, y), start=1):
            encoded_fold = feature.iloc[valid_idx].map(
                self.features[f"fold_{fold_number}"]
            )
            x_transformed.iloc[valid_idx] = encoded_fold.to_numpy(dtype="float64")
        return x_transformed

In [64]:
# Add a numeric target-encoded twin ("num_<name>") for every categorical
# column of the training frame.
encoder = TargetEncoding(alpha=10)
for feature in categorial:
    encoded = encoder.fit_transform(train[feature], target)
    train[f'num_{feature}'] = encoded.astype('float64')


In [68]:
# The raw categorical columns are no longer needed once encoded.
train.drop(columns=categorial, inplace=True)

## KFold

In [70]:
# Hyperparameters forwarded to cb.CatBoostClassifier by
# catboost_cross_validation.
cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
}

# Shuffled 5-fold split with a fixed seed.
cv = KFold(n_splits=5, random_state=1234123, shuffle=True)

# An empty list is passed as `categorical`: no columns are declared
# categorical for this run.
estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=train, y=target, cv=cv, categorical=[]
)

Wed Dec 16 15:34:57 2020, Cross-Validation, 110093 rows, 52 cols
0:	test: 0.6075803	test1: 0.5924239	best: 0.5924239 (0)	total: 40.2ms	remaining: 1m 20s
10:	test: 0.7028604	test1: 0.6967245	best: 0.6971187 (9)	total: 457ms	remaining: 1m 22s
20:	test: 0.7054537	test1: 0.7003604	best: 0.7005032 (19)	total: 870ms	remaining: 1m 22s
30:	test: 0.7079809	test1: 0.7024714	best: 0.7024714 (30)	total: 1.29s	remaining: 1m 21s
40:	test: 0.7096953	test1: 0.7030945	best: 0.7033026 (35)	total: 1.69s	remaining: 1m 20s
50:	test: 0.7108289	test1: 0.7038224	best: 0.7044007 (45)	total: 2.09s	remaining: 1m 20s
60:	test: 0.7109947	test1: 0.7043801	best: 0.7047722 (59)	total: 2.5s	remaining: 1m 19s
70:	test: 0.7124290	test1: 0.7055164	best: 0.7055164 (70)	total: 2.91s	remaining: 1m 19s
80:	test: 0.7126908	test1: 0.7059055	best: 0.7059055 (80)	total: 3.34s	remaining: 1m 19s
90:	test: 0.7132368	test1: 0.7062170	best: 0.7063815 (89)	total: 3.75s	remaining: 1m 18s
100:	test: 0.7131543	test1: 0.7062286	best: 0.70

In [19]:
# Preview: class probabilities from the first fold's model on the whole
# training frame. Most of these rows were in that model's training fold,
# so this is not an out-of-fold estimate.
estimators[0].predict_proba(train)

array([[0.97937232, 0.02062768],
       [0.91751627, 0.08248373],
       [0.94069556, 0.05930444],
       ...,
       [0.91751627, 0.08248373],
       [0.96338491, 0.03661509],
       [0.92054879, 0.07945121]])

In [13]:
# ROC-AUC of the stitched out-of-fold predictions over the full train set.
oof_score = roc_auc_score(target, oof_preds)
print(f"OOF-score = {round(oof_score, 5)}")
# Recorded run, per-fold scores: [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# Recorded run, OOF-score = 0.72481

OOF-score = 0.72481


## Подготовка прогноза

In [14]:
# `test` is a slice of `data`; copy it before writing columns into it.
test = test.copy()
test[numerical] = test[numerical].astype(float)

# The estimators must see the same columns they were trained on. Where the
# training frame holds a target-encoded "num_<name>" column, build the same
# column for test from the smoothed category means of all labelled rows.
labelled_rows = data.loc[~mask]
global_target_mean = target.mean()
for feature in categorial:
    encoded_name = f"num_{feature}"
    if encoded_name not in train.columns:
        continue
    stats = labelled_rows.groupby(feature)["target"].agg(["mean", "size"])
    smoothed = (
        (stats["mean"] * stats["size"] + global_target_mean * encoder.alpha)
        / (stats["size"] + encoder.alpha)
    )
    # Categories absent from the labelled rows (and missing values) stay NaN.
    test[encoded_name] = test[feature].map(smoothed).astype("float64")

# Categorical columns still present in the training frame are passed to the
# model as strings, as before.
raw_categorical = [col for col in categorial if col in train.columns]
if raw_categorical:
    test[raw_categorical] = test[raw_categorical].astype(str)

# Same columns, same order as the training frame.
test = test[train.columns]

# Sum of positive-class probabilities over the fold models; the average is
# taken when the submission is built.
y_pred = np.zeros(test.shape[0])
for estimator in estimators:
    y_pred += estimator.predict_proba(test)[:, 1]

In [15]:
 = pd.DataFrame({
    "APPLICATION_NUMBER": test_id,
    "TARGET": y_pred / cv.n_splits
})
y_pred.to_csv("./geekbrains-competitive-data-analysis/baseline_submit.csv", index=False)

SyntaxError: invalid syntax (<ipython-input-15-e2796c5ff867>, line 1)

In [20]:
import xgboost as xgb

In [21]:
# Hyperparameters for xgb.XGBClassifier.
# NOTE(review): "booster" is "gblinear" while "max_depth" and "gamma" look
# like tree-booster settings - presumably they have no effect here; confirm
# against the XGBoost parameter docs or switch the booster.
params = {
    "booster": "gblinear",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "reg_lambda": 100,
    "max_depth": 4,
    "gamma": 10,
    "nthread": 6,
    "seed": 27
}



In [None]:
# Cross-validate XGBoost itself. The original cell built an XGBClassifier
# but then trained CatBoost models through catboost_cross_validation, and
# passed categorical columns that had already been dropped from `train`.
cv = KFold(n_splits=5, random_state=1234123, shuffle=True)

estimators_xgboost = []
oof_preds_xgboost = np.zeros(train.shape[0])
xgboost_fold_scores = []

for fold, (train_idx, valid_idx) in enumerate(cv.split(train, target), start=1):
    x_train, x_valid = train.iloc[train_idx], train.iloc[valid_idx]
    y_train, y_valid = target.iloc[train_idx], target.iloc[valid_idx]

    # A fresh model per fold, so every list entry is an independent estimator.
    model = xgb.XGBClassifier(**params)
    # NOTE(review): the earlier draft passed early_stopping_rounds and
    # eval_metric to fit(); where XGBoost accepts them depends on the
    # installed version, so early stopping is left out - add it back in the
    # form your version supports.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_valid, y_valid)],
        verbose=10
    )

    oof_preds_xgboost[valid_idx] = model.predict_proba(x_valid)[:, 1]
    score = roc_auc_score(y_valid, oof_preds_xgboost[valid_idx])
    print(f"Fold {fold}, Valid score = {round(score, 5)}")
    xgboost_fold_scores.append(round(score, 5))
    estimators_xgboost.append(model)

print(f"Score by each fold: {xgboost_fold_scores}")

In [25]:
import catboost as cb

In [28]:
# 5-fold splitter created without shuffle or seed arguments; used by the
# manual training loops below.
kf = KFold(n_splits=5)

In [71]:
# Per-fold CatBoost models and their validation-fold class probabilities.
estimators_catboost = []
oof_preds_catboost = []

cb_params = {
    "n_estimators": 2000,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
}
for train_index, test_index in kf.split(train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_valid = train.iloc[train_index], train.iloc[test_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]

    # Create a new model for every fold. The original refit one shared
    # instance, so the list ended up with five references to the same
    # (last-fitted) model.
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X=X_train,
        y=y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)]
    )

    estimators_catboost.append(model)
    # One (n_valid_rows, 2) probability array per fold, in fold order.
    oof_preds_catboost.append(model.predict_proba(X_valid))

TRAIN: [ 22019  22020  22021 ... 110090 110091 110092] TEST: [    0     1     2 ... 22016 22017 22018]
0:	test: 0.5833974	test1: 0.5803869	best: 0.5803869 (0)	total: 39.3ms	remaining: 1m 18s
10:	test: 0.7003028	test1: 0.6907661	best: 0.6918914 (8)	total: 457ms	remaining: 1m 22s
20:	test: 0.7056419	test1: 0.6965231	best: 0.6969369 (19)	total: 856ms	remaining: 1m 20s
30:	test: 0.7088258	test1: 0.6992240	best: 0.6992240 (30)	total: 1.27s	remaining: 1m 20s
40:	test: 0.7114272	test1: 0.7022760	best: 0.7022760 (40)	total: 1.67s	remaining: 1m 19s
50:	test: 0.7123843	test1: 0.7022496	best: 0.7028276 (42)	total: 2.06s	remaining: 1m 18s
60:	test: 0.7136669	test1: 0.7037636	best: 0.7039035 (58)	total: 2.46s	remaining: 1m 18s
70:	test: 0.7152869	test1: 0.7057518	best: 0.7058914 (69)	total: 2.87s	remaining: 1m 17s
80:	test: 0.7164500	test1: 0.7062206	best: 0.7062206 (80)	total: 3.27s	remaining: 1m 17s
90:	test: 0.7181151	test1: 0.7074864	best: 0.7074864 (90)	total: 3.69s	remaining: 1m 17s
100:	test

In [24]:
import lightgbm as lgb

In [72]:


# Hyperparameters forwarded to lgb.LGBMClassifier.
lgbm_params = {
    'boosting_type': 'gbdt',
    'n_estimators': 750,
    'learning_rate': 0.005134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    'subsample': 0.7,
    'is_unbalance': False,
    'random_state': 42,
    'silent': -1,
    'verbose': -1
}

# Per-fold LightGBM models and their validation-fold class probabilities.
estimators_lgbm = []
oof_preds_lgbm = []
for train_index, test_index in kf.split(train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_valid = train.iloc[train_index], train.iloc[test_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[test_index]

    # Create a new model for every fold. The original refit one shared
    # instance, so the list ended up with five references to the same
    # (last-fitted) model.
    model = lgb.LGBMClassifier(**lgbm_params)
    model.fit(
        X=X_train,
        y=y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=25,
        eval_metric="auc",
        verbose=10
    )

    estimators_lgbm.append(model)
    # One (n_valid_rows, 2) probability array per fold, in fold order.
    oof_preds_lgbm.append(model.predict_proba(X_valid))


TRAIN: [ 22019  22020  22021 ... 110090 110091 110092] TEST: [    0     1     2 ... 22016 22017 22018]
Training until validation scores don't improve for 25 rounds
[10]	training's auc: 0.725574	training's binary_logloss: 0.276791	valid_1's auc: 0.704704	valid_1's binary_logloss: 0.28266
[20]	training's auc: 0.727874	training's binary_logloss: 0.274252	valid_1's auc: 0.706072	valid_1's binary_logloss: 0.280582
[30]	training's auc: 0.729626	training's binary_logloss: 0.272008	valid_1's auc: 0.70735	valid_1's binary_logloss: 0.278756
[40]	training's auc: 0.731382	training's binary_logloss: 0.270048	valid_1's auc: 0.708159	valid_1's binary_logloss: 0.277182
[50]	training's auc: 0.732201	training's binary_logloss: 0.268259	valid_1's auc: 0.708417	valid_1's binary_logloss: 0.275746
[60]	training's auc: 0.733551	training's binary_logloss: 0.266624	valid_1's auc: 0.708711	valid_1's binary_logloss: 0.274458
[70]	training's auc: 0.735153	training's binary_logloss: 0.265104	valid_1's auc: 0.70947