In [60]:
import shap
import pickle

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from ydata_profiling import ProfileReport
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import FeatureDrift

from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from catboost import (
    CatBoostRegressor,
    CatBoostClassifier,
    Pool, 
    cv, 
    EShapCalcType, 
    EFeaturesSelectionAlgorithm
)


# Notebook-wide pandas display limits: no cap on rows, up to 200 columns.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, option_value)

# Функции

### Функция для расчета метрики

In [61]:
def weighted_mean_absolute_error(y_true, y_pred, weights):
    '''
    Mean of the per-observation weighted absolute errors.

    Computes ``mean(weights * |y_true - y_pred|)``. The result is divided by
    the number of terms, not by the sum of the weights.

    Parameters
    ----------
    y_true: ndarray
        Ground truth
    y_pred: ndarray
        Array of predictions
    weights: ndarray
        Per-observation weights multiplied into the absolute errors

    Returns
    -------
    wmae: float
        Weighted mean absolute error

    Notes
    -----
    When pandas objects are passed, the arithmetic follows pandas index
    alignment and ``.mean()`` skips the resulting NaN entries.

    References
    ----------
    .. [1] https://kaggle-metrics.readthedocs.io/en/latest/_modules/kaggle_metrics/regression.html

    '''
    absolute_errors = np.abs(y_true - y_pred)
    weighted_errors = weights * absolute_errors
    return weighted_errors.mean()

### Функция для обучения базовой модели

In [62]:
def fit_catboost_clf(
    features_train,
    features_val,
    target_train,
    target_val,
    cat_features,
    n_iter=None,
    lr=None,
    loss='MultiClass',
    plot=True,
    train_weight=None,
    val_weight=None,
):
    """
    Fit a CatBoostClassifier on a train pool, evaluated on a validation pool.

    Parameters
    ----------
    features_train, features_val
        Feature data for the train and validation pools.
    target_train, target_val
        Labels for the train and validation pools.
    cat_features
        Categorical feature specification forwarded to both pools.
    n_iter
        Forwarded as ``iterations``; ``None`` is passed through unchanged.
    lr
        Forwarded as ``learning_rate``; ``None`` is passed through unchanged.
    loss
        Forwarded as ``loss_function``.
    plot
        Forwarded to ``fit`` as ``plot``.
    train_weight, val_weight
        Optional weights for the train and validation pools.

    Returns
    -------
    The fitted CatBoostClassifier (seed 42, logging every 100 iterations,
    early stopping after 15 rounds).
    """
    # Build the train pool first, then the validation pool, with identical settings.
    train_pool, val_pool = (
        Pool(data, label, weight=weight, cat_features=cat_features)
        for data, label, weight in (
            (features_train, target_train, train_weight),
            (features_val, target_val, val_weight),
        )
    )

    classifier = CatBoostClassifier(
        iterations=n_iter,
        learning_rate=lr,
        loss_function=loss,
        random_seed=42,
    )
    classifier.fit(
        train_pool,
        eval_set=val_pool,
        verbose=100,
        early_stopping_rounds=15,
        plot=plot,
    )
    return classifier

def fit_catboost_reg(features_train,
                 features_val,
                 target_train,
                 target_val,
                 cat_features,
                 n_iter=500,
                 train_weight=None,
                 val_weight=None,
                 lr=0.05,
                 loss='RMSE',
                 plot=True):
    """
    Fit a CatBoostRegressor on a train pool, evaluated on a validation pool.

    The learning rate, loss function and plotting flag used to be hard-coded;
    they are now keyword parameters whose defaults reproduce the previous
    behaviour, mirroring the knobs exposed by ``fit_catboost_clf``.

    Parameters
    ----------
    features_train, features_val
        Feature data for the train and validation pools.
    target_train, target_val
        Targets for the train and validation pools.
    cat_features
        Categorical feature specification forwarded to both pools.
    n_iter
        Forwarded as ``iterations``.
    train_weight, val_weight
        Optional weights for the train and validation pools.
    lr
        Forwarded as ``learning_rate``.
    loss
        Forwarded as ``loss_function``.
    plot
        Forwarded to ``fit`` as ``plot``.

    Returns
    -------
    The fitted CatBoostRegressor (seed 42, logging every 100 iterations,
    early stopping after 15 rounds).
    """
    pool_train = Pool(
        features_train,
        target_train,
        weight=train_weight,
        cat_features=cat_features
    )

    pool_val = Pool(
        features_val,
        target_val,
        weight=val_weight,
        cat_features=cat_features
    )

    model = CatBoostRegressor(
        iterations=n_iter,
        learning_rate=lr,
        loss_function=loss,
        random_seed=42
    )

    model.fit(
        pool_train,
        eval_set=pool_val,
        verbose=100,
        early_stopping_rounds=15,
        plot=plot
    )

    return model

# Чтение данных

In [63]:
# Both CSV files are read with the same dialect: ';' separator, ',' decimal mark, windows-1251.
csv_options = dict(sep=";", decimal=",", encoding="windows-1251")
train_df = pd.read_csv('data/train.csv', **csv_options)
test_df = pd.read_csv('data/test.csv', **csv_options)

# Dictionary mapping each 'field' entry to its 'description' text.
description_table = pd.read_excel('data/description.xlsx', index_col='field')
features_description = description_table['description'].to_dict()

train_df.shape, test_df.shape

((205962, 235), (37183, 233))

# Предварительный анализ EDA

In [64]:
# Feature columns: everything in train_df except these four columns.
non_feature_columns = ['client_id', 'feature_date', 'target', 'w']
features = train_df.columns.drop(non_feature_columns).tolist()

# Preview the features that pandas parsed as object dtype.
train_df[features].select_dtypes('object').head()

Unnamed: 0,addrref,bki_total_ip_max_limit,hdb_bki_active_cc_cnt,hdb_bki_active_ip_max_outstand,hdb_bki_active_micro_max_outstand,hdb_bki_active_pil_max_overdue,hdb_bki_other_active_auto_month_payments_sum,hdb_bki_total_cc_max_limit,hdb_bki_total_ip_cnt,hdb_bki_total_max_limit,hdb_bki_total_max_overdue_sum,hdb_bki_total_pil_max_limit,main_last_position_ccode,main_pre_last_position_ccode,oldest_campaignsegment_ccode_for_nss,oldest_campaignsegment_ccode_for_pil,part_last_position_ccode,part_pre_last_position_ccode,segment,brand,model
0,Свердловская область,,2.0,1006774.6,,0.0,,170000.0,1.0,4250000.0,235937.0,220000.0,,,,PIL2CC,,,Mass Affluent,,
1,Новосибирская область,,6.0,,,0.0,,110000.0,0.0,183500.0,2527.0,183500.0,,,ACFYN_NZP,PIL2PIL,,,Mass Affluent,,
2,Новосибирская область,,4.0,,,0.0,,65000.0,0.0,558500.0,7678.0,558500.0,,,ACFYN_NZP,PIL2PIL,,,Mass Affluent,,
3,Московская область,,0.0,646694.0,,,,0.0,3.0,1780000.0,0.0,427000.0,,,ACFYN_NZP,PIL2PAY,,,Mass Affluent,,
4,Краснодарский край,,3.0,,,14264.13,,62500.0,0.0,1120000.0,14264.13,1120000.0,,,ACFYN_NZP,PIL2PAY,,,Mass,,


### Поправляем типы данных для фичей

In [65]:
# Print the number of distinct non-missing values for every object-typed feature.
print('Количество униклаьных значений по фиче:')
object_columns = train_df[features].select_dtypes('object').columns
for column_name in object_columns:
    distinct_count = train_df[column_name].nunique()
    print(f"{column_name}: {distinct_count}")

Количество униклаьных значений по фиче:
addrref: 59
bki_total_ip_max_limit: 2763
hdb_bki_active_cc_cnt: 60
hdb_bki_active_ip_max_outstand: 43267
hdb_bki_active_micro_max_outstand: 5458
hdb_bki_active_pil_max_overdue: 24365
hdb_bki_other_active_auto_month_payments_sum: 3533
hdb_bki_total_cc_max_limit: 9991
hdb_bki_total_ip_cnt: 15
hdb_bki_total_max_limit: 52263
hdb_bki_total_max_overdue_sum: 56162
hdb_bki_total_pil_max_limit: 54202
main_last_position_ccode: 9207
main_pre_last_position_ccode: 5192
oldest_campaignsegment_ccode_for_nss: 56
oldest_campaignsegment_ccode_for_pil: 13
part_last_position_ccode: 1236
part_pre_last_position_ccode: 1325
segment: 3
brand: 422
model: 2397


In [66]:
# Object-typed columns that are cast to float64 below.
newtype_feats = [
    'bki_total_ip_max_limit',
    'hdb_bki_active_cc_cnt',
    'hdb_bki_active_ip_max_outstand',
    'hdb_bki_active_micro_max_outstand',
    'hdb_bki_active_pil_max_overdue',
    'hdb_bki_other_active_auto_month_payments_sum',
    'hdb_bki_total_cc_max_limit',
    'hdb_bki_total_ip_cnt',
    'hdb_bki_total_max_limit',
    'hdb_bki_total_max_overdue_sum',
    'hdb_bki_total_pil_max_limit',
]

# Apply the same casts to both frames (test first, then train).
for frame in (test_df, train_df):
    frame[newtype_feats] = frame[newtype_feats].astype('float64')
    frame['feature_date'] = frame['feature_date'].astype('datetime64[ns]')

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205962 entries, 0 to 205961
Columns: 235 entries, client_id to productionyear
dtypes: datetime64[ns](1), float64(223), int64(1), object(10)
memory usage: 369.3+ MB


In [67]:
# Categorical features are the object-typed columns; numeric features are the rest.
cat_features = train_df[features].select_dtypes('object').columns.tolist()

# Filter `features` instead of taking a set difference so that num_features keeps
# the column order and is identical on every kernel run (set iteration order of
# strings is not stable across interpreter sessions).
cat_feature_names = set(cat_features)
num_features = [name for name in features if name not in cat_feature_names]

len(features), len(cat_features), len(num_features)

(231, 10, 221)

### Анализ категориальных фичей

In [68]:
# How many rows have a known position but an unknown salary?
salary_missing = train_df['worksalary_rur_amt'].isna()
position_known = train_df['part_last_position_ccode'].notna()
train_df.loc[salary_missing & position_known].shape

(4184, 235)

In [69]:
# Frame with the median salary per position, computed over train and test together.
median_worksalary = (
    pd.concat([train_df[features], test_df[features]], axis=0)
    .dropna(subset=['worksalary_rur_amt'])
    .groupby('part_last_position_ccode', as_index=False)
    .agg(median_worksalary_rur_amt=('worksalary_rur_amt', 'median'))
)
median_worksalary.head()

Unnamed: 0,part_last_position_ccode,median_worksalary_rur_amt
0,АВТОМЕХАНИК,158892.5
1,АДМИНИСТРАТОР,180000.0
2,АНАЛИТИК,100000.0
3,Администратор,60000.0
4,Администратор магазина,70000.0


In [70]:
def _impute_salary_by_position(frame):
    """Fill missing salaries in `frame` with the per-position median from `median_worksalary`."""
    merged = frame.merge(median_worksalary, on='part_last_position_ccode', how='left')
    merged['worksalary_rur_amt'] = merged['worksalary_rur_amt'].fillna(
        merged['median_worksalary_rur_amt']
    )
    # The helper column is only needed for the fill.
    return merged.drop(columns='median_worksalary_rur_amt')


test_df = _impute_salary_by_position(test_df)
train_df = _impute_salary_by_position(train_df)

# Rows that still have a known position but no salary after the fill.
salary_still_missing = train_df['worksalary_rur_amt'].isna()
position_is_known = train_df['part_last_position_ccode'].notna()
train_df.loc[salary_still_missing & position_is_known].shape

(1029, 235)

In [71]:
# Fill gaps in the categorical features with an explicit placeholder value.
for frame in (train_df, test_df):
    frame[cat_features] = frame[cat_features].fillna('miss_value')

# Инженерия фич

In [72]:
# Add year / month / day columns derived from feature_date to both frames.
for frame in (train_df, test_df):
    date_accessor = frame['feature_date'].dt
    for part in ('year', 'month', 'day'):
        frame[f'feature_date_{part}'] = getattr(date_accessor, part)

In [73]:
# Rebuild the feature lists so that they include the newly engineered columns.
features = list(train_df.drop(['client_id', 'feature_date', 'target', 'w'], axis=1).columns)

cat_features = list(train_df[features].select_dtypes('object').columns)

# Filter `features` instead of taking a set difference so that num_features keeps
# the column order and is identical on every kernel run (set iteration order of
# strings is not stable across interpreter sessions).
cat_feature_names = set(cat_features)
num_features = [name for name in features if name not in cat_feature_names]

len(features), len(cat_features), len(num_features)

(234, 10, 224)

# Пайплайн с границами 0.7 0.9

In [74]:
# Classifier: bin the regression target at its 0.7 and 0.9 quantiles.
target_q70 = train_df['target'].quantile(0.7)
target_q90 = train_df['target'].quantile(0.9)

in_low_bin = train_df['target'] <= target_q70
in_mid_bin = (train_df['target'] > target_q70) & (train_df['target'] <= target_q90)
in_high_bin = train_df['target'] > target_q90

train_df['target_bins'] = None
train_df.loc[in_low_bin, 'target_bins'] = 'bin_0'
train_df.loc[in_mid_bin, 'target_bins'] = 'bin_1'
train_df.loc[in_high_bin, 'target_bins'] = 'bin_2'

# Encode the bin names as integer class labels.
le = LabelEncoder()
train_df['target_bins'] = le.fit_transform(train_df['target_bins'])

# One stratified 70/30 split shared by features, both targets and the weights.
(
    features_train, features_val,
    target_train_clf, target_val_clf,
    target_train_reg, target_val_reg,
    w_train, w_val,
) = train_test_split(
    train_df[features],
    train_df['target_bins'],
    train_df['target'],
    train_df['w'],
    stratify=train_df['target_bins'],
    test_size=0.3,
    random_state=42,
)

model = fit_catboost_clf(
    features_train,
    features_val,
    target_train_clf,
    target_val_clf,
    cat_features,
    n_iter=300,
    lr=0.05,
)
# predict() output is indexed as 2-D here; column 0 is kept as the flat label vector.
pred_val_clf = model.predict(features_val)[:, 0]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0508377	test: 1.0510563	best: 1.0510563 (0)	total: 358ms	remaining: 1m 47s
100:	learn: 0.4987629	test: 0.5039129	best: 0.5039129 (100)	total: 36.8s	remaining: 1m 12s
200:	learn: 0.4698756	test: 0.4776805	best: 0.4776805 (200)	total: 1m 12s	remaining: 35.7s
299:	learn: 0.4541295	test: 0.4649459	best: 0.4649459 (299)	total: 1m 46s	remaining: 0us

bestTest = 0.4649458838
bestIteration = 299



In [82]:
# One regressor per bin: trained on rows whose TRUE bin matches, validated on rows
# whose PREDICTED bin matches (the routing that would be used at inference time).
bin_models = {}
pred = pd.DataFrame(np.nan, index=target_val_reg.index, columns=['pred'])

for bin_id in (0, 1, 2):
    train_mask = target_train_clf == bin_id
    val_mask = pred_val_clf == bin_id

    bin_model = fit_catboost_reg(
        features_train.loc[train_mask],
        features_val.loc[val_mask],
        target_train_reg.loc[train_mask],
        target_val_reg.loc[val_mask],
        cat_features,
        400
    )

    # Predict once and reuse the result for both the per-bin metric and the combined frame.
    bin_pred = bin_model.predict(features_val.loc[val_mask])

    # Subset the weights with the same mask so the metric does not rely on pandas
    # index alignment turning the unmatched rows into NaN that .mean() then skips.
    WMAE = weighted_mean_absolute_error(
        target_val_reg.loc[val_mask],
        bin_pred,
        w_val.loc[val_mask]
    )
    print(f"WMAE = {WMAE:.2f}")

    # NOTE(review): the gap is read at the last recorded iteration, which may differ
    # from the best iteration when early stopping triggers — confirm this is intended.
    eval_history = bin_model.evals_result_
    overfit_gap = eval_history['validation']['RMSE'][-1] - eval_history['learn']['RMSE'][-1]
    print(f"Overfit: {overfit_gap}")

    bin_models[bin_id] = bin_model
    pred.loc[val_mask, 'pred'] = bin_pred

# Keep the per-bin model names defined as before.
bin0_model, bin1_model, bin2_model = (bin_models[bin_id] for bin_id in (0, 1, 2))

# Combined metric over the whole validation set.
WMAE = weighted_mean_absolute_error(
    target_val_reg,
    pred['pred'],
    w_val
)
print(f"WMAE = {WMAE:.2f}")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 19324.2404085	test: 48656.4009928	best: 48656.4009928 (0)	total: 135ms	remaining: 53.7s
100:	learn: 14941.8385122	test: 46540.7311766	best: 46540.7311766 (100)	total: 12.1s	remaining: 35.9s
200:	learn: 14614.7583484	test: 46306.0400891	best: 46306.0400891 (200)	total: 24s	remaining: 23.8s
300:	learn: 14400.5105194	test: 46177.6697602	best: 46177.6697602 (300)	total: 35.5s	remaining: 11.7s
399:	learn: 14238.9336723	test: 46086.4993178	best: 46086.4993178 (399)	total: 47s	remaining: 0us

bestTest = 46086.49932
bestIteration = 399

WMAE = 17237.30
Overfit: 31847.565645531402


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 23569.7480054	test: 67114.7661900	best: 67114.7661900 (0)	total: 54.6ms	remaining: 21.8s
100:	learn: 21158.4015929	test: 65247.1767814	best: 65247.1767814 (100)	total: 5.97s	remaining: 17.7s
200:	learn: 20656.3824536	test: 64807.4055291	best: 64807.4055291 (200)	total: 11.9s	remaining: 11.7s
300:	learn: 20223.0065891	test: 64530.5311380	best: 64530.5311380 (300)	total: 17.5s	remaining: 5.76s
399:	learn: 19888.2776812	test: 64372.5088680	best: 64372.5088680 (399)	total: 23.5s	remaining: 0us

bestTest = 64372.50887
bestIteration = 399

WMAE = 40542.79
Overfit: 44484.231186812045


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 128904.3184923	test: 158789.3537295	best: 158789.3537295 (0)	total: 43.5ms	remaining: 17.4s
Stopped by overfitting detector  (15 iterations wait)

bestTest = 156939.118
bestIteration = 21

Shrink model to first 22 iterations.
WMAE = 159618.93
Overfit: 43553.240126636694
WMAE = 32396.59


# Пайплайн с границами 0.33 0.66

In [83]:
# Classifier: bin the regression target at its 0.33 and 0.66 quantiles.
target_q33 = train_df['target'].quantile(0.33)
target_q66 = train_df['target'].quantile(0.66)

in_low_bin = train_df['target'] <= target_q33
in_mid_bin = (train_df['target'] > target_q33) & (train_df['target'] <= target_q66)
in_high_bin = train_df['target'] > target_q66

train_df['target_bins'] = None
train_df.loc[in_low_bin, 'target_bins'] = 'bin_0'
train_df.loc[in_mid_bin, 'target_bins'] = 'bin_1'
train_df.loc[in_high_bin, 'target_bins'] = 'bin_2'

# Encode the bin names as integer class labels.
le = LabelEncoder()
train_df['target_bins'] = le.fit_transform(train_df['target_bins'])

# One stratified 70/30 split shared by features, both targets and the weights.
(
    features_train, features_val,
    target_train_clf, target_val_clf,
    target_train_reg, target_val_reg,
    w_train, w_val,
) = train_test_split(
    train_df[features],
    train_df['target_bins'],
    train_df['target'],
    train_df['w'],
    stratify=train_df['target_bins'],
    test_size=0.3,
    random_state=42,
)

model = fit_catboost_clf(
    features_train,
    features_val,
    target_train_clf,
    target_val_clf,
    cat_features,
    n_iter=300,
    lr=0.05,
)
# predict() output is indexed as 2-D here; column 0 is kept as the flat label vector.
pred_val_clf = model.predict(features_val)[:, 0]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.0753231	test: 1.0751621	best: 1.0751621 (0)	total: 357ms	remaining: 1m 46s
100:	learn: 0.7236774	test: 0.7291114	best: 0.7291114 (100)	total: 36.8s	remaining: 1m 12s
200:	learn: 0.6894176	test: 0.6971997	best: 0.6971997 (200)	total: 1m 13s	remaining: 36.1s
299:	learn: 0.6718855	test: 0.6827396	best: 0.6827396 (299)	total: 1m 50s	remaining: 0us

bestTest = 0.6827395623
bestIteration = 299



In [84]:
# One regressor per bin: trained on rows whose TRUE bin matches, validated on rows
# whose PREDICTED bin matches (the routing that would be used at inference time).
bin_models = {}
pred = pd.DataFrame(np.nan, index=target_val_reg.index, columns=['pred'])

for bin_id in (0, 1, 2):
    train_mask = target_train_clf == bin_id
    val_mask = pred_val_clf == bin_id

    bin_model = fit_catboost_reg(
        features_train.loc[train_mask],
        features_val.loc[val_mask],
        target_train_reg.loc[train_mask],
        target_val_reg.loc[val_mask],
        cat_features,
        400
    )

    # Predict once and reuse the result for both the per-bin metric and the combined frame.
    bin_pred = bin_model.predict(features_val.loc[val_mask])

    # Subset the weights with the same mask so the metric does not rely on pandas
    # index alignment turning the unmatched rows into NaN that .mean() then skips.
    WMAE = weighted_mean_absolute_error(
        target_val_reg.loc[val_mask],
        bin_pred,
        w_val.loc[val_mask]
    )
    print(f"WMAE = {WMAE:.2f}")

    # NOTE(review): the gap is read at the last recorded iteration, which may differ
    # from the best iteration when early stopping triggers — confirm this is intended.
    eval_history = bin_model.evals_result_
    overfit_gap = eval_history['validation']['RMSE'][-1] - eval_history['learn']['RMSE'][-1]
    print(f"Overfit: {overfit_gap}")

    bin_models[bin_id] = bin_model
    pred.loc[val_mask, 'pred'] = bin_pred

# Keep the per-bin model names defined as before.
bin0_model, bin1_model, bin2_model = (bin_models[bin_id] for bin_id in (0, 1, 2))

# Combined metric over the whole validation set.
WMAE = weighted_mean_absolute_error(
    target_val_reg,
    pred['pred'],
    w_val
)
print(f"WMAE = {WMAE:.2f}")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 11010.4070228	test: 47340.5857723	best: 47340.5857723 (0)	total: 76.8ms	remaining: 30.6s
100:	learn: 8992.3053510	test: 47096.2693937	best: 47095.6443308 (99)	total: 7.86s	remaining: 23.3s
Stopped by overfitting detector  (15 iterations wait)

bestTest = 47091.66385
bestIteration = 108

Shrink model to first 109 iterations.
WMAE = 15863.82
Overfit: 38160.45617317821


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 9340.2884737	test: 30909.4274444	best: 30909.4274444 (0)	total: 82.4ms	remaining: 32.9s
100:	learn: 8564.1837081	test: 30472.7258952	best: 30472.7258952 (100)	total: 7.79s	remaining: 23s
200:	learn: 8414.3126021	test: 30360.1645993	best: 30360.1645993 (200)	total: 15.4s	remaining: 15.3s
300:	learn: 8282.8065821	test: 30277.5911133	best: 30277.5911133 (300)	total: 23.3s	remaining: 7.66s
399:	learn: 8189.5140002	test: 30232.2402371	best: 30232.1742148 (398)	total: 30.8s	remaining: 0us

bestTest = 30232.17421
bestIteration = 398

Shrink model to first 399 iterations.
WMAE = 8803.73
Overfit: 22042.726236841838


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 110849.6729576	test: 117315.3716105	best: 117315.3716105 (0)	total: 74.3ms	remaining: 29.7s
100:	learn: 83930.5961857	test: 95708.8931489	best: 95708.8931489 (100)	total: 7.75s	remaining: 22.9s
200:	learn: 81223.0135117	test: 94359.0854982	best: 94359.0854982 (200)	total: 15.8s	remaining: 15.6s
300:	learn: 79149.9727652	test: 93518.7973551	best: 93518.7973551 (300)	total: 23.3s	remaining: 7.67s
399:	learn: 77678.9200626	test: 93048.5674225	best: 93047.1828429 (397)	total: 30.6s	remaining: 0us

bestTest = 93047.18284
bestIteration = 397

Shrink model to first 398 iterations.
WMAE = 65011.40
Overfit: 15369.647359837458
WMAE = 29703.44


### Вывод:
Модель сильнее переобучается на границах 0.7 и 0.9, поэтому итоговая метрика хуже: WMAE = 32396.59 против 29703.44 на границах 0.33 и 0.66.