# Catboost

1. Построить ансамбль решающих деревьев используя catboost
2. Используя перекрестную проверку найти наилучшие параметры
3. Сделать предсказание и проверить качество через каппа-метрику

## Подключение библиотек

In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import cohen_kappa_score, confusion_matrix, make_scorer
from sklearn import preprocessing

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [3]:
data = pd.read_csv("../data/prudential/train.csv.gz")
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


## Предобработка данных

In [4]:
data["Product_Info_2_1"] = data["Product_Info_2"].str.slice(0, 1)
data["Product_Info_2_2"] = pd.to_numeric(data["Product_Info_2"].str.slice(1, 2))
data.drop(labels=["Product_Info_2"], axis=1, inplace=True)

In [5]:
list_df = [data]
for un in data["Product_Info_2_1"].unique():
    s = pd.DataFrame()
    s["Product_Info_2_1_" + un] = data["Product_Info_2_1"].isin([un]).astype("int8")
    list_df.append(s)
data = pd.concat(list_df, axis=1)
data.drop(labels=["Product_Info_2_1"], axis=1, inplace=True)

In [6]:
data.fillna(value=-1, inplace=True)
data["Response"] = data["Response"] - 1
print(data.head())

   Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
0   2               1              10        0.076923               2   
1   5               1              26        0.076923               2   
2   6               1              26        0.076923               2   
3   7               1              10        0.487179               2   
4   8               1              26        0.230769               2   

   Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
0               1               1  0.641791  0.581818  0.148536  ...   
1               3               1  0.059701  0.600000  0.131799  ...   
2               3               1  0.029851  0.745455  0.288703  ...   
3               3               1  0.164179  0.672727  0.205021  ...   
4               3               1  0.417910  0.654545  0.234310  ...   

   Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword_48  Response  \
0                   0                   0            

## Набор столбцов для расчета

In [7]:
columns_groups = [
    "Insurance_History",
    "InsuredInfo",
    "Medical_Keyword",
    "Family_Hist",
    "Medical_History",
    "Product_Info",
]
columns = [
    "Wt",
    "Ht",
    "Ins_Age",
    "BMI",
]

for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print(columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keywor

## Нормализация данных

In [8]:
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data, columns=columns)))
columns_transformed = data_transformed.columns
data_transformed["Response"] = data["Response"]

In [9]:
data_transformed = reduce_mem_usage(data_transformed)
print(data_transformed.info())

Потребление памяти меньше на - 42.87 Мб (минус 75.1%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 126 entries, 0 to Response
dtypes: float16(125), int8(1)
memory usage: 14.2 MB
None


## Разделить данные

In [10]:
data_train, data_test = train_test_split(data_transformed, test_size=0.2)
print(data_train.head())

              0         1         2         3         4         5         6  \
41934 -1.735352 -1.445312  0.441162 -1.506836  0.611816 -0.169434 -1.159180   
11854 -0.325684  0.269287  0.970703 -0.529785 -1.634766 -0.169434  0.862305   
14522  0.073853  1.003906  0.895020 -0.475342  0.611816 -0.169434  0.862305   
58412  2.070312  2.228516 -1.148438  0.863770  0.611816 -0.169434 -1.159180   
31877 -0.748535 -0.465576 -1.527344 -0.645508  0.611816 -0.169434 -1.159180   

              7         8         9  ...       116      117       118  \
41934  1.100586 -1.156250  1.130859  ... -0.083679  0.44165 -0.149292   
11854 -1.013672  0.861816  0.100891  ... -0.083679  0.44165 -0.149292   
14522 -1.013672  0.887695 -0.928711  ... -0.083679  0.44165 -0.149292   
58412  1.100586 -1.156250  1.130859  ... -0.083679  0.44165 -0.149292   
31877  1.100586 -1.156250  1.130859  ... -0.083679  0.44165 -0.149292   

            119       120       121       122       123      124  Response  
41934  2.

## CatBoost

In [11]:
x = pd.DataFrame(data_train, columns=columns_transformed)
train_dataset = Pool(data=x, label=data_train["Response"])
model = CatBoostClassifier(
    iterations=10,
    learning_rate=0.57,
    random_seed=17,
    depth=6,
    loss_function="MultiClass",
    bootstrap_type="MVS",
    custom_metric="WKappa",
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [13]:
cb_params = {
    "depth": range(5, 8),
    "learning_rate": np.arange(0.56, 0.59, 0.01),
    "l2_leaf_reg": range(1, 5)
}

cb_grid = model.grid_search(cb_params, cv=5, X=x, y=data_train["Response"], verbose=True)


0:	learn: 1.5559356	test: 1.5634532	best: 1.5634532 (0)	total: 197ms	remaining: 1.77s
1:	learn: 1.4201472	test: 1.4225432	best: 1.4225432 (1)	total: 882ms	remaining: 3.53s
2:	learn: 1.3687114	test: 1.3760227	best: 1.3760227 (2)	total: 1.18s	remaining: 2.75s
3:	learn: 1.3459469	test: 1.3545408	best: 1.3545408 (3)	total: 1.6s	remaining: 2.4s
4:	learn: 1.3276797	test: 1.3390149	best: 1.3390149 (4)	total: 1.95s	remaining: 1.95s
5:	learn: 1.3001542	test: 1.3131762	best: 1.3131762 (5)	total: 2.35s	remaining: 1.57s
6:	learn: 1.2865240	test: 1.3026894	best: 1.3026894 (6)	total: 2.71s	remaining: 1.16s
7:	learn: 1.2755668	test: 1.2926171	best: 1.2926171 (7)	total: 3.19s	remaining: 798ms
8:	learn: 1.2673690	test: 1.2842377	best: 1.2842377 (8)	total: 3.54s	remaining: 393ms
9:	learn: 1.2556129	test: 1.2746311	best: 1.2746311 (9)	total: 3.91s	remaining: 0us

bestTest = 1.274631128
bestIteration = 9

0:	loss: 1.2746311	best: 1.2746311 (0)	total: 4.23s	remaining: 2m 28s
0:	learn: 1.5545658	test: 1.562

## Самые оптимальные параметры

In [14]:
print(cb_grid["params"])

{'depth': 7, 'l2_leaf_reg': 1, 'learning_rate': 0.56}


## Итоговая модель

In [15]:
model = CatBoostClassifier(
    iterations=100,
    learning_rate=cb_grid["params"]["learning_rate"],
    random_seed=17,
    depth=cb_grid["params"]["depth"],
    l2_leaf_reg=cb_grid["params"]["l2_leaf_reg"],
    loss_function="MultiClass",
    bootstrap_type="MVS",
    custom_metric="WKappa",
)

In [16]:
model.fit(train_dataset)

0:	learn: 1.5181500	total: 455ms	remaining: 45s
1:	learn: 1.4073457	total: 900ms	remaining: 44.1s
2:	learn: 1.3690236	total: 1.33s	remaining: 43.2s
3:	learn: 1.3010805	total: 1.92s	remaining: 46.1s
4:	learn: 1.2765627	total: 2.39s	remaining: 45.5s
5:	learn: 1.2573116	total: 2.93s	remaining: 45.9s
6:	learn: 1.2482722	total: 3.49s	remaining: 46.4s
7:	learn: 1.2306353	total: 4.05s	remaining: 46.6s
8:	learn: 1.2205870	total: 4.64s	remaining: 47s
9:	learn: 1.2110138	total: 5.31s	remaining: 47.8s
10:	learn: 1.2053498	total: 5.79s	remaining: 46.8s
11:	learn: 1.2032747	total: 5.98s	remaining: 43.9s
12:	learn: 1.1982522	total: 6.43s	remaining: 43s
13:	learn: 1.1923660	total: 6.89s	remaining: 42.4s
14:	learn: 1.1874685	total: 7.33s	remaining: 41.6s
15:	learn: 1.1847676	total: 7.76s	remaining: 40.7s
16:	learn: 1.1729201	total: 8.26s	remaining: 40.3s
17:	learn: 1.1686567	total: 8.72s	remaining: 39.7s
18:	learn: 1.1635419	total: 9.17s	remaining: 39.1s
19:	learn: 1.1515245	total: 9.66s	remaining: 38

<catboost.core.CatBoostClassifier at 0x2bd0cd60>

## Предсказание данных

In [17]:
x_test = pd.DataFrame(data_test, columns=columns_transformed)
data_test["target"] = model.predict(Pool(data=x_test))

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


## Оценка модели

In [18]:
print(f'CatBoost: {round(cohen_kappa_score(data_test["target"], data_test["Response"], weights="quadratic"), 3)}')

CatBoost: 0.55
