# Загрузка данных

In [None]:
! gdown --id 1Ym8EaWePP17RvrweKhWzCJmITXLgqvCZ

Downloading...
From: https://drive.google.com/uc?id=1Ym8EaWePP17RvrweKhWzCJmITXLgqvCZ
To: /content/churn-modeling.zip
100% 268k/268k [00:00<00:00, 60.4MB/s]


In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/churn-modeling.zip')

# Обучающая, валидационная и тестовая выборка

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(df, train_size=0.6, random_state=42, stratify=df['Exited'])

In [None]:
val, test = train_test_split(test, train_size=0.5, random_state=42, stratify=test['Exited'])

# Обучение модели

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
X = ['CustomerId', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary']

cat_features = ['Geography','Gender']

y = ['Exited']

In [None]:
from catboost import Pool

train_data = Pool(data=train[X],
                  label=train[y],
                  cat_features=cat_features
                 )

valid_data = Pool(data=val[X],
                  label=val[y],
                  cat_features=cat_features
                 )

test_data = Pool(data=test[X],
                  label=test[y],
                  cat_features=cat_features
                 )

In [None]:
params = {'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42,
          'learning_rate':0.01}

In [None]:
from catboost import CatBoostClassifier

In [None]:
model = CatBoostClassifier(**params)

In [None]:
model.fit(train_data, eval_set=valid_data)

0:	test: 0.8338545	best: 0.8338545 (0)	total: 60ms	remaining: 59.9s
100:	test: 0.8725090	best: 0.8725090 (100)	total: 1.65s	remaining: 14.7s
200:	test: 0.8784656	best: 0.8784656 (200)	total: 3.32s	remaining: 13.2s
300:	test: 0.8812676	best: 0.8812676 (300)	total: 6.26s	remaining: 14.5s
400:	test: 0.8823868	best: 0.8824376 (390)	total: 9.15s	remaining: 13.7s
500:	test: 0.8834491	best: 0.8834522 (498)	total: 11.3s	remaining: 11.2s
600:	test: 0.8834737	best: 0.8836554 (564)	total: 13.6s	remaining: 9.03s
700:	test: 0.8837847	best: 0.8838694 (685)	total: 15.3s	remaining: 6.51s
800:	test: 0.8834383	best: 0.8838694 (685)	total: 17.2s	remaining: 4.28s
900:	test: 0.8829318	best: 0.8838694 (685)	total: 18.9s	remaining: 2.07s
999:	test: 0.8822744	best: 0.8838694 (685)	total: 20.8s	remaining: 0us

bestTest = 0.8838694083
bestIteration = 685

Shrink model to first 686 iterations.


<catboost.core.CatBoostClassifier at 0x78afe5383a60>

In [None]:
n_iters = model.best_iteration_ + 1

In [None]:
n_iters

686

In [None]:
params = {'iterations':n_iters,
          'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42,
          'learning_rate':0.01}

In [None]:
model = CatBoostClassifier(**params)

In [None]:
train_full = pd.concat([train, val])

In [None]:
train_full_data = Pool(train_full[X],
                       train_full[y],
                       cat_features=cat_features)

In [None]:
model.fit(train_full_data)

0:	total: 32.1ms	remaining: 22s
100:	total: 1.69s	remaining: 9.81s
200:	total: 2.72s	remaining: 6.57s
300:	total: 3.76s	remaining: 4.8s
400:	total: 4.83s	remaining: 3.43s
500:	total: 5.86s	remaining: 2.16s
600:	total: 6.91s	remaining: 977ms
685:	total: 7.81s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x78afe5382c80>

In [None]:
test['y_score_no_cross_val'] = model.predict_proba(test_data)[:,1]

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(test['Exited'], test['y_score_no_cross_val'])

0.8735839074822127

# Кросс-валидация

In [None]:
from catboost import cv

In [None]:
params = {
          'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42,
          'learning_rate':0.01}

In [None]:
cv_data = cv(params=params,
             pool=train_full_data,
             fold_count=5,
             shuffle=True,
             partition_random_seed=0,
             stratified=False,
             verbose=False)

Training on fold [0/5]

bestTest = 0.866360207
bestIteration = 881

Training on fold [1/5]

bestTest = 0.8716621864
bestIteration = 998

Training on fold [2/5]

bestTest = 0.8695412245
bestIteration = 540

Training on fold [3/5]

bestTest = 0.8795945701
bestIteration = 942

Training on fold [4/5]

bestTest = 0.8586794872
bestIteration = 750



In [None]:
cv_data

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.818479,0.009480,0.685020,0.000209,0.684939,0.000262
1,1,0.823848,0.010334,0.677216,0.000374,0.677055,0.000503
2,2,0.831819,0.009265,0.668989,0.000472,0.668799,0.000751
3,3,0.834559,0.008771,0.662008,0.001326,0.661753,0.000689
4,4,0.834483,0.008218,0.654599,0.001429,0.654270,0.000898
...,...,...,...,...,...,...,...
995,995,0.868703,0.007809,0.329986,0.016873,0.284097,0.003644
996,996,0.868715,0.007781,0.329972,0.016875,0.284057,0.003631
997,997,0.868718,0.007804,0.329972,0.016876,0.284003,0.003636
998,998,0.868725,0.007777,0.329969,0.016855,0.283943,0.003613


In [None]:
cv_data[cv_data['test-AUC-mean'] == cv_data['test-AUC-mean'].max()]

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
741,741,0.868822,0.007457,0.330818,0.017005,0.297428,0.004326


In [None]:
n_iters = cv_data[cv_data['test-AUC-mean'] == cv_data['test-AUC-mean'].max()]['iterations'].values[0]

In [None]:
params = {'iterations':n_iters,
          'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42,
          'learning_rate':0.01}

In [None]:
model = CatBoostClassifier(**params)

In [None]:
model.fit(train_full_data)

0:	total: 24.3ms	remaining: 18s
100:	total: 2.07s	remaining: 13.1s
200:	total: 3.9s	remaining: 10.5s
300:	total: 4.93s	remaining: 7.21s
400:	total: 5.97s	remaining: 5.07s
500:	total: 6.99s	remaining: 3.35s
600:	total: 8.05s	remaining: 1.88s
700:	total: 9.06s	remaining: 517ms
740:	total: 9.46s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x78b035f5c190>

In [None]:
test['y_score_cross_val'] = model.predict_proba(test_data)[:,1]

In [None]:
roc_auc_score(test['Exited'], test['y_score_cross_val'])

0.8737196364315009

In [None]:
len(train_full)

8000

# Подбор гиперпараметров

In [None]:
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'AUC',
 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
 'iterations': 741,
 'sampling_frequency': 'PerTree',
 'fold_permutation_block': 0,
 'leaf_estimation_method': 'Newton',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'counter_calc_method': 'SkipTest',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'ctr_leaf_count_limit': 18446744073709551615,
 'bayesian_matrix_reg': 0.10000000149011612,
 'one_hot_max_size': 2,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'max_ctr_complexity': 4,
 'model_size_reg': 0.5,
 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=

In [None]:
params = {'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42}

In [None]:
model = CatBoostClassifier(**params)

In [None]:
grid = {'learning_rate': [0.01, 0.1],
        'depth': [5, 6]}

In [None]:
result = model.grid_search(grid, train_full_data, verbose=False)

0:	test: 0.7811641	best: 0.7811641 (0)	total: 10.9ms	remaining: 10.9s
100:	test: 0.8435462	best: 0.8435769 (99)	total: 967ms	remaining: 8.6s
200:	test: 0.8508923	best: 0.8508923 (200)	total: 1.88s	remaining: 7.49s
300:	test: 0.8547897	best: 0.8547897 (300)	total: 2.8s	remaining: 6.5s
400:	test: 0.8569385	best: 0.8569385 (400)	total: 3.7s	remaining: 5.52s
500:	test: 0.8576410	best: 0.8576410 (500)	total: 4.96s	remaining: 4.94s
600:	test: 0.8585538	best: 0.8585538 (600)	total: 6.35s	remaining: 4.22s
700:	test: 0.8590923	best: 0.8591487 (694)	total: 7.7s	remaining: 3.28s
800:	test: 0.8593590	best: 0.8595564 (749)	total: 8.59s	remaining: 2.13s
900:	test: 0.8593282	best: 0.8595564 (749)	total: 9.49s	remaining: 1.04s
999:	test: 0.8593897	best: 0.8595974 (986)	total: 10.4s	remaining: 0us

bestTest = 0.8595974359
bestIteration = 986

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	test: 0.7811641	best: 0.7

In [None]:
result['params']

{'depth': 5, 'learning_rate': 0.1}

In [None]:
pd.DataFrame(result['cv_results'])['test-AUC-mean'].max()

0.8689660773549619

# Принципы sklearn