In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'C:\Work\DataFrames\Churn_Modelling.csv')

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train, test = train_test_split(df,train_size=0.6,random_state=42,stratify=df['Exited'])

In [5]:
val, test = train_test_split(test,train_size=0.5,random_state=42,stratify=test['Exited'])

In [6]:
from catboost import CatBoostClassifier

In [7]:
X = ['CustomerId', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary']

cat_features = ['Geography','Gender']

y = ['Exited']

In [8]:
from catboost import Pool

train_data = Pool(data=train[X],
                  label=train[y],
                  cat_features=cat_features
                 )

valid_data = Pool(data=val[X],
                  label=val[y],
                  cat_features=cat_features
                 )

test_data = Pool(data=test[X],
                  label=test[y],
                  cat_features=cat_features
                 )

In [9]:
params = {'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42,
          'learning_rate':0.01}

In [10]:
model = CatBoostClassifier(**params)

In [11]:
model.fit(train_data,eval_set=valid_data)

0:	test: 0.8338545	best: 0.8338545 (0)	total: 189ms	remaining: 3m 8s
100:	test: 0.8725090	best: 0.8725090 (100)	total: 3.77s	remaining: 33.5s
200:	test: 0.8784656	best: 0.8784656 (200)	total: 7.23s	remaining: 28.7s
300:	test: 0.8812676	best: 0.8812676 (300)	total: 12.2s	remaining: 28.3s
400:	test: 0.8823868	best: 0.8824376 (390)	total: 17.2s	remaining: 25.7s
500:	test: 0.8834491	best: 0.8834522 (498)	total: 22.2s	remaining: 22.1s
600:	test: 0.8834737	best: 0.8836554 (564)	total: 27.3s	remaining: 18.1s
700:	test: 0.8837847	best: 0.8838694 (685)	total: 32.2s	remaining: 13.7s
800:	test: 0.8834383	best: 0.8838694 (685)	total: 37.3s	remaining: 9.26s
900:	test: 0.8829318	best: 0.8838694 (685)	total: 42.2s	remaining: 4.64s
999:	test: 0.8822744	best: 0.8838694 (685)	total: 47.2s	remaining: 0us

bestTest = 0.8838694083
bestIteration = 685

Shrink model to first 686 iterations.


<catboost.core.CatBoostClassifier at 0x19083a25f10>

In [12]:
n_iters = model.best_iteration_ + 1

In [13]:
n_iters

686

In [14]:
params = {'iterations':n_iters,
          'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42,
          'learning_rate':0.01}

In [15]:
model = CatBoostClassifier(**params)

In [16]:
train_full = pd.concat([train,val])

In [17]:
train_full_data = Pool(train_full[X],
                       label=train_full[y],
                       cat_features=cat_features)

In [18]:
model.fit(train_full_data)

0:	total: 55.7ms	remaining: 38.2s
100:	total: 4.56s	remaining: 26.4s
200:	total: 8.07s	remaining: 19.5s
300:	total: 11.6s	remaining: 14.8s
400:	total: 15.2s	remaining: 10.8s
500:	total: 18.8s	remaining: 6.93s
600:	total: 23.9s	remaining: 3.38s
685:	total: 26.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x190827a8c70>

In [19]:
test['y_score_no_cross_val'] = model.predict_proba(test_data)[:,1]

In [20]:
from sklearn.metrics import roc_auc_score

In [21]:
roc_auc_score(test['Exited'],test['y_score_no_cross_val'])

0.8735839074822127

# Кросс-валидация

https://github.com/catboost/tutorials/blob/master/classification/classification_tutorial.ipynb

In [22]:
from catboost import cv

In [23]:
params = {'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42,
          'learning_rate':0.01}

In [24]:
cv_data = cv(
    params = params,
    pool = train_full_data,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    stratified=False,
    verbose=False
)

Training on fold [0/5]

bestTest = 0.866360207
bestIteration = 881

Training on fold [1/5]

bestTest = 0.8716621864
bestIteration = 998

Training on fold [2/5]

bestTest = 0.8695412245
bestIteration = 540

Training on fold [3/5]

bestTest = 0.8795945701
bestIteration = 942

Training on fold [4/5]

bestTest = 0.8586794872
bestIteration = 750



In [25]:
cv_data

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.818479,0.009480,0.685020,0.000209,0.684939,0.000262
1,1,0.823848,0.010334,0.677216,0.000374,0.677055,0.000503
2,2,0.831819,0.009265,0.668989,0.000472,0.668799,0.000751
3,3,0.834559,0.008771,0.662008,0.001326,0.661753,0.000689
4,4,0.834483,0.008218,0.654599,0.001429,0.654270,0.000898
...,...,...,...,...,...,...,...
995,995,0.868703,0.007809,0.329986,0.016873,0.284097,0.003644
996,996,0.868715,0.007781,0.329972,0.016875,0.284057,0.003631
997,997,0.868718,0.007804,0.329972,0.016876,0.284003,0.003636
998,998,0.868725,0.007777,0.329969,0.016855,0.283943,0.003613


In [26]:
cv_data[cv_data['test-AUC-mean'] == cv_data['test-AUC-mean'].max()]

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
741,741,0.868822,0.007457,0.330818,0.017005,0.297428,0.004326


In [27]:
n_iters = cv_data[cv_data['test-AUC-mean'] == cv_data['test-AUC-mean'].max()]['iterations'].values[0]

In [28]:
n_iters

741

In [29]:
params = {'iterations':n_iters,
          'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42,
          'learning_rate':0.01}

In [30]:
model = CatBoostClassifier(**params)

In [31]:
model.fit(train_full_data)

0:	total: 73.5ms	remaining: 54.4s
100:	total: 5.12s	remaining: 32.4s
200:	total: 9.17s	remaining: 24.6s
300:	total: 12.9s	remaining: 18.9s
400:	total: 16.6s	remaining: 14.1s
500:	total: 20s	remaining: 9.59s
600:	total: 23.6s	remaining: 5.49s
700:	total: 27.1s	remaining: 1.55s
740:	total: 28.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x19082eb0820>

In [32]:
test['y_score_cross_val'] = model.predict_proba(test_data)[:,1]

In [33]:
from sklearn.metrics import roc_auc_score

In [34]:
roc_auc_score(test['Exited'],test['y_score_cross_val'])

0.8737196364315009

# Подбор гиперпараметров

https://github.com/catboost/catboost/blob/master/catboost/tutorials/events/2020_06_04_catboost_tutorial/catboost_features.ipynb

https://youtu.be/ZaP5qFSIcIw?t=2043

In [35]:
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'AUC',
 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
 'iterations': 741,
 'sampling_frequency': 'PerTree',
 'fold_permutation_block': 0,
 'leaf_estimation_method': 'Newton',
 'counter_calc_method': 'SkipTest',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'ctr_leaf_count_limit': 18446744073709551615,
 'bayesian_matrix_reg': 0.10000000149011612,
 'one_hot_max_size': 2,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'max_ctr_complexity': 4,
 'model_size_reg': 0.5,
 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.

In [36]:
params = {'verbose':100,
          'eval_metric':'AUC',
          'loss_function': 'Logloss',
          'random_seed':42}

In [38]:
model = CatBoostClassifier(**params)

In [39]:
grid = {'learning_rate': [0.01, 0.1],
        'depth': [5, 6]}

In [40]:
result = model.grid_search(grid, train_full_data, verbose=False)

0:	test: 0.7811641	best: 0.7811641 (0)	total: 45.9ms	remaining: 45.9s
100:	test: 0.8435462	best: 0.8435769 (99)	total: 4.36s	remaining: 38.8s
200:	test: 0.8508923	best: 0.8508923 (200)	total: 8.38s	remaining: 33.3s
300:	test: 0.8547897	best: 0.8547897 (300)	total: 11.8s	remaining: 27.3s
400:	test: 0.8569385	best: 0.8569385 (400)	total: 15.1s	remaining: 22.5s
500:	test: 0.8576410	best: 0.8576410 (500)	total: 18.5s	remaining: 18.4s
600:	test: 0.8585538	best: 0.8585538 (600)	total: 21.8s	remaining: 14.5s
700:	test: 0.8590923	best: 0.8591487 (694)	total: 25.2s	remaining: 10.8s
800:	test: 0.8593590	best: 0.8595564 (749)	total: 29s	remaining: 7.22s
900:	test: 0.8593282	best: 0.8595564 (749)	total: 32.9s	remaining: 3.62s
999:	test: 0.8593897	best: 0.8595974 (986)	total: 36.5s	remaining: 0us

bestTest = 0.8595974359
bestIteration = 986

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	test: 0.7811641	best: 

# Принципы sklearn

https://towardsdatascience.com/how-to-use-sklearn-pipelines-for-ridiculously-neat-code-a61ab66ca90d

# Предобработка и фичеинжениринг

https://www.kaggle.com/learn/feature-engineering

https://catboost.ai/en/docs/concepts/quantization

# Калибровка

https://www.kaggle.com/residentmario/notes-on-classification-probability-calibration

#  Catboost, тексты и эмбеддинги

https://github.com/catboost/catboost/blob/master/catboost/tutorials/events/2020_06_04_catboost_tutorial/text_features.ipynb

https://youtu.be/ZaP5qFSIcIw?t=3802