### Import Libraries

In [13]:
import pandas as pd

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from catboost import cv

### Import Dataset

In [14]:
df = pd.read_csv("train.csv")
df.sample(5)

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
4567,1,17226,4828,117961,118343,118395,118890,125128,118398,118892
11358,1,43194,3527,117961,118225,120054,118702,303450,118704,118705
2083,1,33642,67940,117980,118076,117941,118568,136753,19721,118570
22555,0,15805,18211,117961,118386,119214,118321,117906,290919,118322
32374,0,34434,25276,117961,118343,121747,118321,117906,290919,118322


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32769 entries, 0 to 32768
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   ACTION            32769 non-null  int64
 1   RESOURCE          32769 non-null  int64
 2   MGR_ID            32769 non-null  int64
 3   ROLE_ROLLUP_1     32769 non-null  int64
 4   ROLE_ROLLUP_2     32769 non-null  int64
 5   ROLE_DEPTNAME     32769 non-null  int64
 6   ROLE_TITLE        32769 non-null  int64
 7   ROLE_FAMILY_DESC  32769 non-null  int64
 8   ROLE_FAMILY       32769 non-null  int64
 9   ROLE_CODE         32769 non-null  int64
dtypes: int64(10)
memory usage: 2.5 MB


In [16]:
df.describe()

Unnamed: 0,ACTION,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
count,32769.0,32769.0,32769.0,32769.0,32769.0,32769.0,32769.0,32769.0,32769.0,32769.0
mean,0.94211,42923.916171,25988.957979,116952.627788,118301.823156,118912.779914,125916.152644,170178.369648,183703.408893,119789.430132
std,0.233539,34173.892702,35928.03165,10875.563591,4551.588572,18961.322917,31036.465825,69509.46213,100488.407413,5784.275516
min,0.0,0.0,25.0,4292.0,23779.0,4674.0,117879.0,4673.0,3130.0,117880.0
25%,1.0,20299.0,4566.0,117961.0,118102.0,118395.0,118274.0,117906.0,118363.0,118232.0
50%,1.0,35376.0,13545.0,117961.0,118300.0,118921.0,118568.0,128696.0,119006.0,118570.0
75%,1.0,74189.0,42034.0,117961.0,118386.0,120535.0,120006.0,235280.0,290919.0,119348.0
max,1.0,312153.0,311696.0,311178.0,286791.0,286792.0,311867.0,311867.0,308574.0,270691.0


In [17]:
df = df.astype("category")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32769 entries, 0 to 32768
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   ACTION            32769 non-null  category
 1   RESOURCE          32769 non-null  category
 2   MGR_ID            32769 non-null  category
 3   ROLE_ROLLUP_1     32769 non-null  category
 4   ROLE_ROLLUP_2     32769 non-null  category
 5   ROLE_DEPTNAME     32769 non-null  category
 6   ROLE_TITLE        32769 non-null  category
 7   ROLE_FAMILY_DESC  32769 non-null  category
 8   ROLE_FAMILY       32769 non-null  category
 9   ROLE_CODE         32769 non-null  category
dtypes: category(10)
memory usage: 1.2 MB


**Features and Target**

In [18]:
# Features
X = df[df.columns.drop("ACTION")]

# Target
y = df["ACTION"]

In [19]:
cat_features = list(range(0, X.shape[1]))
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8]


**Split Data into Training set and Validation set**

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  train_size = 0.8,
                                                  stratify = y,
                                                  random_state = 110046)

### CatBoost

In [21]:
model = CatBoostClassifier(iterations = 5,
                           learning_rate = 0.1)

model.fit(X_train, y_train, 
          eval_set = (X_val, y_val),  
          cat_features=cat_features, 
          logging_level = "Silent")

print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())


Model is fitted: True
Model params:
{'iterations': 5, 'learning_rate': 0.1}


### Stdout of the training

In [22]:
model = CatBoostClassifier(iterations = 15,
                           verbose = 3)

model.fit(X_train,
          y_train,
          eval_set = (X_val, y_val),
          cat_features = cat_features)

Learning rate set to 0.441257
0:	learn: 0.4219513	test: 0.4215770	best: 0.4215770 (0)	total: 54.4ms	remaining: 761ms
3:	learn: 0.2321977	test: 0.2306211	best: 0.2306211 (3)	total: 192ms	remaining: 527ms
6:	learn: 0.1880665	test: 0.1795845	best: 0.1795845 (6)	total: 299ms	remaining: 342ms
9:	learn: 0.1769311	test: 0.1671757	best: 0.1671757 (9)	total: 407ms	remaining: 203ms
12:	learn: 0.1743685	test: 0.1640860	best: 0.1640860 (12)	total: 506ms	remaining: 77.8ms
14:	learn: 0.1728999	test: 0.1629131	best: 0.1629131 (14)	total: 570ms	remaining: 0us

bestTest = 0.1629131171
bestIteration = 14



<catboost.core.CatBoostClassifier at 0x1a8ea7565b0>

In [23]:
model = CatBoostClassifier(iterations = 300,
                           learning_rate = 0.1)

model.fit(X_train, 
          y_train,
          cat_features = cat_features,
          eval_set = (X_val, y_val))

0:	learn: 0.5799308	test: 0.5796008	best: 0.5796008 (0)	total: 95.2ms	remaining: 28.5s
1:	learn: 0.4900977	test: 0.4883784	best: 0.4883784 (1)	total: 158ms	remaining: 23.6s
2:	learn: 0.4240053	test: 0.4216274	best: 0.4216274 (2)	total: 221ms	remaining: 21.9s
3:	learn: 0.3758284	test: 0.3738422	best: 0.3738422 (3)	total: 251ms	remaining: 18.5s
4:	learn: 0.3394803	test: 0.3378025	best: 0.3378025 (4)	total: 267ms	remaining: 15.7s
5:	learn: 0.3090953	test: 0.3076762	best: 0.3076762 (5)	total: 329ms	remaining: 16.1s
6:	learn: 0.2885221	test: 0.2873239	best: 0.2873239 (6)	total: 356ms	remaining: 14.9s
7:	learn: 0.2728059	test: 0.2717894	best: 0.2717894 (7)	total: 374ms	remaining: 13.6s
8:	learn: 0.2544578	test: 0.2530435	best: 0.2530435 (8)	total: 440ms	remaining: 14.2s
9:	learn: 0.2381071	test: 0.2368361	best: 0.2368361 (9)	total: 499ms	remaining: 14.5s
10:	learn: 0.2253750	test: 0.2236482	best: 0.2236482 (10)	total: 580ms	remaining: 15.2s
11:	learn: 0.2152026	test: 0.2126215	best: 0.212621

95:	learn: 0.1498811	test: 0.1414151	best: 0.1414030 (94)	total: 8.58s	remaining: 18.2s
96:	learn: 0.1496904	test: 0.1414562	best: 0.1414030 (94)	total: 8.69s	remaining: 18.2s
97:	learn: 0.1495091	test: 0.1414251	best: 0.1414030 (94)	total: 8.8s	remaining: 18.1s
98:	learn: 0.1493181	test: 0.1413651	best: 0.1413651 (98)	total: 8.92s	remaining: 18.1s
99:	learn: 0.1493180	test: 0.1413651	best: 0.1413651 (98)	total: 8.95s	remaining: 17.9s
100:	learn: 0.1492154	test: 0.1413786	best: 0.1413651 (98)	total: 9.07s	remaining: 17.9s
101:	learn: 0.1489777	test: 0.1412350	best: 0.1412350 (101)	total: 9.18s	remaining: 17.8s
102:	learn: 0.1488604	test: 0.1411102	best: 0.1411102 (102)	total: 9.29s	remaining: 17.8s
103:	learn: 0.1486673	test: 0.1410052	best: 0.1410052 (103)	total: 9.38s	remaining: 17.7s
104:	learn: 0.1485380	test: 0.1409629	best: 0.1409629 (104)	total: 9.46s	remaining: 17.6s
105:	learn: 0.1483479	test: 0.1408753	best: 0.1408753 (105)	total: 9.57s	remaining: 17.5s
106:	learn: 0.1481554	

188:	learn: 0.1413248	test: 0.1398330	best: 0.1398330 (188)	total: 17.4s	remaining: 10.2s
189:	learn: 0.1412330	test: 0.1397573	best: 0.1397573 (189)	total: 17.5s	remaining: 10.1s
190:	learn: 0.1411973	test: 0.1397740	best: 0.1397573 (189)	total: 17.6s	remaining: 10s
191:	learn: 0.1411067	test: 0.1397457	best: 0.1397457 (191)	total: 17.7s	remaining: 9.96s
192:	learn: 0.1410790	test: 0.1397378	best: 0.1397378 (192)	total: 17.8s	remaining: 9.87s
193:	learn: 0.1410716	test: 0.1397483	best: 0.1397378 (192)	total: 17.9s	remaining: 9.78s
194:	learn: 0.1409758	test: 0.1398131	best: 0.1397378 (192)	total: 18s	remaining: 9.68s
195:	learn: 0.1408989	test: 0.1398195	best: 0.1397378 (192)	total: 18.1s	remaining: 9.59s
196:	learn: 0.1408394	test: 0.1398219	best: 0.1397378 (192)	total: 18.2s	remaining: 9.5s
197:	learn: 0.1408194	test: 0.1398013	best: 0.1397378 (192)	total: 18.3s	remaining: 9.4s
198:	learn: 0.1407400	test: 0.1398182	best: 0.1397378 (192)	total: 18.4s	remaining: 9.32s
199:	learn: 0.14

280:	learn: 0.1339462	test: 0.1398794	best: 0.1397378 (192)	total: 26.4s	remaining: 1.78s
281:	learn: 0.1338490	test: 0.1398082	best: 0.1397378 (192)	total: 26.5s	remaining: 1.69s
282:	learn: 0.1338132	test: 0.1397866	best: 0.1397378 (192)	total: 26.6s	remaining: 1.59s
283:	learn: 0.1337190	test: 0.1397857	best: 0.1397378 (192)	total: 26.7s	remaining: 1.5s
284:	learn: 0.1336418	test: 0.1397642	best: 0.1397378 (192)	total: 26.7s	remaining: 1.41s
285:	learn: 0.1335853	test: 0.1397358	best: 0.1397358 (285)	total: 26.8s	remaining: 1.31s
286:	learn: 0.1335261	test: 0.1397683	best: 0.1397358 (285)	total: 26.9s	remaining: 1.22s
287:	learn: 0.1333306	test: 0.1397738	best: 0.1397358 (285)	total: 27s	remaining: 1.12s
288:	learn: 0.1331800	test: 0.1397923	best: 0.1397358 (285)	total: 27.1s	remaining: 1.03s
289:	learn: 0.1330617	test: 0.1396998	best: 0.1396998 (289)	total: 27.2s	remaining: 938ms
290:	learn: 0.1330260	test: 0.1397062	best: 0.1396998 (289)	total: 27.3s	remaining: 843ms
291:	learn: 0

<catboost.core.CatBoostClassifier at 0x1a8ea756130>

### Metrics calculation and graph plotting

In [24]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(
    iterations=50,
    random_seed=63,
    learning_rate=0.1,
    custom_loss=['Accuracy']
)
model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    logging_level='Silent',
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1a8ea756880>

In [25]:
model1 = CatBoostClassifier(
    learning_rate=0.5,
    iterations=1000,
    random_seed=64,
    train_dir='learning_rate_0.5',
    custom_loss = ['Accuracy']
)

model2 = CatBoostClassifier(
    learning_rate=0.05,
    iterations=1000,
    random_seed=64,
    train_dir='learning_rate_0.05',
    custom_loss = ['Accuracy']
)
model1.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    verbose=100
)
model2.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    verbose=100
)

0:	learn: 0.3050682	test: 0.3049603	best: 0.3049603 (0)	total: 46.2ms	remaining: 46.2s
100:	learn: 0.1234646	test: 0.1480838	best: 0.1448732 (64)	total: 9.17s	remaining: 1m 21s
200:	learn: 0.0969472	test: 0.1549302	best: 0.1448732 (64)	total: 18.3s	remaining: 1m 12s
300:	learn: 0.0773468	test: 0.1613076	best: 0.1448732 (64)	total: 27.2s	remaining: 1m 3s
400:	learn: 0.0614010	test: 0.1674592	best: 0.1448732 (64)	total: 35.5s	remaining: 53s
500:	learn: 0.0503851	test: 0.1746078	best: 0.1448732 (64)	total: 43.6s	remaining: 43.5s
600:	learn: 0.0420236	test: 0.1789223	best: 0.1448732 (64)	total: 51.5s	remaining: 34.2s
700:	learn: 0.0359321	test: 0.1808323	best: 0.1448732 (64)	total: 59.7s	remaining: 25.5s
800:	learn: 0.0312890	test: 0.1858735	best: 0.1448732 (64)	total: 1m 7s	remaining: 16.7s
900:	learn: 0.0283441	test: 0.1869941	best: 0.1448732 (64)	total: 1m 15s	remaining: 8.26s
999:	learn: 0.0264753	test: 0.1892092	best: 0.1448732 (64)	total: 1m 22s	remaining: 0us

bestTest = 0.144873188

<catboost.core.CatBoostClassifier at 0x1a8ea9cae50>

In [26]:
from catboost import MetricVisualizer
MetricVisualizer(['learning_rate_0.05', 'learning_rate_0.5']).start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [27]:
model = CatBoostClassifier(
    iterations=40,
    save_snapshot=True,
    snapshot_file='snapshot.bkp',
    random_seed=43
)
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    logging_level='Verbose'
)

Learning rate set to 0.288002

bestTest = 0.1588461462
bestIteration = 38

Shrink model to first 39 iterations.


<catboost.core.CatBoostClassifier at 0x1a8ea9cad90>

In [28]:
print(model.predict(data=X_val))

[1 1 1 ... 1 1 1]


### Feature importances

In [29]:
model = CatBoostClassifier(
    iterations=300,
    max_ctr_complexity=4,
    random_seed=43
)
model.fit(
    X, y,
    cat_features=cat_features,
    verbose=50
)

Learning rate set to 0.137885
0:	learn: 0.5382281	total: 72.3ms	remaining: 21.6s
50:	learn: 0.1491560	total: 3.26s	remaining: 15.9s
100:	learn: 0.1420838	total: 6.88s	remaining: 13.6s
150:	learn: 0.1367733	total: 10.5s	remaining: 10.3s
200:	learn: 0.1328624	total: 14.3s	remaining: 7.03s
250:	learn: 0.1286074	total: 18s	remaining: 3.52s
299:	learn: 0.1251185	total: 21.6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1a8ea9ca6a0>

In [30]:
importances = model.get_feature_importance(prettified=True)
print(importances)

         Feature Id  Importances
0          RESOURCE    21.869747
1     ROLE_DEPTNAME    16.984904
2            MGR_ID    16.436590
3     ROLE_ROLLUP_2    11.469503
4  ROLE_FAMILY_DESC     9.798158
5        ROLE_TITLE     8.632741
6     ROLE_ROLLUP_1     6.254129
7         ROLE_CODE     4.344507
8       ROLE_FAMILY     4.209722


### Hyperparameter Tuning

In [31]:
fast_model = CatBoostClassifier(
    random_seed=63,
    iterations=150,
    learning_rate=0.01,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    subsample=0.5,
    one_hot_max_size=20,
    rsm=0.5,
    leaf_estimation_iterations=5,
    max_ctr_complexity=1,
    border_count=32)

fast_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    logging_level='Silent',
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1a8ea241400>

### Accuracy

In [33]:
tunned_model = CatBoostClassifier(
    random_seed=63,
    iterations=1000,
    learning_rate=0.03,
    l2_leaf_reg=3,
    bagging_temperature=1,
    random_strength=1,
    one_hot_max_size=2,
    leaf_estimation_method='Newton',
    depth=6
)
tunned_model.fit(
    X_train, y_train,
    cat_features=cat_features,
    logging_level='Silent',
    eval_set=(X_val, y_val),
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1a8ea241c10>