In [1]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import roc_auc_score,roc_curve

from catboost import CatBoostClassifier

from data_manager import DataManager
from tqdm import tqdm_notebook as tqdm
import pickle

In [2]:
def plot_roc_auc(model, X_train, X_val, y_train, y_val):
    """Print ROC AUC and draw the ROC curve for the train and validation sets.

    `model` must provide `predict_proba`; column 1 of its output is used as
    the score passed to `roc_auc_score` and `roc_curve`. Both curves are drawn
    with pyplot on the current figure, followed by a diagonal from (0, 0) to
    (1, 1) and a legend in the lower-right corner.
    """
    splits = (
        ("Train ROC AUC:", 'train AUC', X_train, y_train),
        ("Val ROC AUC:", 'validation AUC', X_val, y_val),
    )
    for caption, curve_label, features, target in splits:
        scores = model.predict_proba(features)[:, 1]
        print(caption, roc_auc_score(target, scores))

        false_pos_rate, true_pos_rate, _thresholds = roc_curve(target, scores)
        plt.plot(false_pos_rate, true_pos_rate, label=curve_label)

    # Reference diagonal; it carries no label, so it stays out of the legend.
    plt.plot([0, 1], [0, 1])
    plt.legend(loc='lower right')
    
def plot_feature_importances(model, X):
    """Draw the model's feature importances as a horizontal stem plot.

    Features are sorted by importance (smallest at the bottom) and plotted on
    a logarithmic x axis in a new 6x9 figure.

    Parameters
    ----------
    model : fitted model exposing `feature_importances_` (or, as a fallback,
        the private `_feature_importance` field the notebook used originally).
    X : object with a `columns` index; only the column labels are used, to
        name the y ticks. It must have one column per importance value.
    """
    # Prefer the public attribute; fall back to the private field for
    # library versions that only populate that one.
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        importances = model._feature_importance
    importances = np.asarray(importances)

    order = np.argsort(importances)
    sorted_importances = importances[order]
    positions = range(len(order))

    # Left edge of the plot. The stems start here rather than at 0, because 0
    # has no position on a logarithmic axis.
    x_min = 0.1

    plt.figure(figsize=[6,9])
    plt.plot(sorted_importances, positions, marker='o')
    plt.hlines(positions, np.full(len(order), x_min), sorted_importances, linestyles=':')
    plt.yticks(positions, X.columns[order])
    plt.tick_params(labelsize=16)
    plt.xlim([x_min, sorted_importances.max()*1.5])
    plt.ylim(-1,len(order))
    plt.xscale('log')

In [None]:
# Build the project-local DataManager for the 'spb' city.
# NOTE(review): presumably this loads and prepares the city's data - confirm in data_manager.
dm = DataManager(city_name='spb')

In [None]:
# Cache the DataManager on disk; the next cell reloads it from this file.
with open('dm_spb.pickle', 'wb') as f:
    pickle.dump(dm, f)

In [3]:
# Restore the cached DataManager. Unpickling can execute arbitrary code, so
# only load a file that this notebook wrote itself.
with open('dm_spb.pickle', 'rb') as f:
    dm = pickle.load(f)

In [4]:
# Labelled data for this city: features, targets, and the block identifiers
# that the train/validation split below is computed from.
X = dm.X_train
y = dm.y_train
block_ids = dm.train_block_ids

In [6]:
# Replace missing feature values with the sentinel -999. Any data later scored
# by models trained on X should be filled the same way.
X = X.fillna(-999)

In [7]:
# Time-based split: rows up to the 85th percentile of 'hours_since' are used
# for training, the remaining (last 15%) are kept for validation.
hours_cutoff = np.percentile(block_ids['hours_since'], 85)
in_train = block_ids['hours_since'] <= hours_cutoff
X_train, X_val = X[in_train], X[~in_train]
y_train, y_val = y[in_train], y[~in_train]

In [8]:
# Sanity check: shapes of the train/validation features and targets.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((28041, 73), (4762, 73), (28041,), (4762,))

In [9]:
# Exploratory model: Logloss objective, with AUC reported on the hold-out split
# passed as eval_set. With use_best_model=True the fitted model is shrunk to
# its best validation iteration (see the end of the training log).
model = (
    CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        iterations=2000,
        learning_rate=0.015,
        depth=6,
        use_best_model=True,
        random_seed=1,
        thread_count=12,
        verbose=True,
    )
    .fit(X_train, y_train, eval_set=(X_val, y_val))
)

Borders for float features generated
0:	learn 0.5759663286	test 0.5399139823	bestTest 0.5399139823		total: 88.2ms	remaining: 2m 56s
1:	learn 0.5865843208	test 0.5290183933	bestTest 0.5399139823		total: 127ms	remaining: 2m 6s
2:	learn 0.6072525222	test 0.5343802783	bestTest 0.5399139823		total: 162ms	remaining: 1m 48s
3:	learn 0.6136458818	test 0.5466863824	bestTest 0.5466863824		total: 200ms	remaining: 1m 39s
4:	learn 0.6353058657	test 0.5679106277	bestTest 0.5679106277		total: 233ms	remaining: 1m 33s
5:	learn 0.634475304	test 0.5677698348	bestTest 0.5679106277		total: 265ms	remaining: 1m 28s
6:	learn 0.66282094	test 0.5475562902	bestTest 0.5679106277		total: 299ms	remaining: 1m 25s
7:	learn 0.6626440435	test 0.5509301502	bestTest 0.5679106277		total: 330ms	remaining: 1m 22s
8:	learn 0.6647130079	test 0.5454126644	bestTest 0.5679106277		total: 361ms	remaining: 1m 19s
9:	learn 0.6643980952	test 0.5482475624	bestTest 0.5679106277		total: 392ms	remaining: 1m 17s
10:	learn 0.6644163031	tes

93:	learn 0.6946067426	test 0.553164504	bestTest 0.5679106277		total: 2.89s	remaining: 58.7s
94:	learn 0.6951150576	test 0.5540713141	bestTest 0.5679106277		total: 2.93s	remaining: 58.7s
95:	learn 0.694714991	test 0.5533624133	bestTest 0.5679106277		total: 2.96s	remaining: 58.7s
96:	learn 0.6949094349	test 0.5536599824	bestTest 0.5679106277		total: 3s	remaining: 58.8s
97:	learn 0.6951992545	test 0.5532763862	bestTest 0.5679106277		total: 3.03s	remaining: 58.8s
98:	learn 0.6979730951	test 0.5580158675	bestTest 0.5679106277		total: 3.06s	remaining: 58.7s
99:	learn 0.7001493932	test 0.5571367929	bestTest 0.5679106277		total: 3.09s	remaining: 58.7s
100:	learn 0.7007649572	test 0.5584243787	bestTest 0.5679106277		total: 3.12s	remaining: 58.7s
101:	learn 0.7017904286	test 0.5587478029	bestTest 0.5679106277		total: 3.15s	remaining: 58.6s
102:	learn 0.7019691714	test 0.5591516131	bestTest 0.5679106277		total: 3.18s	remaining: 58.5s
103:	learn 0.7022502173	test 0.5588465225	bestTest 0.567910627

184:	learn 0.7405190982	test 0.6074391335	bestTest 0.6074391335		total: 5.6s	remaining: 54.9s
185:	learn 0.7400310914	test 0.607048956	bestTest 0.6074391335		total: 5.63s	remaining: 54.9s
186:	learn 0.7413527171	test 0.6093627555	bestTest 0.6093627555		total: 5.66s	remaining: 54.8s
187:	learn 0.7416390984	test 0.6094981424	bestTest 0.6094981424		total: 5.69s	remaining: 54.8s
188:	learn 0.7415081537	test 0.6089706977	bestTest 0.6094981424		total: 5.72s	remaining: 54.8s
189:	learn 0.7420966344	test 0.6102155049	bestTest 0.6102155049		total: 5.75s	remaining: 54.8s
190:	learn 0.7424537811	test 0.6109991505	bestTest 0.6109991505		total: 5.78s	remaining: 54.7s
191:	learn 0.7429038646	test 0.6116065112	bestTest 0.6116065112		total: 5.8s	remaining: 54.7s
192:	learn 0.7436620264	test 0.6113662935	bestTest 0.6116065112		total: 5.83s	remaining: 54.6s
193:	learn 0.7441146166	test 0.611545399	bestTest 0.6116065112		total: 5.86s	remaining: 54.6s
194:	learn 0.7446588055	test 0.6119266448	bestTest 0.6

275:	learn 0.7765574804	test 0.6535999515	bestTest 0.6535999515		total: 8.3s	remaining: 51.9s
276:	learn 0.7765640014	test 0.653401102	bestTest 0.6535999515		total: 8.33s	remaining: 51.8s
277:	learn 0.7772640164	test 0.6546788158	bestTest 0.6546788158		total: 8.36s	remaining: 51.8s
278:	learn 0.7777470435	test 0.655113182	bestTest 0.655113182		total: 8.39s	remaining: 51.8s
279:	learn 0.7781115412	test 0.6552847661	bestTest 0.6552847661		total: 8.42s	remaining: 51.7s
280:	learn 0.778560964	test 0.6557957576	bestTest 0.6557957576		total: 8.45s	remaining: 51.7s
281:	learn 0.7784969229	test 0.6556302847	bestTest 0.6557957576		total: 8.48s	remaining: 51.7s
282:	learn 0.7788598792	test 0.6570654798	bestTest 0.6570654798		total: 8.51s	remaining: 51.7s
283:	learn 0.7794662461	test 0.6578260908	bestTest 0.6578260908		total: 8.54s	remaining: 51.6s
284:	learn 0.7795725466	test 0.6578768609	bestTest 0.6578768609		total: 8.57s	remaining: 51.6s
285:	learn 0.7798833689	test 0.657877331	bestTest 0.657

366:	learn 0.7981827252	test 0.6697763343	bestTest 0.6697763343		total: 11s	remaining: 48.7s
367:	learn 0.7982982906	test 0.6699112511	bestTest 0.6699112511		total: 11s	remaining: 48.7s
368:	learn 0.7985108747	test 0.6699596707	bestTest 0.6699596707		total: 11s	remaining: 48.7s
369:	learn 0.7989605009	test 0.6703559594	bestTest 0.6703559594		total: 11s	remaining: 48.6s
370:	learn 0.7992736436	test 0.6711602891	bestTest 0.6711602891		total: 11.1s	remaining: 48.6s
371:	learn 0.7998623784	test 0.6708650705	bestTest 0.6711602891		total: 11.1s	remaining: 48.6s
372:	learn 0.7999193905	test 0.6709398154	bestTest 0.6711602891		total: 11.1s	remaining: 48.5s
373:	learn 0.8002016221	test 0.6710140901	bestTest 0.6711602891		total: 11.2s	remaining: 48.5s
374:	learn 0.8001661039	test 0.6706887855	bestTest 0.6711602891		total: 11.2s	remaining: 48.5s
375:	learn 0.8003715742	test 0.6705891257	bestTest 0.6711602891		total: 11.2s	remaining: 48.4s
376:	learn 0.8005321767	test 0.670706179	bestTest 0.671160

456:	learn 0.8166584022	test 0.6778506579	bestTest 0.6778506579		total: 13.6s	remaining: 46s
457:	learn 0.8167927176	test 0.6778092897	bestTest 0.6778506579		total: 13.7s	remaining: 46s
458:	learn 0.8170691735	test 0.6780617298	bestTest 0.6780617298		total: 13.7s	remaining: 45.9s
459:	learn 0.8172391764	test 0.6781806635	bestTest 0.6781806635		total: 13.7s	remaining: 45.9s
460:	learn 0.8172743558	test 0.6781336541	bestTest 0.6781806635		total: 13.7s	remaining: 45.9s
461:	learn 0.81763572	test 0.6786855437	bestTest 0.6786855437		total: 13.8s	remaining: 45.8s
462:	learn 0.8177673253	test 0.6785332335	bestTest 0.6786855437		total: 13.8s	remaining: 45.8s
463:	learn 0.818188818	test 0.678782383	bestTest 0.678782383		total: 13.8s	remaining: 45.8s
464:	learn 0.8181879203	test 0.6786042176	bestTest 0.678782383		total: 13.9s	remaining: 45.7s
465:	learn 0.8182417989	test 0.6785435755	bestTest 0.678782383		total: 13.9s	remaining: 45.7s
466:	learn 0.8186894772	test 0.6790451652	bestTest 0.67904516

547:	learn 0.8311855807	test 0.6896034621	bestTest 0.6898032518		total: 16.3s	remaining: 43.2s
548:	learn 0.8314870026	test 0.6895982911	bestTest 0.6898032518		total: 16.3s	remaining: 43.2s
549:	learn 0.8316102238	test 0.6895197855	bestTest 0.6898032518		total: 16.4s	remaining: 43.1s
550:	learn 0.8316581573	test 0.6893688856	bestTest 0.6898032518		total: 16.4s	remaining: 43.1s
551:	learn 0.8316703524	test 0.6893858089	bestTest 0.6898032518		total: 16.4s	remaining: 43.1s
552:	learn 0.8319513814	test 0.6895428201	bestTest 0.6898032518		total: 16.5s	remaining: 43.1s
553:	learn 0.8324644728	test 0.690105992	bestTest 0.690105992		total: 16.5s	remaining: 43s
554:	learn 0.832624364	test 0.6900194948	bestTest 0.690105992		total: 16.5s	remaining: 43s
555:	learn 0.8328190958	test 0.6898667144	bestTest 0.690105992		total: 16.5s	remaining: 43s
556:	learn 0.8328500747	test 0.690277576	bestTest 0.690277576		total: 16.6s	remaining: 42.9s
557:	learn 0.832989573	test 0.6903786461	bestTest 0.6903786461	

638:	learn 0.8435454888	test 0.700186204	bestTest 0.700186204		total: 19s	remaining: 40.4s
639:	learn 0.8436774329	test 0.7002228713	bestTest 0.7002228713		total: 19s	remaining: 40.4s
640:	learn 0.8437210642	test 0.7001739816	bestTest 0.7002228713		total: 19s	remaining: 40.4s
641:	learn 0.8437913384	test 0.7001890245	bestTest 0.7002228713		total: 19.1s	remaining: 40.3s
642:	learn 0.8441145929	test 0.7001114591	bestTest 0.7002228713		total: 19.1s	remaining: 40.3s
643:	learn 0.8442215879	test 0.7000141498	bestTest 0.7002228713		total: 19.1s	remaining: 40.3s
644:	learn 0.8444449782	test 0.700055518	bestTest 0.7002228713		total: 19.2s	remaining: 40.3s
645:	learn 0.8445437414	test 0.6999629096	bestTest 0.7002228713		total: 19.2s	remaining: 40.2s
646:	learn 0.8447147098	test 0.7000649199	bestTest 0.7002228713		total: 19.2s	remaining: 40.2s
647:	learn 0.844804259	test 0.7000762021	bestTest 0.7002228713		total: 19.3s	remaining: 40.2s
648:	learn 0.8449506339	test 0.699969961	bestTest 0.70022287

729:	learn 0.8549936955	test 0.7035116444	bestTest 0.704048021		total: 21.6s	remaining: 37.6s
730:	learn 0.8552645111	test 0.7036512622	bestTest 0.704048021		total: 21.7s	remaining: 37.6s
731:	learn 0.8554093109	test 0.7037452809	bestTest 0.704048021		total: 21.7s	remaining: 37.6s
732:	learn 0.8555562955	test 0.7037485715	bestTest 0.704048021		total: 21.7s	remaining: 37.6s
733:	learn 0.8556731313	test 0.7036587837	bestTest 0.704048021		total: 21.8s	remaining: 37.5s
734:	learn 0.8557327856	test 0.7036512622	bestTest 0.704048021		total: 21.8s	remaining: 37.5s
735:	learn 0.8558114778	test 0.7036103641	bestTest 0.704048021		total: 21.8s	remaining: 37.5s
736:	learn 0.8558395942	test 0.7036305781	bestTest 0.704048021		total: 21.8s	remaining: 37.4s
737:	learn 0.8559446922	test 0.7038397696	bestTest 0.704048021		total: 21.9s	remaining: 37.4s
738:	learn 0.8559902374	test 0.7038125042	bestTest 0.704048021		total: 21.9s	remaining: 37.4s
739:	learn 0.8560854607	test 0.7038895995	bestTest 0.7040480

819:	learn 0.8664493208	test 0.7080950548	bestTest 0.7082586472		total: 24.3s	remaining: 35s
820:	learn 0.8665717121	test 0.7083587771	bestTest 0.7083587771		total: 24.3s	remaining: 34.9s
821:	learn 0.8667270132	test 0.7084631379	bestTest 0.7084631379		total: 24.4s	remaining: 34.9s
822:	learn 0.8668786218	test 0.7084936939	bestTest 0.7084936939		total: 24.4s	remaining: 34.9s
823:	learn 0.8669089571	test 0.7084626678	bestTest 0.7084936939		total: 24.4s	remaining: 34.9s
824:	learn 0.8669910538	test 0.7084175388	bestTest 0.7084936939		total: 24.5s	remaining: 34.8s
825:	learn 0.8672242679	test 0.7085209593	bestTest 0.7085209593		total: 24.5s	remaining: 34.8s
826:	learn 0.8673027229	test 0.7083968547	bestTest 0.7085209593		total: 24.5s	remaining: 34.8s
827:	learn 0.8674806696	test 0.7085684388	bestTest 0.7085684388		total: 24.5s	remaining: 34.7s
828:	learn 0.8675483523	test 0.7085519855	bestTest 0.7085684388		total: 24.6s	remaining: 34.7s
829:	learn 0.8676412552	test 0.7085905332	bestTest 0

910:	learn 0.8757721947	test 0.7115178046	bestTest 0.7117603727		total: 27s	remaining: 32.3s
911:	learn 0.8758811205	test 0.7116343877	bestTest 0.7117603727		total: 27s	remaining: 32.3s
912:	learn 0.8759813574	test 0.7119281961	bestTest 0.7119281961		total: 27.1s	remaining: 32.2s
913:	learn 0.8761547309	test 0.7120508904	bestTest 0.7120508904		total: 27.1s	remaining: 32.2s
914:	learn 0.8762264278	test 0.7117622531	bestTest 0.7120508904		total: 27.1s	remaining: 32.2s
915:	learn 0.8763061532	test 0.7111619439	bestTest 0.7120508904		total: 27.2s	remaining: 32.2s
916:	learn 0.8764348961	test 0.7110679252	bestTest 0.7120508904		total: 27.2s	remaining: 32.1s
917:	learn 0.8765696518	test 0.7110481813	bestTest 0.7120508904		total: 27.2s	remaining: 32.1s
918:	learn 0.8766666536	test 0.7109673252	bestTest 0.7120508904		total: 27.3s	remaining: 32.1s
919:	learn 0.8767501899	test 0.7110561729	bestTest 0.7120508904		total: 27.3s	remaining: 32s
920:	learn 0.8768598272	test 0.7109645047	bestTest 0.712

1001:	learn 0.8845842976	test 0.7140201115	bestTest 0.7140201115		total: 29.7s	remaining: 29.6s
1002:	learn 0.8847007098	test 0.7139463069	bestTest 0.7140201115		total: 29.7s	remaining: 29.5s
1003:	learn 0.8847899372	test 0.7140055386	bestTest 0.7140201115		total: 29.7s	remaining: 29.5s
1004:	learn 0.8848346526	test 0.7139124602	bestTest 0.7140201115		total: 29.8s	remaining: 29.5s
1005:	learn 0.8849638527	test 0.7140130601	bestTest 0.7140201115		total: 29.8s	remaining: 29.4s
1006:	learn 0.8850209156	test 0.7140248125	bestTest 0.7140248125		total: 29.8s	remaining: 29.4s
1007:	learn 0.8850915285	test 0.7140464368	bestTest 0.7140464368		total: 29.9s	remaining: 29.4s
1008:	learn 0.8851851258	test 0.7140685312	bestTest 0.7140685312		total: 29.9s	remaining: 29.4s
1009:	learn 0.8852737943	test 0.7140069489	bestTest 0.7140685312		total: 29.9s	remaining: 29.3s
1010:	learn 0.8852707116	test 0.7139542985	bestTest 0.7140685312		total: 29.9s	remaining: 29.3s
1011:	learn 0.8853102779	test 0.71389130

1092:	learn 0.891955212	test 0.7165369914	bestTest 0.716655925		total: 32.4s	remaining: 26.9s
1093:	learn 0.891995947	test 0.7166723783	bestTest 0.7166723783		total: 32.4s	remaining: 26.8s
1094:	learn 0.8920220818	test 0.716721268	bestTest 0.716721268		total: 32.4s	remaining: 26.8s
1095:	learn 0.8921745712	test 0.7166371213	bestTest 0.716721268		total: 32.5s	remaining: 26.8s
1096:	learn 0.892223775	test 0.716640412	bestTest 0.716721268		total: 32.5s	remaining: 26.8s
1097:	learn 0.8922928297	test 0.7166145568	bestTest 0.716721268		total: 32.5s	remaining: 26.7s
1098:	learn 0.892317457	test 0.7165412223	bestTest 0.716721268		total: 32.6s	remaining: 26.7s
1099:	learn 0.8924578699	test 0.7166893017	bestTest 0.716721268		total: 32.6s	remaining: 26.7s
1100:	learn 0.8925646785	test 0.7167066951	bestTest 0.716721268		total: 32.6s	remaining: 26.6s
1101:	learn 0.8926342752	test 0.7166991736	bestTest 0.716721268		total: 32.6s	remaining: 26.6s
1102:	learn 0.8926877981	test 0.7168087054	bestTest 0.7

1182:	learn 0.8976933228	test 0.7165271195	bestTest 0.7168679371		total: 35s	remaining: 24.2s
1183:	learn 0.8977189494	test 0.7164500241	bestTest 0.7168679371		total: 35s	remaining: 24.2s
1184:	learn 0.89771336	test 0.7163780999	bestTest 0.7168679371		total: 35.1s	remaining: 24.1s
1185:	learn 0.8978451178	test 0.7161106167	bestTest 0.7168679371		total: 35.1s	remaining: 24.1s
1186:	learn 0.8978591591	test 0.7160838214	bestTest 0.7168679371		total: 35.1s	remaining: 24.1s
1187:	learn 0.8980564485	test 0.7160589065	bestTest 0.7168679371		total: 35.2s	remaining: 24s
1188:	learn 0.8980866652	test 0.7160880522	bestTest 0.7168679371		total: 35.2s	remaining: 24s
1189:	learn 0.898098115	test 0.7162948933	bestTest 0.7168679371		total: 35.2s	remaining: 24s
1190:	learn 0.8981253507	test 0.7162441232	bestTest 0.7168679371		total: 35.2s	remaining: 23.9s
1191:	learn 0.8982276709	test 0.7160857018	bestTest 0.7168679371		total: 35.3s	remaining: 23.9s
1192:	learn 0.8982342427	test 0.7160490345	bestTest 0

1273:	learn 0.903152166	test 0.7162206186	bestTest 0.7173037137		total: 37.7s	remaining: 21.5s
1274:	learn 0.9031952384	test 0.7162018148	bestTest 0.7173037137		total: 37.7s	remaining: 21.4s
1275:	learn 0.9032525553	test 0.7162154476	bestTest 0.7173037137		total: 37.7s	remaining: 21.4s
1276:	learn 0.9033219996	test 0.7162474139	bestTest 0.7173037137		total: 37.8s	remaining: 21.4s
1277:	learn 0.903324337	test 0.7162638672	bestTest 0.7173037137		total: 37.8s	remaining: 21.3s
1278:	learn 0.9033912236	test 0.7164805802	bestTest 0.7173037137		total: 37.8s	remaining: 21.3s
1279:	learn 0.9034154783	test 0.7164246391	bestTest 0.7173037137		total: 37.8s	remaining: 21.3s
1280:	learn 0.903466884	test 0.7163390821	bestTest 0.7173037137		total: 37.9s	remaining: 21.3s
1281:	learn 0.9035190519	test 0.7161698485	bestTest 0.7173037137		total: 37.9s	remaining: 21.2s
1282:	learn 0.9035989805	test 0.7161552756	bestTest 0.7173037137		total: 37.9s	remaining: 21.2s
1283:	learn 0.9036706267	test 0.7161190784	

1364:	learn 0.9078822511	test 0.7162963036	bestTest 0.7173037137		total: 40.3s	remaining: 18.8s
1365:	learn 0.907930625	test 0.7163000644	bestTest 0.7173037137		total: 40.3s	remaining: 18.7s
1366:	learn 0.9080123829	test 0.7163804503	bestTest 0.7173037137		total: 40.4s	remaining: 18.7s
1367:	learn 0.9080757296	test 0.7163936129	bestTest 0.7173037137		total: 40.4s	remaining: 18.7s
1368:	learn 0.9081268473	test 0.7163691681	bestTest 0.7173037137		total: 40.4s	remaining: 18.6s
1369:	learn 0.908212467	test 0.7164810503	bestTest 0.7173037137		total: 40.5s	remaining: 18.6s
1370:	learn 0.9082704614	test 0.7165073755	bestTest 0.7173037137		total: 40.5s	remaining: 18.6s
1371:	learn 0.9083587065	test 0.7165153671	bestTest 0.7173037137		total: 40.5s	remaining: 18.5s
1372:	learn 0.9083651428	test 0.7165163073	bestTest 0.7173037137		total: 40.5s	remaining: 18.5s
1373:	learn 0.9083929713	test 0.7164819905	bestTest 0.7173037137		total: 40.6s	remaining: 18.5s
1374:	learn 0.9084599087	test 0.7164829307

1455:	learn 0.912537997	test 0.7163517746	bestTest 0.7173037137		total: 43s	remaining: 16.1s
1456:	learn 0.9125526142	test 0.7163320307	bestTest 0.7173037137		total: 43s	remaining: 16s
1457:	learn 0.9126298667	test 0.7162624569	bestTest 0.7173037137		total: 43.1s	remaining: 16s
1458:	learn 0.9126571363	test 0.7162652774	bestTest 0.7173037137		total: 43.1s	remaining: 16s
1459:	learn 0.9126828984	test 0.7162384821	bestTest 0.7173037137		total: 43.1s	remaining: 15.9s
1460:	learn 0.9127615059	test 0.7162901924	bestTest 0.7173037137		total: 43.1s	remaining: 15.9s
1461:	learn 0.9128245308	test 0.7163997241	bestTest 0.7173037137		total: 43.2s	remaining: 15.9s
1462:	learn 0.9128826099	test 0.7163108765	bestTest 0.7173037137		total: 43.2s	remaining: 15.9s
1463:	learn 0.9128975658	test 0.7162704485	bestTest 0.7173037137		total: 43.2s	remaining: 15.8s
1464:	learn 0.9129266308	test 0.7163395522	bestTest 0.7173037137		total: 43.3s	remaining: 15.8s
1465:	learn 0.9129844727	test 0.7162591662	bestTest

1545:	learn 0.9166558718	test 0.7163080559	bestTest 0.7173037137		total: 45.6s	remaining: 13.4s
1546:	learn 0.9167124942	test 0.7163207485	bestTest 0.7173037137		total: 45.7s	remaining: 13.4s
1547:	learn 0.9167786865	test 0.7163945531	bestTest 0.7173037137		total: 45.7s	remaining: 13.3s
1548:	learn 0.9168136118	test 0.7164114765	bestTest 0.7173037137		total: 45.7s	remaining: 13.3s
1549:	learn 0.9168652208	test 0.7164349812	bestTest 0.7173037137		total: 45.8s	remaining: 13.3s
1550:	learn 0.9169105797	test 0.7163395522	bestTest 0.7173037137		total: 45.8s	remaining: 13.3s
1551:	learn 0.916928415	test 0.7162803204	bestTest 0.7173037137		total: 45.8s	remaining: 13.2s
1552:	learn 0.9169967075	test 0.7165741288	bestTest 0.7173037137		total: 45.8s	remaining: 13.2s
1553:	learn 0.9170695393	test 0.716449554	bestTest 0.7173037137		total: 45.9s	remaining: 13.2s
1554:	learn 0.9171237228	test 0.7164721185	bestTest 0.7173037137		total: 45.9s	remaining: 13.1s
1555:	learn 0.9171706061	test 0.7165163073

1636:	learn 0.9210246096	test 0.7166197278	bestTest 0.7173037137		total: 48.3s	remaining: 10.7s
1637:	learn 0.9210453921	test 0.7165919923	bestTest 0.7173037137		total: 48.4s	remaining: 10.7s
1638:	learn 0.9210732206	test 0.7165778895	bestTest 0.7173037137		total: 48.4s	remaining: 10.7s
1639:	learn 0.9210999312	test 0.7165708381	bestTest 0.7173037137		total: 48.4s	remaining: 10.6s
1640:	learn 0.9211446296	test 0.7165355811	bestTest 0.7173037137		total: 48.5s	remaining: 10.6s
1641:	learn 0.9211734236	test 0.7165412223	bestTest 0.7173037137		total: 48.5s	remaining: 10.6s
1642:	learn 0.9212233726	test 0.7165515643	bestTest 0.7173037137		total: 48.5s	remaining: 10.5s
1643:	learn 0.9212753711	test 0.716489512	bestTest 0.7173037137		total: 48.5s	remaining: 10.5s
1644:	learn 0.9213377524	test 0.7164335709	bestTest 0.7173037137		total: 48.6s	remaining: 10.5s
1645:	learn 0.9213613465	test 0.716499854	bestTest 0.7173037137		total: 48.6s	remaining: 10.5s
1646:	learn 0.9213557909	test 0.7164810503

1727:	learn 0.9250509365	test 0.7147149095	bestTest 0.7173037137		total: 51s	remaining: 8.03s
1728:	learn 0.9250949405	test 0.7147003366	bestTest 0.7173037137		total: 51.1s	remaining: 8s
1729:	learn 0.9251394187	test 0.7147478161	bestTest 0.7173037137		total: 51.1s	remaining: 7.97s
1730:	learn 0.9252489881	test 0.7147882441	bestTest 0.7173037137		total: 51.1s	remaining: 7.94s
1731:	learn 0.9252801872	test 0.7147670899	bestTest 0.7173037137		total: 51.1s	remaining: 7.91s
1732:	learn 0.9252996316	test 0.7148564076	bestTest 0.7173037137		total: 51.2s	remaining: 7.88s
1733:	learn 0.9253088287	test 0.7148385441	bestTest 0.7173037137		total: 51.2s	remaining: 7.85s
1734:	learn 0.9253338456	test 0.7147243114	bestTest 0.7173037137		total: 51.2s	remaining: 7.82s
1735:	learn 0.9253685508	test 0.714823031	bestTest 0.7173037137		total: 51.3s	remaining: 7.79s
1736:	learn 0.9254334388	test 0.714692345	bestTest 0.7173037137		total: 51.3s	remaining: 7.76s
1737:	learn 0.9254614706	test 0.7146777722	best

1817:	learn 0.9284501987	test 0.713695277	bestTest 0.7173037137		total: 53.7s	remaining: 5.37s
1818:	learn 0.9285268246	test 0.713725363	bestTest 0.7173037137		total: 53.7s	remaining: 5.34s
1819:	learn 0.9285737587	test 0.7137855349	bestTest 0.7173037137		total: 53.7s	remaining: 5.31s
1820:	learn 0.9286174408	test 0.7138207919	bestTest 0.7173037137		total: 53.8s	remaining: 5.28s
1821:	learn 0.9286506555	test 0.7137446368	bestTest 0.7173037137		total: 53.8s	remaining: 5.25s
1822:	learn 0.9286819224	test 0.7137596798	bestTest 0.7173037137		total: 53.8s	remaining: 5.23s
1823:	learn 0.9286944731	test 0.7137625004	bestTest 0.7173037137		total: 53.9s	remaining: 5.2s
1824:	learn 0.9287810075	test 0.7137775434	bestTest 0.7173037137		total: 53.9s	remaining: 5.17s
1825:	learn 0.9287985548	test 0.7137925863	bestTest 0.7173037137		total: 53.9s	remaining: 5.14s
1826:	learn 0.9288320574	test 0.7138494676	bestTest 0.7173037137		total: 53.9s	remaining: 5.11s
1827:	learn 0.9288307024	test 0.7138071592	

1908:	learn 0.9318661954	test 0.7123602119	bestTest 0.7173037137		total: 56.4s	remaining: 2.69s
1909:	learn 0.9318805754	test 0.7123644427	bestTest 0.7173037137		total: 56.4s	remaining: 2.66s
1910:	learn 0.9319015442	test 0.7123475193	bestTest 0.7173037137		total: 56.4s	remaining: 2.63s
1911:	learn 0.9319405176	test 0.7124044006	bestTest 0.7173037137		total: 56.5s	remaining: 2.6s
1912:	learn 0.9319891456	test 0.7122798259	bestTest 0.7173037137		total: 56.5s	remaining: 2.57s
1913:	learn 0.9320371807	test 0.7123465792	bestTest 0.7173037137		total: 56.5s	remaining: 2.54s
1914:	learn 0.9320606393	test 0.7123094418	bestTest 0.7173037137		total: 56.5s	remaining: 2.51s
1915:	learn 0.9320783391	test 0.712360682	bestTest 0.7173037137		total: 56.6s	remaining: 2.48s
1916:	learn 0.9320872144	test 0.712386067	bestTest 0.7173037137		total: 56.6s	remaining: 2.45s
1917:	learn 0.9320985287	test 0.712391238	bestTest 0.7173037137		total: 56.6s	remaining: 2.42s
1918:	learn 0.9321370449	test 0.7123592717	b

1999:	learn 0.9344536204	test 0.7121199942	bestTest 0.7173037137		total: 59s	remaining: 0us

bestTest = 0.7173037137
bestIteration = 1221

Shrink model to first 1222 iterations.


In [None]:
# Number of trees in the fitted model; stas_xgb reuses it as the iteration
# budget for every bagged model.
# NOTE(review): presumably the 1222 iterations kept after shrinking - confirm from the printed value.
tree_count = model.tree_count_
print(tree_count)

In [None]:
def stas_xgb(X_train, y_train, X_val, y_val=None, n_bags=10, iterations=None):
    """Average the predictions of several CatBoost models that differ only in seed.

    Despite the name, the models are CatBoostClassifier instances. Model
    number `bag` (0-based) is trained on (X_train, y_train) with
    random_seed = bag + 1, and its class-1 probabilities for X_val are added
    to a running sum.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to `fit`.
    X_val : features to predict; must expose `shape[0]`.
    y_val : optional targets for X_val. When given, each round prints the bag
        index, the ROC AUC of the running average and the ROC AUC of the
        current model; otherwise only the bag index is printed.
    n_bags : number of models to train and average (default 10, the value
        that used to be hard-coded).
    iterations : number of boosting iterations per model. Defaults to the
        notebook-level `tree_count`, which is what the function always used.

    Returns
    -------
    numpy array with the mean class-1 probability for every row of X_val.

    Raises
    ------
    ValueError if n_bags is smaller than 1 (the average would be undefined).
    """
    if n_bags < 1:
        raise ValueError("n_bags must be at least 1")
    if iterations is None:
        # Backward-compatible default: read the notebook global at call time.
        iterations = tree_count

    averaged = np.zeros(X_val.shape[0])
    for bag in tqdm(range(n_bags)):
        bag_model = (
            CatBoostClassifier(iterations=iterations,
                               depth=6,
                               loss_function='Logloss',
                               learning_rate=0.015,
                               thread_count=12,
                               eval_metric='AUC',
                               random_seed=bag + 1)
            .fit(X_train, y_train)
        )

        bag_pred = bag_model.predict_proba(X_val)[:, 1]
        averaged += bag_pred  # holds the sum until the final division

        if y_val is not None:
            print(bag, roc_auc_score(y_val, averaged / (bag + 1.)), roc_auc_score(y_val, bag_pred))
        else:
            print(bag)

    averaged /= n_bags
    return averaged

In [None]:
# Check the seed-bagging procedure on the hold-out split; with y_val given,
# each round prints the running-average AUC and the single-model AUC.
stas_xgb(X_train, y_train, X_val, y_val)

In [None]:
# NOTE(review): leftover exploration - this bare attribute access only displays
# the bound method and has no effect on later cells.
model.predict

In [None]:
# Plot ROC curves on exactly the split the model was fitted and evaluated on
# (imputed features, 85% / 15% by 'hours_since'). The un-split dm.X_train
# would leak the hold-out rows into the "train" curve.
plot_roc_auc(model, X_train, X_val, y_train, y_val)

In [None]:
# The frame is only used for its column labels (y tick names).
plot_feature_importances(model, dm.X_train)

## Final model and uploading the results

In [None]:
y_pred = stas_xgb(X, y, X_test)

In [None]:
X_test, test_block_ids = dm.X_test, dm.test_block_ids

In [None]:
# Save the test-set prediction for this one city.
# The path is the file name that the submission-assembly cell reads back for 'spb'.
CITY_PREDICTIONS_PATH = "./intermediate_data/prediction_spb.csv"

prediction_for_one_city = test_block_ids.copy()
# Use the seed-bagged predictions of the final models (y_pred, trained on all
# labelled data), not the single exploratory model fitted on the 85% split.
prediction_for_one_city["prediction"] = y_pred
prediction_for_one_city.to_csv(CITY_PREDICTIONS_PATH)

prediction_for_one_city.head()

# WARNING: you must run this notebook for all three regions before proceeding!
# The following cells assume that prediction_msk.csv, prediction_spb.csv and
# prediction_kazan.csv have been prepared.

In [None]:
# Dump the training matrix (features plus a "target" column) and the test
# matrix for this city as CSV files.
data = X.assign(target=y)
data.to_csv("intermediate_data/spb.csv")
X_test.to_csv("intermediate_data/spb_test.csv")

In [None]:
# Stack the per-city prediction files (kazan, spb, msk) into one frame.
prediction_files = ["./intermediate_data/prediction_kazan.csv",
                    "./intermediate_data/prediction_spb.csv",
                    "./intermediate_data/prediction_msk.csv"]
per_city = []
for fname in prediction_files:
    per_city.append(pd.read_csv(fname, index_col=0))
predictions = pd.concat(per_city, ignore_index=True)

# Blocks that have to be submitted; there must be one prediction per block.
blocks = pd.read_csv("./data/hackathon_tosubmit.tsv", sep='\t')
assert len(predictions) == len(blocks),"Predictions don't match blocks. Sumbit at your own risk."

# Attach predictions to the blocks by square coordinates and hour hash; a
# block without a match ends up with a NaN prediction and trips the assert.
merged = blocks.merge(predictions, how='left', on=["sq_x","sq_y","hour_hash"])
assert not np.isnan(merged.prediction).any(), "some predictions are missing. Sumbit at your own risk."

In [None]:
# Write the submission file: comma-separated id,prediction rows with no header and no index column.
merged[['id','prediction']].to_csv("baseline_submission.csv",sep=',',index=False,header=False)

In [None]:
!head baseline_submission.csv