In [1]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import roc_auc_score,roc_curve

from catboost import CatBoostClassifier

from data_manager import DataManager
from tqdm import tqdm_notebook as tqdm

In [None]:
def plot_roc_auc(model, X_train, X_val, y_train, y_val):
    """Report ROC AUC on the train and validation sets and draw both ROC curves.

    For each split the scores are taken from the second column of
    ``model.predict_proba`` (presumably the positive-class probability),
    the AUC is printed, and the ROC curve is added to the current
    matplotlib axes. A diagonal reference line and a legend are drawn last.

    Parameters
    ----------
    model : object exposing ``predict_proba(X)``.
    X_train, X_val : feature sets for the two splits.
    y_train, y_val : true labels for the two splits.
    """
    splits = (
        ("Train ROC AUC:", 'train AUC', X_train, y_train),
        ("Val ROC AUC:", 'validation AUC', X_val, y_val),
    )

    for report_prefix, curve_label, features, target in splits:
        scores = model.predict_proba(features)[:, 1]
        print(report_prefix, roc_auc_score(target, scores))

        false_pos_rate, true_pos_rate, _ = roc_curve(target, scores)
        plt.plot(false_pos_rate, true_pos_rate, label=curve_label)

    # Diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1])
    plt.legend(loc='lower right')
    
def plot_feature_importances(model, X):
    """Draw a horizontal lollipop chart of feature importances on a log x-axis.

    Features are sorted by importance (ascending, so the most important
    feature ends up at the top) and labelled with the column names of ``X``.

    Parameters
    ----------
    model : fitted model exposing ``feature_importances_`` (public attribute)
        or, as a fallback, the private ``_feature_importance`` attribute.
    X : pandas.DataFrame whose columns are in the same order as the
        importances; only ``X.shape[1]`` and ``X.columns`` are used.
    """
    # Prefer the public attribute; fall back to the private one the original
    # code relied on so models exposing only that name keep working.
    importances = getattr(model, 'feature_importances_', None)
    if importances is None:
        importances = model._feature_importance
    importances = np.asarray(importances)

    order = np.argsort(importances)
    sorted_importances = importances[order]
    positions = range(len(order))

    plt.figure(figsize=[6,9])
    plt.plot(sorted_importances, positions, marker='o')
    # Dotted guide line from the left edge to each marker.
    plt.hlines(positions, np.zeros_like(order), sorted_importances, linestyles=':')
    plt.yticks(range(X.shape[1]), X.columns[order]);
    plt.tick_params(labelsize=16)
    # Lower limit is 0.1 (not 0) because the axis is switched to log scale below.
    plt.xlim([0.1, importances.max()*1.5])
    plt.ylim(-1, len(order))
    plt.xscale('log')

In [None]:
# Build the data manager for a single city. Judging by the captured output of
# this cell, construction loads the train data and extracts features, which
# takes several minutes.
dm = DataManager(city_name='kazan')

Loading train df...
Loading train netatmo df...
Preprocessing train netatmo df...
Extracting features...


  interpolation=interpolation)
 51%|█████     | 16725/32803 [04:03<03:49, 69.93it/s]

In [None]:
# Short aliases for the training features, labels and block ids held by the data manager.
# block_ids is later used to mask X and y, so it is presumably row-aligned with them — verify.
X, y, block_ids = dm.X_train, dm.y_train, dm.train_block_ids

In [None]:
# Time-ordered hold-out: rows whose 'hours_since' is at or below the 85th
# percentile go to train, the last 15% are kept for validation.
validation_cutoff = np.percentile(block_ids['hours_since'], 85)
in_train = block_ids['hours_since'] <= validation_cutoff

X_train, X_val = X[in_train], X[~in_train]
y_train, y_val = y[in_train], y[~in_train]

In [None]:
# Sanity check: display the shapes of the four pieces produced by the split.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# Hyper-parameters for the single validation-monitored fit. With
# use_best_model=True and an eval_set, AUC on the validation split is the
# tracked metric.
model_params = dict(
    iterations=2000,
    depth=6,
    loss_function='Logloss',
    learning_rate=0.015,
    thread_count=12,
    use_best_model=True,
    eval_metric='AUC',
    random_seed=1,
    verbose=True,
)

model = CatBoostClassifier(**model_params).fit(
    X_train, y_train, eval_set=(X_val, y_val)
)

In [None]:
# Number of trees in the fitted model (presumably trimmed to the best iteration,
# since the model was built with use_best_model=True — confirm).
# stas_xgb reads this module-level name as its iteration count.
tree_count = model.tree_count_
print(tree_count)

In [None]:
def stas_xgb(X_train, y_train, X_val, y_val=None, n_bags=10, iterations=None):
    """Train a seed-bagged ensemble of CatBoost classifiers and average their predictions.

    ``n_bags`` models are fitted on ``(X_train, y_train)`` with seeds
    ``1..n_bags`` and otherwise identical settings. The second column of each
    model's ``predict_proba(X_val)`` is accumulated and the mean is returned.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_val : features to predict on; must expose ``.shape[0]``.
    y_val : optional true labels for ``X_val``. When given, the running
        ensemble AUC and the individual model AUC are printed after each bag.
    n_bags : number of models to average (default 10, the previously
        hard-coded value).
    iterations : number of boosting iterations per model. When ``None``
        (default) the module-level ``tree_count`` is used, as before.

    Returns
    -------
    numpy.ndarray of length ``X_val.shape[0]`` with the averaged scores.

    Raises
    ------
    ValueError
        If ``n_bags`` is smaller than 1 (the average would be undefined).
    """
    if n_bags < 1:
        raise ValueError("n_bags must be at least 1")
    if iterations is None:
        # Backward-compatible default: reuse the tree count found by the
        # validation-monitored fit elsewhere in the notebook.
        iterations = tree_count

    ytestxgb = np.zeros(X_val.shape[0])
    for bg in tqdm(range(n_bags)):
        seed = bg + 1

        model = CatBoostClassifier(iterations=iterations,
                                   depth=6,
                                   loss_function='Logloss',
                                   learning_rate=0.015,
                                   thread_count=12,
                                   eval_metric='AUC',
                                   random_seed=seed) \
                    .fit(X_train, y_train)

        ypredxgb = model.predict_proba(X_val)[:, 1]
        ytestxgb += ypredxgb

        if y_val is not None:
            # Running ensemble AUC (sum so far / bags so far) next to this bag's own AUC.
            print(bg, roc_auc_score(y_val, ytestxgb / (bg + 1.)), roc_auc_score(y_val, ypredxgb))
        else:
            print(bg)

    ytestxgb /= n_bags
    return ytestxgb

In [None]:
# Bagged run on the hold-out split: per-bag AUCs are printed by stas_xgb.
# Keep the averaged predictions in a variable instead of discarding them,
# and show only a short preview rather than the whole array.
y_val_bagged = stas_xgb(X_train, y_train, X_val, y_val)
y_val_bagged[:10]

In [None]:
# NOTE(review): this only displays the bound method's repr; it looks like a
# leftover inspection cell and can probably be removed.
model.predict

In [None]:
# Evaluate on the same split the model was fitted/monitored on. The hold-out
# split lives in the notebook variables (X_train/X_val/y_train/y_val), not on
# the data manager, and dm.X_train is the full set that also contains the
# validation rows.
plot_roc_auc(model, X_train, X_val, y_train, y_val)

In [None]:
# Feature importances of the validation-monitored model; dm.X_train is passed
# only for its column names and column count.
plot_feature_importances(model, dm.X_train)

## Final model and uploading the results

In [None]:
# Refit the bagged ensemble on ALL training rows and predict the test set.
# NOTE(review): X_test is not defined in any visible cell — presumably it
# should come from the data manager (e.g. dm.X_test); confirm before running.
y_pred = stas_xgb(X, y, X_test)

In [None]:
# This code saves the prediction for one city.
# NOTE(review): test_block_ids and CITY_PREDICTIONS_PATH are not defined in any
# visible cell — they must be set before this cell runs; confirm their source.
prediction_for_one_city = test_block_ids.copy()
# Use the bagged predictions of the final ensemble (y_pred) rather than the
# single model that was fitted on the 85% split for validation only.
prediction_for_one_city["prediction"] = y_pred
prediction_for_one_city.to_csv(CITY_PREDICTIONS_PATH)

prediction_for_one_city.head()

# WARNING! You must run this notebook for all three regions before proceeding!
# We assume that you have prediction_msk.csv, prediction_spb.csv and prediction_kazan.csv files prepared.

In [None]:
# Dump the training features with the label attached as a "target" column,
# plus the test features, for later use.
data = X.assign(target=y)
data.to_csv("intermediate_data/kazan.csv")
X_test.to_csv("intermediate_data/kazan_test.csv")