In [1]:
import os
import json
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from tqdm import tqdm_notebook as tqdm

from data_manager import DataManager
from utils import *

In [2]:
def stas_xgb(X_train, y_train, X_val, y_val=None, n_bags=10):
    """Average the predictions of several randomly-jittered LightGBM models.

    Despite the name, every bag is an ``lgb.LGBMClassifier``. For each bag
    the tree shape (``max_depth``, ``num_leaves``) is perturbed at random,
    the model is fitted on the training data, and its positive-class
    probability for ``X_val`` is added to a running sum. The mean over all
    bags is returned.

    Parameters
    ----------
    X_train, y_train
        Training features and binary labels; passed to ``fit`` unchanged.
    X_val
        Rows to score. Only ``X_val.shape[0]`` is read directly here.
    y_val : optional
        Labels for ``X_val``. When given, ``(X_val, y_val)`` is used as the
        early-stopping set (AUC, patience 100) and one progress line per bag
        is printed: bag index, AUC of the running average, AUC of this bag.
        Note the printed AUC is then measured on the same rows that chose
        the stopping point. When ``None``, every model trains for the full
        ``n_estimators`` and nothing is printed.
    n_bags : int, default 10
        Number of models to average.

    Returns
    -------
    numpy.ndarray
        One averaged positive-class probability per row of ``X_val``.

    Raises
    ------
    ValueError
        If ``n_bags`` is smaller than 1.
    """
    if n_bags < 1:
        raise ValueError("n_bags must be a positive integer")

    blended = np.zeros(X_val.shape[0])

    # Early stopping needs labelled validation rows to evaluate against.
    # Without y_val there is nothing to monitor, so the eval-related
    # arguments are left out instead of handing LightGBM a label-less set.
    fit_kwargs = {'verbose': False}
    if y_val is not None:
        fit_kwargs.update(
            eval_set=(X_val, y_val),
            eval_metric='auc',
            early_stopping_rounds=100,
        )

    for bg in tqdm(range(n_bags)):
        # One seed per bag drives both the hyper-parameter jitter and the
        # model's own randomness, so a rerun reproduces the same ensemble.
        seed = bg + 1
        rng = np.random.RandomState(seed)

        model = lgb.LGBMClassifier(
            learning_rate=0.05,
            max_depth=15 + int(rng.randint(0, 10)),
            num_leaves=20 + int(rng.randint(0, 10)),
            n_estimators=1000,
            objective='binary',
            n_jobs=12,
            random_state=seed,
        )
        model.fit(X_train, y_train, **fit_kwargs)

        bag_pred = model.predict_proba(X_val)[:, 1]
        blended += bag_pred

        if y_val is not None:
            print(bg, roc_auc_score(y_val, blended / (bg + 1.)), roc_auc_score(y_val, bag_pred))

    blended /= n_bags
    return blended

In [3]:
# Tag handed to load_dm() together with the city name. Presumably it selects
# which generation of engineered features is loaded -- TODO confirm in utils.
FEATURE_GENERATION = '5gen'

In [4]:
def prediction_for_one_city(city=None):
    """Train LightGBM for one city and write its test predictions to CSV.

    Steps:

    1. Load the data manager with ``load_dm(FEATURE_GENERATION, city)``.
    2. Hold out the training rows whose ``hours_since`` lies above the 85th
       percentile (the latest ~15%) and use them for early stopping to find
       the number of boosting rounds.
    3. Retrain on all training rows with that number of rounds.
    4. Predict ``X_test`` and save a copy of ``test_block_ids`` with an added
       ``"prediction"`` column to
       ``./intermediate_data/lightgbm_<city>.csv``.

    Parameters
    ----------
    city : str, optional
        City key passed to ``load_dm`` and used in the output file name.
        When omitted, the module-level ``CITY`` is used, so existing
        ``CITY = ...; prediction_for_one_city()`` cells keep working.

    Returns
    -------
    The frame that was written to disk (a copy of ``dm.test_block_ids`` plus
    the ``"prediction"`` column). As the last expression of a notebook cell
    it is rendered as a table.
    """
    if city is None:
        city = CITY  # fall back to the notebook-level global

    dm = load_dm(FEATURE_GENERATION, city)
    X, X_test, y = dm.X_train, dm.X_test, dm.y_train
    block_ids, test_block_ids = dm.train_block_ids, dm.test_block_ids

    # Time-based split: leave the last 15% (by hours_since) for validation.
    cutoff = np.percentile(block_ids['hours_since'], 85)
    in_train = block_ids['hours_since'] <= cutoff
    X_train, y_train = X[in_train], y[in_train]
    X_val, y_val = X[~in_train], y[~in_train]

    # Pass 1: find the number of trees with early stopping on the hold-out.
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    param = {'metric': 'auc', 'num_trees': 1000, 'objective': 'binary', 'learning_rate': 0.02}
    bst = lgb.train(param, train_data, valid_sets=val_data, early_stopping_rounds=100)

    # best_iteration is only meaningful when early stopping recorded one;
    # guard against an unset / non-positive value so the refit never asks
    # for zero trees (which values signal "unset" depends on the LightGBM
    # version -- hence the broad check).
    n_trees = bst.best_iteration
    if not n_trees or n_trees < 1:
        n_trees = 1000

    # Pass 2: refit on every labelled row with the selected tree count.
    full_data = lgb.Dataset(X, label=y)
    param = {'metric': 'auc', 'num_trees': n_trees, 'objective': 'binary', 'learning_rate': 0.02}
    bst = lgb.train(param, full_data)

    y_pred = bst.predict(X_test)

    predictions = test_block_ids.copy()
    predictions["prediction"] = y_pred

    # Make sure the target directory exists on a fresh checkout.
    os.makedirs("./intermediate_data", exist_ok=True)
    predictions.to_csv("./intermediate_data/lightgbm_{}.csv".format(city))
    return predictions

In [5]:
# prediction_for_one_city() reads the module-level CITY, so it has to be
# assigned before the call. This cell handles 'msk'.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt --
# rerun it to completion before relying on lightgbm_msk.csv.
CITY = 'msk'
prediction_for_one_city()



[1]	valid_0's auc: 0.513457
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.513622
[3]	valid_0's auc: 0.51497
[4]	valid_0's auc: 0.514719
[5]	valid_0's auc: 0.515842
[6]	valid_0's auc: 0.515445
[7]	valid_0's auc: 0.515544
[8]	valid_0's auc: 0.516275
[9]	valid_0's auc: 0.515625
[10]	valid_0's auc: 0.5155
[11]	valid_0's auc: 0.51571
[12]	valid_0's auc: 0.516276
[13]	valid_0's auc: 0.515794
[14]	valid_0's auc: 0.530603
[15]	valid_0's auc: 0.529786
[16]	valid_0's auc: 0.623317
[17]	valid_0's auc: 0.623409
[18]	valid_0's auc: 0.6143
[19]	valid_0's auc: 0.613254
[20]	valid_0's auc: 0.615644
[21]	valid_0's auc: 0.615969
[22]	valid_0's auc: 0.619206
[23]	valid_0's auc: 0.676299
[24]	valid_0's auc: 0.698415
[25]	valid_0's auc: 0.690419
[26]	valid_0's auc: 0.705787
[27]	valid_0's auc: 0.722677
[28]	valid_0's auc: 0.729835
[29]	valid_0's auc: 0.728376
[30]	valid_0's auc: 0.729615
[31]	valid_0's auc: 0.742824
[32]	valid_0's auc: 0.751069
[33]	valid_0's auc: 0.74

KeyboardInterrupt: 

In [None]:
# Same procedure for 'spb': set the global that prediction_for_one_city()
# reads, then train and write ./intermediate_data/lightgbm_spb.csv.
CITY = 'spb'
prediction_for_one_city()

In [None]:
# Same procedure for 'kazan': set the global that prediction_for_one_city()
# reads, then train and write ./intermediate_data/lightgbm_kazan.csv.
CITY = 'kazan'
prediction_for_one_city()

In [None]:
# Stack the per-city prediction files into one frame. The first CSV column is
# the index that to_csv() saved and is dropped again by ignore_index=True.
predictions = pd.concat(
    [pd.read_csv(fname, index_col=0) for fname in ("./intermediate_data/lightgbm_kazan.csv",
                                                   "./intermediate_data/lightgbm_spb.csv",
                                                   "./intermediate_data/lightgbm_msk.csv")],
    ignore_index=True
)
blocks = pd.read_csv("./data/raw/data/hackathon_tosubmit.tsv", sep='\t')
assert len(predictions) == len(blocks), "Predictions don't match blocks. Sumbit at your own risk."

# Attach a prediction to every block that has to be submitted.
merged = pd.merge(blocks, predictions, how='left', on=["sq_x", "sq_y", "hour_hash"])
# A left merge multiplies rows when the right side repeats a key; catch that
# here instead of writing a submission with the wrong number of lines.
assert len(merged) == len(blocks), "Merge changed the row count: duplicate (sq_x, sq_y, hour_hash) keys in predictions."
# isnull() also works if the column is not a float dtype, unlike np.isnan.
assert not merged.prediction.isnull().any(), "some predictions are missing. Sumbit at your own risk."

In [None]:
# Write the submission file: just the id and prediction columns, with no
# header row and no index column.
merged.to_csv(
    "lightgbm_submission.csv",
    columns=['id', 'prediction'],
    sep=',',
    index=False,
    header=False,
)

In [None]:
# Shell out to `head` to eyeball the first lines of lightgbm_submission.csv.
!head lightgbm_submission.csv