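Use regression: predict `x_exit` and `y_exit` with two XGBoost regressors, then check whether the predicted exit point falls inside the target rectangle.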

In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches
import seaborn as sns
%matplotlib inline

## 1. Preprocessing

In [2]:
# Load the training trajectories from the CSV in the working directory.
data = pd.read_csv(filepath_or_buffer="data_train.csv")

In [3]:
# Parse the HH:MM:SS clock strings into datetimes.  Only a time of day is
# supplied, so the parsed values carry a placeholder date.
for time_col in ('time_entry', 'time_exit'):
    data[time_col] = pd.to_datetime(data[time_col], format='%H:%M:%S')

In [4]:
# Keep only trajectories whose exit hour is 15 or 16 (`between` is inclusive
# on both ends).
# .copy() detaches the filtered frame from the raw frame, so the feature
# columns assigned to `data` in later cells do not raise
# SettingWithCopyWarning.
data = data[data['time_exit'].dt.hour.between(15, 16)].copy()

In [5]:
# Show the first five rows of the filtered training frame.
data.iloc[:5]

Unnamed: 0.1,Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit
5,5,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_5,1900-01-01 15:02:31,1900-01-01 15:18:33,,,,3744945.0,-19281830.0,3744785.0,-19281480.0
9,9,0000cf177130469eeac79f67b6bcf3df_9,traj_0000cf177130469eeac79f67b6bcf3df_9_3,1900-01-01 15:00:32,1900-01-01 15:29:48,1.149404,1.149404,1.149404,3749088.0,-19266050.0,3749610.0,-19265940.0
11,11,0001f97b99a80f18f62e2d44e54ef33d_3,traj_0001f97b99a80f18f62e2d44e54ef33d_3_1,1900-01-01 14:34:35,1900-01-01 15:19:51,30.167742,30.167742,30.167742,3758738.0,-19375940.0,3769687.0,-19142580.0
20,20,0002124248b0ca510dea42824723ccac_31,traj_0002124248b0ca510dea42824723ccac_31_10,1900-01-01 15:28:54,1900-01-01 15:28:54,,,,3767866.0,-19177970.0,3767866.0,-19177970.0
28,28,000219c2a6380c307e8bffd85b5e404b_23,traj_000219c2a6380c307e8bffd85b5e404b_23_16,1900-01-01 15:08:05,1900-01-01 15:08:05,,,,3747641.0,-19226950.0,3747641.0,-19226950.0


## 1.2 Time Features

In [6]:
# Trip duration in whole seconds: exit time minus entry time.
elapsed = data['time_exit'] - data['time_entry']
data['duration'] = elapsed.dt.total_seconds().astype(int)

In [7]:
# Split both timestamps into hour / minute / second feature columns.
# Loop order keeps the column order: entry_* first, then exit_*.
for endpoint in ('entry', 'exit'):
    stamp = data['time_' + endpoint].dt
    data[endpoint + '_hour'] = stamp.hour
    data[endpoint + '_minute'] = stamp.minute
    data[endpoint + '_second'] = stamp.second

## 1.4 Train/Test Split

In [65]:
# Boolean label: True when the exit point lies inside the target rectangle
# (all four bounds inclusive).
exit_x, exit_y = data['x_exit'], data['y_exit']
Y = (
    (exit_x >= 3750901.5068) & (exit_x <= 3770901.5068)
    & (exit_y >= -19268905.6133) & (exit_y <= -19208905.6133)
)

In [8]:
# Regression targets: exit x-coordinate (Y1) and exit y-coordinate (Y2).
Y1, Y2 = data['x_exit'], data['y_exit']

In [9]:
# Feature matrix: everything except identifiers, raw timestamps, the exit
# coordinates (the targets) and the velocity columns.
non_feature_cols = [
    'Unnamed: 0', 'hash', 'trajectory_id', 'time_entry', 'time_exit',
    'x_exit', 'y_exit', 'vmax', 'vmin', 'vmean',
]
X = data.drop(columns=non_feature_cols)


In [10]:
# Show the first five rows of the feature matrix.
X.iloc[:5]

Unnamed: 0,x_entry,y_entry,duration,entry_hour,entry_minute,entry_second,exit_hour,exit_minute,exit_second
5,3744945.0,-19281830.0,962,15,2,31,15,18,33
9,3749088.0,-19266050.0,1756,15,0,32,15,29,48
11,3758738.0,-19375940.0,2716,14,34,35,15,19,51
20,3767866.0,-19177970.0,0,15,28,54,15,28,54
28,3747641.0,-19226950.0,0,15,8,5,15,8,5


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
train_x, test_x, train_y1, test_y1 = train_test_split(
    X, Y1, random_state=42, test_size=0.2
)

In [41]:
# Reuse the row labels chosen by the x-coordinate split so the y-coordinate
# target lines up with the same train / validation rows.
train_y2 = Y2.loc[train_y1.index]
test_y2 = Y2.loc[test_y1.index]

In [66]:
# Boolean in-rectangle labels for the same train / validation rows.
train_y = Y.loc[train_y1.index]
test_y = Y.loc[test_y1.index]

In [42]:
# Sanity check: shapes of the split pieces.
tuple(part.shape for part in (train_x, train_y1, train_y2, test_y1, test_y2))

((107250, 9), (107250,), (107250,), (26813,), (26813,))

In [68]:
# Sanity check: shapes of the boolean label splits.
tuple(part.shape for part in (train_y, test_y))

((107250,), (26813,))

# Model Building

In [50]:
# Regressor for the exit x-coordinate.  n_estimators is a generous upper
# bound; the fit call that follows uses early stopping.
# Recorded result for this configuration: 379 rounds (3618.77 rmse).
models1 = xgb.XGBRegressor(
    n_estimators=20000,
    learning_rate=0.03,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=420,
    n_jobs=-1,
)
# Alternative configuration kept for reference (no rounds / rmse recorded):
# models1 = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=20000,
#                            subsample=0.8, colsample_bytree=0.9, reg_alpha=65, seed=420, n_jobs=-1)

In [51]:
# Fit on the training split, reporting RMSE on both splits each round.
# Training stops once the metric on the last eval_set entry (the held-out
# split) has not improved for 100 rounds.
models1.fit(
    train_x,
    train_y1,
    eval_set=[(train_x, train_y1), (test_x, test_y1)],
    early_stopping_rounds=100,
)


[0]	validation_0-rmse:3.64766e+06	validation_1-rmse:3.64759e+06
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:3.53823e+06	validation_1-rmse:3.53816e+06
[2]	validation_0-rmse:3.43208e+06	validation_1-rmse:3.43202e+06
[3]	validation_0-rmse:3.32912e+06	validation_1-rmse:3.32906e+06
[4]	validation_0-rmse:3.22925e+06	validation_1-rmse:3.22919e+06
[5]	validation_0-rmse:3.13238e+06	validation_1-rmse:3.13231e+06
[6]	validation_0-rmse:3.03841e+06	validation_1-rmse:3.03834e+06
[7]	validation_0-rmse:2.94726e+06	validation_1-rmse:2.94719e+06
[8]	validation_0-rmse:2.85884e+06	validation_1-rmse:2.85877e+06
[9]	validation_0-rmse:2.77308e+06	validation_1-rmse:2.77301e+06
[10]	validation_0-rmse:2.68989e+06	validation_1-rmse:2.68982e+06
[11]	validation_0-rmse:2.60919e+06	validation_1-rmse:2.60913e+06
[12]	validation_0-rmse:2.53092e+06	validation_1-rmse:2.53085e+06
[13]	

[138]	validation_0-rmse:54669.8	validation_1-rmse:54648
[139]	validation_0-rmse:53037.9	validation_1-rmse:53016.2
[140]	validation_0-rmse:51454.7	validation_1-rmse:51433.1
[141]	validation_0-rmse:49919.5	validation_1-rmse:49897.8
[142]	validation_0-rmse:48430.6	validation_1-rmse:48409
[143]	validation_0-rmse:46986.6	validation_1-rmse:46965
[144]	validation_0-rmse:45586.2	validation_1-rmse:45564.6
[145]	validation_0-rmse:44228.5	validation_1-rmse:44206.9
[146]	validation_0-rmse:42911.4	validation_1-rmse:42889.8
[147]	validation_0-rmse:41634.2	validation_1-rmse:41612.6
[148]	validation_0-rmse:40395.5	validation_1-rmse:40374
[149]	validation_0-rmse:39194.6	validation_1-rmse:39173.1
[150]	validation_0-rmse:38029.7	validation_1-rmse:38008
[151]	validation_0-rmse:36900.5	validation_1-rmse:36879
[152]	validation_0-rmse:35805.2	validation_1-rmse:35783.7
[153]	validation_0-rmse:34742.9	validation_1-rmse:34721.4
[154]	validation_0-rmse:33713.6	validation_1-rmse:33691.9
[155]	validation_0-rmse:32

[281]	validation_0-rmse:3216.15	validation_1-rmse:3684.56
[282]	validation_0-rmse:3208.01	validation_1-rmse:3680.47
[283]	validation_0-rmse:3200.6	validation_1-rmse:3676.92
[284]	validation_0-rmse:3189.9	validation_1-rmse:3673.59
[285]	validation_0-rmse:3183.96	validation_1-rmse:3670.08
[286]	validation_0-rmse:3177.69	validation_1-rmse:3667.02
[287]	validation_0-rmse:3170.26	validation_1-rmse:3664.43
[288]	validation_0-rmse:3165.77	validation_1-rmse:3661.59
[289]	validation_0-rmse:3160.96	validation_1-rmse:3659.02
[290]	validation_0-rmse:3156.34	validation_1-rmse:3656.63
[291]	validation_0-rmse:3150.76	validation_1-rmse:3654.4
[292]	validation_0-rmse:3143.02	validation_1-rmse:3651.58
[293]	validation_0-rmse:3136.94	validation_1-rmse:3649.31
[294]	validation_0-rmse:3133.76	validation_1-rmse:3647.26
[295]	validation_0-rmse:3130.41	validation_1-rmse:3645.54
[296]	validation_0-rmse:3123.9	validation_1-rmse:3643.89
[297]	validation_0-rmse:3120.15	validation_1-rmse:3642.58
[298]	validation_0

[423]	validation_0-rmse:2838.92	validation_1-rmse:3622.57
[424]	validation_0-rmse:2836.63	validation_1-rmse:3623.2
[425]	validation_0-rmse:2834.82	validation_1-rmse:3623.52
[426]	validation_0-rmse:2832.77	validation_1-rmse:3623.34
[427]	validation_0-rmse:2830.06	validation_1-rmse:3623.44
[428]	validation_0-rmse:2828.09	validation_1-rmse:3623.54
[429]	validation_0-rmse:2827.38	validation_1-rmse:3623.76
[430]	validation_0-rmse:2825.86	validation_1-rmse:3623.72
[431]	validation_0-rmse:2823.56	validation_1-rmse:3623.45
[432]	validation_0-rmse:2819.77	validation_1-rmse:3623.19
[433]	validation_0-rmse:2817.05	validation_1-rmse:3623.24
[434]	validation_0-rmse:2815.23	validation_1-rmse:3623.66
[435]	validation_0-rmse:2814.98	validation_1-rmse:3623.68
[436]	validation_0-rmse:2813.52	validation_1-rmse:3623.73
[437]	validation_0-rmse:2810.46	validation_1-rmse:3624.24
[438]	validation_0-rmse:2807.75	validation_1-rmse:3624.03
[439]	validation_0-rmse:2806.04	validation_1-rmse:3623.99
[440]	validatio

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, importance_type='gain',
       learning_rate=0.03, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=20000, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.8)

In [28]:
# Pair each feature name with its importance, most important first.
importance_pairs = list(zip(train_x.columns.values, models1.feature_importances_))
importance_pairs.sort(key=lambda pair: pair[1], reverse=True)
importance_pairs

[('x_entry', 0.6336658),
 ('duration', 0.098733544),
 ('y_entry', 0.050405383),
 ('exit_minute', 0.04401304),
 ('entry_hour', 0.043865886),
 ('exit_second', 0.042689987),
 ('entry_second', 0.03996288),
 ('entry_minute', 0.039428256),
 ('exit_hour', 0.0072352383)]

In [56]:
# Regressor for the exit y-coordinate, same hyper-parameters as models1.
# Recorded result for this configuration: 356 rounds (35361.8 rmse).
models2 = xgb.XGBRegressor(
    n_estimators=20000,
    learning_rate=0.03,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=420,
    n_jobs=-1,
)
# Alternative configuration kept for reference: 227 rounds (35389.1 rmse)
# models2 = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=20000,
#                            subsample=0.8, colsample_bytree=0.9, reg_alpha=65, seed=420, n_jobs=-1)

In [57]:
# Fit on the training split, reporting RMSE on both splits each round.
# Training stops once the metric on the last eval_set entry (the held-out
# split) has not improved for 100 rounds.
models2.fit(
    train_x,
    train_y2,
    eval_set=[(train_x, train_y2), (test_x, test_y2)],
    early_stopping_rounds=100,
)


[0]	validation_0-rmse:1.8645e+07	validation_1-rmse:1.86453e+07
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:1.80856e+07	validation_1-rmse:1.8086e+07
[2]	validation_0-rmse:1.75431e+07	validation_1-rmse:1.75434e+07
[3]	validation_0-rmse:1.70168e+07	validation_1-rmse:1.70171e+07
[4]	validation_0-rmse:1.65063e+07	validation_1-rmse:1.65066e+07
[5]	validation_0-rmse:1.60112e+07	validation_1-rmse:1.60115e+07
[6]	validation_0-rmse:1.55308e+07	validation_1-rmse:1.55312e+07
[7]	validation_0-rmse:1.50649e+07	validation_1-rmse:1.50652e+07
[8]	validation_0-rmse:1.4613e+07	validation_1-rmse:1.46133e+07
[9]	validation_0-rmse:1.41746e+07	validation_1-rmse:1.41749e+07
[10]	validation_0-rmse:1.37494e+07	validation_1-rmse:1.37497e+07
[11]	validation_0-rmse:1.33369e+07	validation_1-rmse:1.33373e+07
[12]	validation_0-rmse:1.29369e+07	validation_1-rmse:1.29372e+07
[13]	val

[129]	validation_0-rmse:368480	validation_1-rmse:368837
[130]	validation_0-rmse:357536	validation_1-rmse:357893
[131]	validation_0-rmse:346924	validation_1-rmse:347281
[132]	validation_0-rmse:336632	validation_1-rmse:336989
[133]	validation_0-rmse:326657	validation_1-rmse:327012
[134]	validation_0-rmse:316984	validation_1-rmse:317338
[135]	validation_0-rmse:307605	validation_1-rmse:307958
[136]	validation_0-rmse:298505	validation_1-rmse:298857
[137]	validation_0-rmse:289683	validation_1-rmse:290033
[138]	validation_0-rmse:281128	validation_1-rmse:281483
[139]	validation_0-rmse:272835	validation_1-rmse:273192
[140]	validation_0-rmse:264795	validation_1-rmse:265155
[141]	validation_0-rmse:257000	validation_1-rmse:257365
[142]	validation_0-rmse:249446	validation_1-rmse:249814
[143]	validation_0-rmse:242119	validation_1-rmse:242492
[144]	validation_0-rmse:235020	validation_1-rmse:235393
[145]	validation_0-rmse:228133	validation_1-rmse:228511
[146]	validation_0-rmse:221463	validation_1-rmse

[273]	validation_0-rmse:31548.5	validation_1-rmse:35702.8
[274]	validation_0-rmse:31525.2	validation_1-rmse:35684.4
[275]	validation_0-rmse:31503.1	validation_1-rmse:35666.8
[276]	validation_0-rmse:31465.7	validation_1-rmse:35652.8
[277]	validation_0-rmse:31420	validation_1-rmse:35635.5
[278]	validation_0-rmse:31380.7	validation_1-rmse:35619.8
[279]	validation_0-rmse:31353.4	validation_1-rmse:35603.1
[280]	validation_0-rmse:31299.5	validation_1-rmse:35586.9
[281]	validation_0-rmse:31264.7	validation_1-rmse:35577.7
[282]	validation_0-rmse:31230.6	validation_1-rmse:35565.6
[283]	validation_0-rmse:31185.3	validation_1-rmse:35547.5
[284]	validation_0-rmse:31137.1	validation_1-rmse:35532
[285]	validation_0-rmse:31117.9	validation_1-rmse:35521.2
[286]	validation_0-rmse:31095.4	validation_1-rmse:35512.1
[287]	validation_0-rmse:31075.9	validation_1-rmse:35503.1
[288]	validation_0-rmse:31015.7	validation_1-rmse:35503.2
[289]	validation_0-rmse:30996.5	validation_1-rmse:35492.9
[290]	validation_0

[416]	validation_0-rmse:28364.9	validation_1-rmse:35439.6
[417]	validation_0-rmse:28353.3	validation_1-rmse:35439
[418]	validation_0-rmse:28335.1	validation_1-rmse:35440.7
[419]	validation_0-rmse:28312	validation_1-rmse:35445.2
[420]	validation_0-rmse:28296.1	validation_1-rmse:35446.3
[421]	validation_0-rmse:28286.9	validation_1-rmse:35445.8
[422]	validation_0-rmse:28275.8	validation_1-rmse:35446.3
[423]	validation_0-rmse:28245.9	validation_1-rmse:35448.2
[424]	validation_0-rmse:28211.9	validation_1-rmse:35448.9
[425]	validation_0-rmse:28204.1	validation_1-rmse:35451.6
[426]	validation_0-rmse:28189	validation_1-rmse:35451.8
[427]	validation_0-rmse:28166.9	validation_1-rmse:35456.4
[428]	validation_0-rmse:28151.2	validation_1-rmse:35459.5
[429]	validation_0-rmse:28117	validation_1-rmse:35462.8
[430]	validation_0-rmse:28095.2	validation_1-rmse:35468.1
[431]	validation_0-rmse:28065.7	validation_1-rmse:35476.6
[432]	validation_0-rmse:28036.4	validation_1-rmse:35477.2
[433]	validation_0-rms

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, importance_type='gain',
       learning_rate=0.03, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=20000, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.8)

In [58]:
# Pair each feature name with its importance, most important first.
importance_pairs = list(zip(train_x.columns.values, models2.feature_importances_))
importance_pairs.sort(key=lambda pair: pair[1], reverse=True)
importance_pairs

[('y_entry', 0.662989),
 ('duration', 0.088250965),
 ('exit_minute', 0.04352817),
 ('exit_second', 0.0426593),
 ('entry_second', 0.041350894),
 ('x_entry', 0.040939607),
 ('entry_minute', 0.040323544),
 ('entry_hour', 0.033521783),
 ('exit_hour', 0.006436655)]

In [73]:
# Predicted exit coordinates (x, then y) for the held-out split.
pred_y1, pred_y2 = models1.predict(test_x), models2.predict(test_x)

In [80]:
# Turn the two coordinate predictions into one boolean per row: True when the
# predicted point lies inside the target rectangle (bounds inclusive).
# The result is a plain list, so it carries no pandas index.
pred_x, pred_ycoord = pd.Series(pred_y1), pd.Series(pred_y2)
inside_box = (
    (pred_x >= 3750901.5068) & (pred_x <= 3770901.5068)
    & (pred_ycoord >= -19268905.6133) & (pred_ycoord <= -19208905.6133)
)
pred_y = inside_box.tolist()
          

In [82]:
# Fraction of held-out rows whose predicted in-rectangle flag equals the true
# label.  pred_y is a list, so the comparison is element by element.
(test_y == pred_y).sum() / test_y.count()

0.8959832916868683

In [84]:
# Refit the x-coordinate regressor on every training row with a fixed number
# of rounds (379) and no early stopping.  eval_set is the training data
# itself, so the logged RMSE is training error.
models1 = xgb.XGBRegressor(
    n_estimators=379,
    learning_rate=0.03,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=420,
    n_jobs=-1,
)
models1.fit(X, Y1, eval_set=[(X, Y1)])

[0]	validation_0-rmse:3.64764e+06
[1]	validation_0-rmse:3.53821e+06
[2]	validation_0-rmse:3.43207e+06
[3]	validation_0-rmse:3.32911e+06
[4]	validation_0-rmse:3.22924e+06
[5]	validation_0-rmse:3.13236e+06
[6]	validation_0-rmse:3.03839e+06
[7]	validation_0-rmse:2.94724e+06
[8]	validation_0-rmse:2.85883e+06
[9]	validation_0-rmse:2.77306e+06
[10]	validation_0-rmse:2.68987e+06
[11]	validation_0-rmse:2.60918e+06
[12]	validation_0-rmse:2.53091e+06
[13]	validation_0-rmse:2.45498e+06
[14]	validation_0-rmse:2.38133e+06
[15]	validation_0-rmse:2.30989e+06
[16]	validation_0-rmse:2.2406e+06
[17]	validation_0-rmse:2.17338e+06
[18]	validation_0-rmse:2.10818e+06
[19]	validation_0-rmse:2.04494e+06
[20]	validation_0-rmse:1.98359e+06
[21]	validation_0-rmse:1.92409e+06
[22]	validation_0-rmse:1.86637e+06
[23]	validation_0-rmse:1.81038e+06
[24]	validation_0-rmse:1.75607e+06
[25]	validation_0-rmse:1.70339e+06
[26]	validation_0-rmse:1.65229e+06
[27]	validation_0-rmse:1.60272e+06
[28]	validation_0-rmse:1.55464e

[257]	validation_0-rmse:3563.63
[258]	validation_0-rmse:3541.14
[259]	validation_0-rmse:3521.57
[260]	validation_0-rmse:3503.6
[261]	validation_0-rmse:3484.05
[262]	validation_0-rmse:3465.4
[263]	validation_0-rmse:3450.46
[264]	validation_0-rmse:3436.67
[265]	validation_0-rmse:3420.11
[266]	validation_0-rmse:3405.04
[267]	validation_0-rmse:3390.09
[268]	validation_0-rmse:3377.77
[269]	validation_0-rmse:3364.32
[270]	validation_0-rmse:3353.93
[271]	validation_0-rmse:3341.4
[272]	validation_0-rmse:3330.75
[273]	validation_0-rmse:3319.64
[274]	validation_0-rmse:3310.69
[275]	validation_0-rmse:3301.61
[276]	validation_0-rmse:3292.55
[277]	validation_0-rmse:3285.58
[278]	validation_0-rmse:3277.28
[279]	validation_0-rmse:3267.63
[280]	validation_0-rmse:3260
[281]	validation_0-rmse:3250.81
[282]	validation_0-rmse:3243.54
[283]	validation_0-rmse:3234.78
[284]	validation_0-rmse:3229.4
[285]	validation_0-rmse:3222.23
[286]	validation_0-rmse:3217.15
[287]	validation_0-rmse:3212.58
[288]	validatio

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, importance_type='gain',
       learning_rate=0.03, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=379, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.8)

In [85]:
# Refit the y-coordinate regressor on every training row with a fixed number
# of rounds (356) and no early stopping.  eval_set is the training data
# itself, so the logged RMSE is training error.
models2 = xgb.XGBRegressor(
    n_estimators=356,
    learning_rate=0.03,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=420,
    n_jobs=-1,
)
models2.fit(X, Y2, eval_set=[(X, Y2)])

[0]	validation_0-rmse:1.8645e+07
[1]	validation_0-rmse:1.80857e+07
[2]	validation_0-rmse:1.75432e+07
[3]	validation_0-rmse:1.70169e+07
[4]	validation_0-rmse:1.65064e+07
[5]	validation_0-rmse:1.60112e+07
[6]	validation_0-rmse:1.55309e+07
[7]	validation_0-rmse:1.5065e+07
[8]	validation_0-rmse:1.4613e+07
[9]	validation_0-rmse:1.41747e+07
[10]	validation_0-rmse:1.37494e+07
[11]	validation_0-rmse:1.3337e+07
[12]	validation_0-rmse:1.29369e+07
[13]	validation_0-rmse:1.25488e+07
[14]	validation_0-rmse:1.21723e+07
[15]	validation_0-rmse:1.18072e+07
[16]	validation_0-rmse:1.1453e+07
[17]	validation_0-rmse:1.11094e+07
[18]	validation_0-rmse:1.07761e+07
[19]	validation_0-rmse:1.04529e+07
[20]	validation_0-rmse:1.01393e+07
[21]	validation_0-rmse:9.83513e+06
[22]	validation_0-rmse:9.5401e+06
[23]	validation_0-rmse:9.2539e+06
[24]	validation_0-rmse:8.9763e+06
[25]	validation_0-rmse:8.70703e+06
[26]	validation_0-rmse:8.44583e+06
[27]	validation_0-rmse:8.19247e+06
[28]	validation_0-rmse:7.94671e+06
[29

[251]	validation_0-rmse:33078.9
[252]	validation_0-rmse:32968.6
[253]	validation_0-rmse:32884.6
[254]	validation_0-rmse:32805.9
[255]	validation_0-rmse:32713.3
[256]	validation_0-rmse:32650.7
[257]	validation_0-rmse:32589.5
[258]	validation_0-rmse:32517.5
[259]	validation_0-rmse:32450.6
[260]	validation_0-rmse:32378.7
[261]	validation_0-rmse:32318.8
[262]	validation_0-rmse:32247.6
[263]	validation_0-rmse:32196.6
[264]	validation_0-rmse:32142.1
[265]	validation_0-rmse:32102.6
[266]	validation_0-rmse:32037.3
[267]	validation_0-rmse:31986.7
[268]	validation_0-rmse:31940.6
[269]	validation_0-rmse:31901.3
[270]	validation_0-rmse:31860.6
[271]	validation_0-rmse:31818.8
[272]	validation_0-rmse:31784.2
[273]	validation_0-rmse:31752.4
[274]	validation_0-rmse:31727.6
[275]	validation_0-rmse:31684
[276]	validation_0-rmse:31657.2
[277]	validation_0-rmse:31615.6
[278]	validation_0-rmse:31591.1
[279]	validation_0-rmse:31535.8
[280]	validation_0-rmse:31502.7
[281]	validation_0-rmse:31478.3
[282]	vali

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, importance_type='gain',
       learning_rate=0.03, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=356, n_jobs=-1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=420, silent=True,
       subsample=0.8)

# Make Prediction

In [86]:
# Load the competition test trajectories from the CSV in the working directory.
test = pd.read_csv(filepath_or_buffer="data_test.csv")

In [87]:
# Parse the HH:MM:SS clock strings into datetimes, mirroring the training
# preprocessing.
for time_col in ('time_entry', 'time_exit'):
    test[time_col] = pd.to_datetime(test[time_col], format='%H:%M:%S')

In [88]:
# Trip duration in whole seconds: exit time minus entry time.
elapsed = test['time_exit'] - test['time_entry']
test['duration'] = elapsed.dt.total_seconds().astype(int)

In [89]:
# Split both timestamps into hour / minute / second feature columns.
# Loop order keeps the column order: entry_* first, then exit_*.
for endpoint in ('entry', 'exit'):
    stamp = test['time_' + endpoint].dt
    test[endpoint + '_hour'] = stamp.hour
    test[endpoint + '_minute'] = stamp.minute
    test[endpoint + '_second'] = stamp.second

In [90]:
# Rows to score: trajectories whose exit hour is 15 or 16 (inclusive).
test2 = test.loc[test['time_exit'].dt.hour.between(15, 16)]

In [91]:
# Non-null count per column of the rows to be scored.
test2.count(axis=0)

Unnamed: 0       33515
hash             33515
trajectory_id    33515
time_entry       33515
time_exit        33515
vmax                 0
vmin                 0
vmean                0
x_entry          33515
y_entry          33515
x_exit               0
y_exit               0
duration         33515
entry_hour       33515
entry_minute     33515
entry_second     33515
exit_hour        33515
exit_minute      33515
exit_second      33515
dtype: int64

In [128]:
# Number of test rows per device (`hash`).
# GroupBy.size() replaces the hand-written loop over groups; it yields the
# groups in the same key order as iterating the GroupBy object.
devices = test.groupby('hash')
rows_per_device = devices.size()
# .values drops the index / name so both results are plain, default-indexed,
# unnamed Series, exactly as when they were built from Python lists.
tname   = pd.Series(rows_per_device.index.values)
tlength = pd.Series(rows_per_device.values)

In [92]:
# Test feature matrix: drop the same non-feature columns that were removed
# from the training frame.
non_feature_cols = [
    'Unnamed: 0', 'hash', 'trajectory_id', 'time_entry', 'time_exit',
    'x_exit', 'y_exit', 'vmax', 'vmin', 'vmean',
]
tX = test2.drop(columns=non_feature_cols)


In [93]:
# Sanity check: shapes of the train and test feature matrices.
tuple(frame.shape for frame in (X, tX))

((134063, 9), (33515, 9))

In [96]:
# Predict exit coordinates for the test rows, then flag the rows whose
# predicted point lies inside the target rectangle (bounds inclusive).
pred_y1, pred_y2 = models1.predict(tX), models2.predict(tX)
pred_x, pred_ycoord = pd.Series(pred_y1), pd.Series(pred_y2)
pred_y = (
    (pred_x >= 3750901.5068) & (pred_x <= 3770901.5068)
    & (pred_ycoord >= -19268905.6133) & (pred_ycoord <= -19208905.6133)
)

In [97]:
# Number of rows predicted inside the rectangle, and the prediction shape.
n_inside = pred_y.sum()
print(n_inside, pred_y.shape)

10728 (33515,)


In [101]:
# Build the submission: one row per scored trajectory, with the boolean
# prediction written as 0/1.
submission_ids = test2['trajectory_id'].tolist()
submission_targets = pred_y.astype(int).tolist()
output = pd.DataFrame({'id': submission_ids, 'target': submission_targets})
output.to_csv('submission5.csv', columns=['id', 'target'], index=False)