In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches
import seaborn as sns
%matplotlib inline

## 1. Preprocessing

In [3]:
# Load the training trajectories from the CSV file in the working directory.
data = pd.read_csv("data_train.csv")

In [4]:
# Parse the HH:MM:SS clock strings into datetimes so the `.dt` accessors can
# be used on them further down.
for time_col in ('time_entry', 'time_exit'):
    data[time_col] = pd.to_datetime(data[time_col], format='%H:%M:%S')

In [6]:
# Keep only the rows whose exit time falls in hour 15 or 16 (`between` is
# inclusive on both ends).
# `.copy()` makes the filtered result an independent frame: columns are added
# to `data` afterwards, and writing into the result of a boolean selection can
# raise pandas' SettingWithCopyWarning.
data = data[data['time_exit'].dt.hour.between(15, 16)].copy()

In [7]:
# Preview the first five rows of the filtered training frame.
data.head()

Unnamed: 0.1,Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,duration
5,5,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_5,1900-01-01 15:02:31,1900-01-01 15:18:33,,,,3744945.0,-19281830.0,3744785.0,-19281480.0,962
9,9,0000cf177130469eeac79f67b6bcf3df_9,traj_0000cf177130469eeac79f67b6bcf3df_9_3,1900-01-01 15:00:32,1900-01-01 15:29:48,1.149404,1.149404,1.149404,3749088.0,-19266050.0,3749610.0,-19265940.0,1756
11,11,0001f97b99a80f18f62e2d44e54ef33d_3,traj_0001f97b99a80f18f62e2d44e54ef33d_3_1,1900-01-01 14:34:35,1900-01-01 15:19:51,30.167742,30.167742,30.167742,3758738.0,-19375940.0,3769687.0,-19142580.0,2716
20,20,0002124248b0ca510dea42824723ccac_31,traj_0002124248b0ca510dea42824723ccac_31_10,1900-01-01 15:28:54,1900-01-01 15:28:54,,,,3767866.0,-19177970.0,3767866.0,-19177970.0,0
28,28,000219c2a6380c307e8bffd85b5e404b_23,traj_000219c2a6380c307e8bffd85b5e404b_23_16,1900-01-01 15:08:05,1900-01-01 15:08:05,,,,3747641.0,-19226950.0,3747641.0,-19226950.0,0


## 1.2 Time Features

In [5]:
# Trajectory duration in whole seconds (exit time minus entry time).
data['duration'] = (
    data['time_exit']
    .sub(data['time_entry'])
    .dt.total_seconds()
    .astype(int)
)

In [8]:
# Split both timestamps into hour / minute / second feature columns.
# The columns are created in the order entry_hour, entry_minute, entry_second,
# exit_hour, exit_minute, exit_second.
for prefix in ('entry', 'exit'):
    for unit in ('hour', 'minute', 'second'):
        data[prefix + '_' + unit] = getattr(data['time_' + prefix].dt, unit)

## 1.4 Train/Test Split

In [9]:
# Binary target: True when the exit point lies inside the axis-aligned
# rectangle given by the x and y bounds below (bounds are inclusive).
Y = (
    data['x_exit'].between(3750901.5068, 3770901.5068)
    & data['y_exit'].between(-19268905.6133, -19208905.6133)
)

In [12]:
# Feature matrix: everything except identifiers, the raw timestamps, the exit
# coordinates (the target is derived from them) and the velocity columns.
X = data.drop(
    columns=[
        'Unnamed: 0',
        'hash',
        'trajectory_id',
        'time_entry',
        'time_exit',
        'x_exit',
        'y_exit',
        'vmax',
        'vmin',
        'vmean',
    ]
)


In [1]:
# Preview the first five rows of the feature matrix.
X.head()

NameError: name 'X' is not defined

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Sanity check: (rows, columns) of the training split.
train_x.shape

(107250, 9)

# Model Building

In [27]:
# Exploratory classifier. n_estimators is only an upper bound here, because
# the fit call on this model uses early stopping.
# The original note for this configuration read "228 rounds" — presumably the
# number of boosting rounds an earlier run settled on; TODO confirm.
models = xgb.XGBClassifier(
    max_depth=10,
    learning_rate=0.03,
    n_estimators=20000,
    n_jobs=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=420,
)
# Alternative configuration kept for reference (originally annotated "3035",
# presumably its round count — TODO confirm):
# models = xgb.XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=20000,
#                            subsample=0.8, colsample_bytree=0.9, reg_alpha=65, seed=420, n_jobs=-1)

In [28]:
# Train with early stopping. AUC is reported each round for both the training
# split (validation_0) and the hold-out split (validation_1); training stops
# once the hold-out AUC has not improved for 100 rounds.
models.fit(
    train_x,
    train_y,
    eval_metric='auc',
    early_stopping_rounds=100,
    eval_set=[(train_x, train_y), (test_x, test_y)],
)


[0]	validation_0-auc:0.949732	validation_1-auc:0.94383
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.95157	validation_1-auc:0.945216
[2]	validation_0-auc:0.971434	validation_1-auc:0.965452
[3]	validation_0-auc:0.975089	validation_1-auc:0.969431
[4]	validation_0-auc:0.973711	validation_1-auc:0.967885
[5]	validation_0-auc:0.973943	validation_1-auc:0.965631
[6]	validation_0-auc:0.976369	validation_1-auc:0.96834
[7]	validation_0-auc:0.977861	validation_1-auc:0.969929
[8]	validation_0-auc:0.978966	validation_1-auc:0.971236
[9]	validation_0-auc:0.979822	validation_1-auc:0.972076
[10]	validation_0-auc:0.979724	validation_1-auc:0.970594
[11]	validation_0-auc:0.979238	validation_1-auc:0.970387
[12]	validation_0-auc:0.979047	validation_1-auc:0.968751
[13]	validation_0-auc:0.97876	validation_1-auc:0.96868
[14]	validation_0-auc:0.978402	validation_1-auc:0.96853
[15

[141]	validation_0-auc:0.990847	validation_1-auc:0.978325
[142]	validation_0-auc:0.990875	validation_1-auc:0.978358
[143]	validation_0-auc:0.990905	validation_1-auc:0.978387
[144]	validation_0-auc:0.990938	validation_1-auc:0.978396
[145]	validation_0-auc:0.990961	validation_1-auc:0.978417
[146]	validation_0-auc:0.99099	validation_1-auc:0.978428
[147]	validation_0-auc:0.991019	validation_1-auc:0.978468
[148]	validation_0-auc:0.991043	validation_1-auc:0.978487
[149]	validation_0-auc:0.991068	validation_1-auc:0.9785
[150]	validation_0-auc:0.991092	validation_1-auc:0.978514
[151]	validation_0-auc:0.991126	validation_1-auc:0.978539
[152]	validation_0-auc:0.991155	validation_1-auc:0.978563
[153]	validation_0-auc:0.991177	validation_1-auc:0.978584
[154]	validation_0-auc:0.991206	validation_1-auc:0.978583
[155]	validation_0-auc:0.991235	validation_1-auc:0.978581
[156]	validation_0-auc:0.991262	validation_1-auc:0.978595
[157]	validation_0-auc:0.991288	validation_1-auc:0.978617
[158]	validation_

[283]	validation_0-auc:0.993872	validation_1-auc:0.978893
[284]	validation_0-auc:0.993878	validation_1-auc:0.97889
[285]	validation_0-auc:0.993925	validation_1-auc:0.978883
[286]	validation_0-auc:0.993939	validation_1-auc:0.97889
[287]	validation_0-auc:0.993961	validation_1-auc:0.978893
[288]	validation_0-auc:0.993972	validation_1-auc:0.978886
[289]	validation_0-auc:0.994026	validation_1-auc:0.978894
[290]	validation_0-auc:0.994057	validation_1-auc:0.978892
[291]	validation_0-auc:0.994069	validation_1-auc:0.978893
[292]	validation_0-auc:0.994084	validation_1-auc:0.978887
[293]	validation_0-auc:0.994091	validation_1-auc:0.97889
[294]	validation_0-auc:0.994095	validation_1-auc:0.97889
[295]	validation_0-auc:0.994099	validation_1-auc:0.978888
[296]	validation_0-auc:0.994117	validation_1-auc:0.978889
[297]	validation_0-auc:0.994147	validation_1-auc:0.978891
[298]	validation_0-auc:0.994151	validation_1-auc:0.978893
[299]	validation_0-auc:0.994159	validation_1-auc:0.97889
[300]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=20000,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8)

In [29]:
# (feature name, importance) pairs, most important feature first.
sorted(
    zip(train_x.columns.values, models.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

[('y_entry', 0.42705464),
 ('entry_hour', 0.18417747),
 ('x_entry', 0.17137577),
 ('duration', 0.12009449),
 ('entry_minute', 0.026905913),
 ('exit_minute', 0.02566024),
 ('exit_second', 0.021583492),
 ('entry_second', 0.020886194),
 ('exit_hour', 0.0022618838)]

In [30]:
# Predicted labels for the hold-out split from the early-stopped model.
pred_y = models.predict(test_x)

  if diff:


In [31]:
# Hold-out accuracy: the share of rows where the prediction matches the label.
(pred_y == test_y).mean()

0.9360384887927498

In [32]:
# Final model: same hyper-parameters as `models`, but with a fixed budget of
# 228 boosting rounds instead of a large upper bound.
models2 = xgb.XGBClassifier(
    max_depth=10,
    learning_rate=0.03,
    n_estimators=228,
    n_jobs=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=420,
)

In [33]:
# Refit on every labelled row. The only eval set is the training data itself,
# so the AUC printed per round is a training score, not a validation score.
models2.fit(
    X,
    Y,
    eval_metric='auc',
    eval_set=[(X, Y)],
)


[0]	validation_0-auc:0.830886
[1]	validation_0-auc:0.95395
[2]	validation_0-auc:0.96865
[3]	validation_0-auc:0.962983
[4]	validation_0-auc:0.957715
[5]	validation_0-auc:0.966005
[6]	validation_0-auc:0.962969
[7]	validation_0-auc:0.968289
[8]	validation_0-auc:0.971821
[9]	validation_0-auc:0.974459
[10]	validation_0-auc:0.972739
[11]	validation_0-auc:0.974815
[12]	validation_0-auc:0.976323
[13]	validation_0-auc:0.97756
[14]	validation_0-auc:0.977959
[15]	validation_0-auc:0.976962
[16]	validation_0-auc:0.976022
[17]	validation_0-auc:0.977209
[18]	validation_0-auc:0.978276
[19]	validation_0-auc:0.97751
[20]	validation_0-auc:0.977965
[21]	validation_0-auc:0.978062
[22]	validation_0-auc:0.979016
[23]	validation_0-auc:0.979793
[24]	validation_0-auc:0.980425
[25]	validation_0-auc:0.981051
[26]	validation_0-auc:0.980573
[27]	validation_0-auc:0.981174
[28]	validation_0-auc:0.981362
[29]	validation_0-auc:0.981808
[30]	validation_0-auc:0.982255
[31]	validation_0-auc:0.98267
[32]	validation_0-auc:0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=228,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8)

# Make Prediction

In [36]:
# Load the test trajectories from the CSV file in the working directory.
test = pd.read_csv("data_test.csv")

In [37]:
# Parse the HH:MM:SS clock strings into datetimes, as was done for the
# training frame.
for time_col in ('time_entry', 'time_exit'):
    test[time_col] = pd.to_datetime(test[time_col], format='%H:%M:%S')

In [38]:
# Trajectory duration in whole seconds (exit time minus entry time).
test['duration'] = (
    test['time_exit']
    .sub(test['time_entry'])
    .dt.total_seconds()
    .astype(int)
)

In [44]:
# Split both timestamps into hour / minute / second feature columns.
# The columns are created in the order entry_hour, entry_minute, entry_second,
# exit_hour, exit_minute, exit_second — the same order as the training frame.
for prefix in ('entry', 'exit'):
    for unit in ('hour', 'minute', 'second'):
        test[prefix + '_' + unit] = getattr(test['time_' + prefix].dt, unit)

In [45]:
# Keep only the test rows whose exit time falls in hour 15 or 16 (inclusive),
# the same filter that was applied to the training frame.
test2 = test.loc[test['time_exit'].dt.hour.between(15, 16)]

In [40]:
# Non-null count per column of the filtered test rows.
test2.count()

Unnamed: 0       33515
hash             33515
trajectory_id    33515
time_entry       33515
time_exit        33515
vmax                 0
vmin                 0
vmean                0
x_entry          33515
y_entry          33515
x_exit               0
y_exit               0
duration         33515
dtype: int64

In [128]:
# Count how many trajectory rows each device (`hash`) has in the test set.
# `groupby(...).size()` gives the per-group row counts in sorted-key order,
# the same order in which iterating over the groups would visit them, without
# building every group frame in a Python loop.
devices = test.groupby('hash')
tlength = devices.size()
# Split the result into two positionally indexed Series: group keys and sizes.
tname   = pd.Series(tlength.index.values)
tlength = pd.Series(tlength.values)

In [46]:
# Test feature matrix: drops the same columns that were dropped to build `X`.
tX = test2.drop(
    columns=[
        'Unnamed: 0',
        'hash',
        'trajectory_id',
        'time_entry',
        'time_exit',
        'x_exit',
        'y_exit',
        'vmax',
        'vmin',
        'vmean',
    ]
)


In [47]:
# Sanity check: the train and test feature matrices should have the same number of columns.
(X.shape, tX.shape)

((134063, 9), (33515, 9))

In [48]:
# Predict on the filtered test rows with the final model.
# Note: this rebinds `pred_y`, which previously held the hold-out predictions from `models`.
pred_y = models2.predict(tX)

  if diff:


In [49]:
# Sum of the predictions (i.e. how many rows were predicted positive) and the shape of the prediction array.
print(pred_y.sum(), pred_y.shape)

8202 (33515,)


In [50]:
# Build the submission table (trajectory id plus integer-cast prediction) and
# write it to disk without the index column.
output = pd.DataFrame(
    {
        'id': test2['trajectory_id'],
        'target': pred_y.astype(int),
    }
)
output.to_csv('submission4.csv', index=False, columns=['id', 'target'])

In [52]:
# Inspect the distinct `vmean` values among the filtered test rows.
# NOTE(review): the saved output for this cell is a length-33515 Series with one
# NaN per row, which does not look like the result of `.unique()` — the output
# may be stale; re-run to confirm.
test2['vmean'].unique()

4        NaN
7        NaN
10       NaN
13       NaN
17       NaN
23       NaN
27       NaN
31       NaN
34       NaN
39       NaN
45       NaN
56       NaN
63       NaN
66       NaN
69       NaN
72       NaN
77       NaN
79       NaN
87       NaN
94       NaN
100      NaN
103      NaN
111      NaN
116      NaN
120      NaN
131      NaN
141      NaN
144      NaN
149      NaN
152      NaN
          ..
202758   NaN
202766   NaN
202771   NaN
202775   NaN
202783   NaN
202791   NaN
202794   NaN
202800   NaN
202803   NaN
202807   NaN
202810   NaN
202813   NaN
202818   NaN
202828   NaN
202831   NaN
202838   NaN
202843   NaN
202852   NaN
202859   NaN
202862   NaN
202866   NaN
202869   NaN
202877   NaN
202885   NaN
202889   NaN
202899   NaN
202913   NaN
202914   NaN
202929   NaN
202936   NaN
Name: vmean, Length: 33515, dtype: float64