In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches
import seaborn as sns
%matplotlib inline

## 1. Preprocessing

In [2]:
data = pd.read_csv("data_train.csv")

In [3]:
data['time_entry'] = pd.to_datetime(data['time_entry'], format='%H:%M:%S')
data['time_exit']  = pd.to_datetime(data['time_exit'], format='%H:%M:%S')

In [4]:
data['duration'] = (data['time_exit'] - data['time_entry']).dt.total_seconds().astype(int)

In [5]:
data = data[data['time_exit'].dt.hour.between(15,16)]

In [6]:
data[:5]

Unnamed: 0.1,Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,duration
5,5,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_5,1900-01-01 15:02:31,1900-01-01 15:18:33,,,,3744945.0,-19281830.0,3744785.0,-19281480.0,962
9,9,0000cf177130469eeac79f67b6bcf3df_9,traj_0000cf177130469eeac79f67b6bcf3df_9_3,1900-01-01 15:00:32,1900-01-01 15:29:48,1.149404,1.149404,1.149404,3749088.0,-19266050.0,3749610.0,-19265940.0,1756
11,11,0001f97b99a80f18f62e2d44e54ef33d_3,traj_0001f97b99a80f18f62e2d44e54ef33d_3_1,1900-01-01 14:34:35,1900-01-01 15:19:51,30.167742,30.167742,30.167742,3758738.0,-19375940.0,3769687.0,-19142580.0,2716
20,20,0002124248b0ca510dea42824723ccac_31,traj_0002124248b0ca510dea42824723ccac_31_10,1900-01-01 15:28:54,1900-01-01 15:28:54,,,,3767866.0,-19177970.0,3767866.0,-19177970.0,0
28,28,000219c2a6380c307e8bffd85b5e404b_23,traj_000219c2a6380c307e8bffd85b5e404b_23_16,1900-01-01 15:08:05,1900-01-01 15:08:05,,,,3747641.0,-19226950.0,3747641.0,-19226950.0,0


In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
# might neet larger range(15) for test data
entryHEnc = OneHotEncoder(sparse=False, dtype=bool)
hour = entryHEnc.fit_transform(data['time_entry'].dt.hour.values.reshape(-1,1))
data = pd.concat([data, pd.DataFrame(hour, columns=['hour_'+str(entryHEnc.active_features_[n]) for n in range(hour.shape[1])], 
                                      index=data.index)], axis=1)
entryMEnc = OneHotEncoder(sparse=False, dtype=bool)
minute = entryMEnc.fit_transform(data['time_entry'].dt.minute.values.reshape(-1,1))
data = pd.concat([data, pd.DataFrame(minute, columns=['minute_'+str(entryMEnc.active_features_[n]) for n in range(minute.shape[1])], 
                                      index=data.index)], axis=1)

In [9]:
exitHEnc = OneHotEncoder(sparse=False, dtype=bool)
ehour = exitHEnc.fit_transform(data['time_exit'].dt.hour.values.reshape(-1,1))
data = pd.concat([data, pd.DataFrame(ehour, columns=['ehour_'+str(exitHEnc.active_features_[n]) for n in range(ehour.shape[1])], 
                                      index=data.index)], axis=1)
exitMEnc = OneHotEncoder(sparse=False, dtype=bool)
eminute = exitMEnc.fit_transform(data['time_exit'].dt.minute.values.reshape(-1,1))
data = pd.concat([data, pd.DataFrame(eminute, columns=['eminute_'+str(exitMEnc.active_features_[n]) for n in range(minute.shape[1])], 
                                      index=data.index)], axis=1)

In [10]:
Y = data['x_exit'].between(3750901.5068, 3770901.5068) & data['y_exit'].between((-19268905.6133), -19208905.6133)

In [11]:
X = data.drop(['Unnamed: 0', 'hash', 'trajectory_id', 'time_entry', 'time_exit', 'x_exit', 'y_exit'], axis=1)


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.2, random_state=42)

In [14]:
train_x.shape

(107250, 135)

# Model Building

In [15]:
models = xgb.XGBClassifier(max_depth=10, learning_rate=0.03, n_estimators=20000, n_jobs=-1,
                           subsample=0.8, colsample_bytree=0.8, seed=420)

In [16]:
models.fit(train_x, train_y, eval_metric='auc', early_stopping_rounds=100,
           eval_set=[(train_x, train_y), (test_x, test_y)])


[0]	validation_0-auc:0.538435	validation_1-auc:0.52464
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.831303	validation_1-auc:0.808397
[2]	validation_0-auc:0.96329	validation_1-auc:0.951391
[3]	validation_0-auc:0.970477	validation_1-auc:0.960032
[4]	validation_0-auc:0.964767	validation_1-auc:0.951535
[5]	validation_0-auc:0.959134	validation_1-auc:0.945946
[6]	validation_0-auc:0.967465	validation_1-auc:0.954902
[7]	validation_0-auc:0.972002	validation_1-auc:0.959983
[8]	validation_0-auc:0.974976	validation_1-auc:0.963218
[9]	validation_0-auc:0.972362	validation_1-auc:0.95954
[10]	validation_0-auc:0.974974	validation_1-auc:0.96258
[11]	validation_0-auc:0.976819	validation_1-auc:0.964774
[12]	validation_0-auc:0.978118	validation_1-auc:0.966575
[13]	validation_0-auc:0.978476	validation_1-auc:0.96726
[14]	validation_0-auc:0.979291	validation_1-auc:0.968248
[1

[141]	validation_0-auc:0.988712	validation_1-auc:0.978684
[142]	validation_0-auc:0.988746	validation_1-auc:0.978694
[143]	validation_0-auc:0.988766	validation_1-auc:0.978699
[144]	validation_0-auc:0.988785	validation_1-auc:0.978706
[145]	validation_0-auc:0.988799	validation_1-auc:0.978706
[146]	validation_0-auc:0.988829	validation_1-auc:0.978729
[147]	validation_0-auc:0.988863	validation_1-auc:0.97875
[148]	validation_0-auc:0.988888	validation_1-auc:0.978772
[149]	validation_0-auc:0.988919	validation_1-auc:0.978786
[150]	validation_0-auc:0.988942	validation_1-auc:0.978816
[151]	validation_0-auc:0.988967	validation_1-auc:0.978807
[152]	validation_0-auc:0.989	validation_1-auc:0.978836
[153]	validation_0-auc:0.989033	validation_1-auc:0.978854
[154]	validation_0-auc:0.989055	validation_1-auc:0.978866
[155]	validation_0-auc:0.989069	validation_1-auc:0.97886
[156]	validation_0-auc:0.989084	validation_1-auc:0.978863
[157]	validation_0-auc:0.989101	validation_1-auc:0.978862
[158]	validation_0-

[283]	validation_0-auc:0.99133	validation_1-auc:0.979282
[284]	validation_0-auc:0.991352	validation_1-auc:0.979289
[285]	validation_0-auc:0.991362	validation_1-auc:0.979289
[286]	validation_0-auc:0.991386	validation_1-auc:0.97929
[287]	validation_0-auc:0.991396	validation_1-auc:0.979288
[288]	validation_0-auc:0.991403	validation_1-auc:0.979289
[289]	validation_0-auc:0.991412	validation_1-auc:0.979288
[290]	validation_0-auc:0.991417	validation_1-auc:0.979291
[291]	validation_0-auc:0.991432	validation_1-auc:0.979289
[292]	validation_0-auc:0.991446	validation_1-auc:0.97929
[293]	validation_0-auc:0.991457	validation_1-auc:0.979289
[294]	validation_0-auc:0.991471	validation_1-auc:0.979289
[295]	validation_0-auc:0.991488	validation_1-auc:0.979288
[296]	validation_0-auc:0.991491	validation_1-auc:0.979285
[297]	validation_0-auc:0.991504	validation_1-auc:0.979285
[298]	validation_0-auc:0.991517	validation_1-auc:0.979281
[299]	validation_0-auc:0.991531	validation_1-auc:0.979284
[300]	validation_

[425]	validation_0-auc:0.993138	validation_1-auc:0.979239
[426]	validation_0-auc:0.993146	validation_1-auc:0.97924
Stopping. Best iteration:
[326]	validation_0-auc:0.991927	validation_1-auc:0.979322



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=20000,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8)

In [17]:
sorted(zip(train_x.columns.values, models.feature_importances_), key=lambda x: x[1], reverse=True)

[('y_entry', 0.09221833),
 ('hour_15', 0.048976768),
 ('x_entry', 0.03863641),
 ('duration', 0.028115653),
 ('hour_14', 0.0093169585),
 ('minute_13', 0.008715211),
 ('minute_6', 0.008353286),
 ('eminute_54', 0.008014485),
 ('eminute_11', 0.008009848),
 ('minute_35', 0.00793254),
 ('minute_37', 0.007853216),
 ('eminute_48', 0.007819084),
 ('eminute_55', 0.007811645),
 ('minute_15', 0.0077561117),
 ('eminute_25', 0.007724879),
 ('vmean', 0.0074633094),
 ('eminute_0', 0.0074485424),
 ('eminute_26', 0.0074340864),
 ('minute_53', 0.007339699),
 ('minute_49', 0.007315526),
 ('eminute_53', 0.0072835307),
 ('eminute_23', 0.0072233994),
 ('minute_12', 0.007126608),
 ('eminute_42', 0.007089619),
 ('minute_59', 0.0070661516),
 ('eminute_16', 0.007033363),
 ('eminute_29', 0.006918468),
 ('eminute_21', 0.006905095),
 ('eminute_6', 0.006903897),
 ('minute_9', 0.0068666134),
 ('minute_46', 0.0068558753),
 ('eminute_18', 0.006852153),
 ('minute_24', 0.0068235043),
 ('minute_31', 0.0068089776),
 ('minu

In [18]:
pred_y = models.predict(test_x)

  if diff:


In [19]:
(pred_y==test_y).sum()/test_y.count()

0.9357028307164436

In [34]:
models2 = xgb.XGBClassifier(max_depth=10, learning_rate=0.03, n_estimators=326, n_jobs=-1,
                           subsample=0.8, colsample_bytree=0.8, seed=420)

In [35]:
models2.fit(X, Y, eval_metric='auc', eval_set=[(X, Y)])


[0]	validation_0-auc:0.979616
[1]	validation_0-auc:0.980679
[2]	validation_0-auc:0.981397
[3]	validation_0-auc:0.981705
[4]	validation_0-auc:0.981989
[5]	validation_0-auc:0.980222
[6]	validation_0-auc:0.980309
[7]	validation_0-auc:0.980898
[8]	validation_0-auc:0.981305
[9]	validation_0-auc:0.981443
[10]	validation_0-auc:0.981857
[11]	validation_0-auc:0.982085
[12]	validation_0-auc:0.982331
[13]	validation_0-auc:0.982668
[14]	validation_0-auc:0.982848
[15]	validation_0-auc:0.982996
[16]	validation_0-auc:0.983065
[17]	validation_0-auc:0.983021
[18]	validation_0-auc:0.9832
[19]	validation_0-auc:0.983333
[20]	validation_0-auc:0.983448
[21]	validation_0-auc:0.983561
[22]	validation_0-auc:0.983565
[23]	validation_0-auc:0.983673
[24]	validation_0-auc:0.983588
[25]	validation_0-auc:0.983441
[26]	validation_0-auc:0.983428
[27]	validation_0-auc:0.983558
[28]	validation_0-auc:0.983371
[29]	validation_0-auc:0.983582
[30]	validation_0-auc:0.983408
[31]	validation_0-auc:0.983592
[32]	validation_0-au

[261]	validation_0-auc:0.990048
[262]	validation_0-auc:0.990063
[263]	validation_0-auc:0.990076
[264]	validation_0-auc:0.990086
[265]	validation_0-auc:0.990101
[266]	validation_0-auc:0.990106
[267]	validation_0-auc:0.99012
[268]	validation_0-auc:0.990136
[269]	validation_0-auc:0.990168
[270]	validation_0-auc:0.990182
[271]	validation_0-auc:0.990197
[272]	validation_0-auc:0.990205
[273]	validation_0-auc:0.990216
[274]	validation_0-auc:0.990227
[275]	validation_0-auc:0.990241
[276]	validation_0-auc:0.99025
[277]	validation_0-auc:0.990267
[278]	validation_0-auc:0.990277
[279]	validation_0-auc:0.990288
[280]	validation_0-auc:0.990305
[281]	validation_0-auc:0.990312
[282]	validation_0-auc:0.990323
[283]	validation_0-auc:0.990336
[284]	validation_0-auc:0.990353
[285]	validation_0-auc:0.990361
[286]	validation_0-auc:0.990385
[287]	validation_0-auc:0.990403
[288]	validation_0-auc:0.990409
[289]	validation_0-auc:0.990415
[290]	validation_0-auc:0.990425
[291]	validation_0-auc:0.990436
[292]	vali

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=326,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8)

# Make Prediction

In [36]:
test = pd.read_csv("data_test.csv")

In [37]:
test['time_entry'] = pd.to_datetime(test['time_entry'], format='%H:%M:%S')
test['time_exit']  = pd.to_datetime(test['time_exit'], format='%H:%M:%S')

In [38]:
test['duration'] = (test['time_exit'] - test['time_entry']).dt.total_seconds().astype(int)

In [39]:
test2 = test[test['time_exit'].dt.hour.between(15,16)]

In [40]:
test2.count()

Unnamed: 0       33515
hash             33515
trajectory_id    33515
time_entry       33515
time_exit        33515
vmax                 0
vmin                 0
vmean                0
x_entry          33515
y_entry          33515
x_exit               0
y_exit               0
duration         33515
dtype: int64

In [128]:
devices = test.groupby('hash')
tname   = []
tlength = []
for name, group in devices:
    tname.append(name)
    tlength.append(group.shape[0])
tname   = pd.Series(tname)
tlength = pd.Series(tlength)

In [41]:
thour = entryHEnc.transform(test2['time_entry'].dt.hour.values.reshape(-1,1))
test2 = pd.concat([test2, pd.DataFrame(thour, columns=['hour_'+str(entryHEnc.active_features_[n]) for n in range(thour.shape[1])], 
                                      index=test2.index)], axis=1)

tminute = entryMEnc.transform(test2['time_entry'].dt.minute.values.reshape(-1,1))
test2 = pd.concat([test2, pd.DataFrame(tminute, columns=['minute_'+str(entryMEnc.active_features_[n]) for n in range(minute.shape[1])], 
                                      index=test2.index)], axis=1)

In [42]:
tehour = exitHEnc.transform(test2['time_exit'].dt.hour.values.reshape(-1,1))
test2 = pd.concat([test2, pd.DataFrame(tehour, columns=['ehour_'+str(exitHEnc.active_features_[n]) for n in range(tehour.shape[1])], 
                                      index=test2.index)], axis=1)

teminute = exitMEnc.transform(test2['time_exit'].dt.minute.values.reshape(-1,1))
test2 = pd.concat([test2, pd.DataFrame(teminute, columns=['eminute_'+str(exitMEnc.active_features_[n]) for n in range(teminute.shape[1])], 
                                      index=test2.index)], axis=1)

In [43]:
tX = test2.drop(['Unnamed: 0', 'hash', 'trajectory_id', 'time_entry', 'time_exit', 'x_exit', 'y_exit'], axis=1)


In [47]:
pred_y = models2.predict(tX)

  if diff:


In [48]:
print(pred_y.sum(), pred_y.shape)

8199 (33515,)


In [49]:
output = pd.DataFrame({'id': test2['trajectory_id'], 'target': pred_y.astype(int)})
output.to_csv('submission3.csv', index=False, columns=['id', 'target'])