In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib.path import Path
import matplotlib.patches as patches
import seaborn as sns
from tqdm import tqdm
%matplotlib inline

# 1. Preprocess

In [10]:
# Load the raw training table from disk.
data = pd.read_csv(filepath_or_buffer="data/raw/data_train.csv")

In [11]:
# Parse the HH:MM:SS strings into datetimes; the format carries no date part,
# so pandas anchors every value on 1900-01-01 (visible in the preview below).
for time_col in ('time_entry', 'time_exit'):
    data[time_col] = pd.to_datetime(data[time_col], format='%H:%M:%S')

In [12]:
# Peek at the first five rows.
data.head(5)

Unnamed: 0.1,Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit
0,0,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_0,1900-01-01 07:04:31,1900-01-01 07:08:32,,,,3751014.0,-19093980.0,3750326.0,-19136340.0
1,1,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_1,1900-01-01 07:20:34,1900-01-01 07:25:42,,,,3743937.0,-19322470.0,3744975.0,-19319660.0
2,2,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_2,1900-01-01 07:53:32,1900-01-01 08:03:25,,,,3744868.0,-19293560.0,3744816.0,-19292840.0
3,3,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_3,1900-01-01 08:17:50,1900-01-01 08:37:23,,,,3744880.0,-19292290.0,3744809.0,-19290490.0
4,4,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_4,1900-01-01 14:38:09,1900-01-01 14:38:09,,,,3744909.0,-19285580.0,3744909.0,-19285580.0


## 1.2 Time Features

In [13]:
# Length of each trajectory in whole seconds (exit time minus entry time).
elapsed = data['time_exit'] - data['time_entry']
data['duration'] = elapsed.dt.total_seconds().astype(int)

In [14]:
# Split both timestamps into hour / minute / second columns.
# Column creation order matches the explicit version: entry_* first, then exit_*.
for side in ('entry', 'exit'):
    stamp = data['time_' + side]
    data[side + '_hour'] = stamp.dt.hour
    data[side + '_minute'] = stamp.dt.minute
    data[side + '_second'] = stamp.dt.second

In [15]:
# Number of columns per trajectory row (sets the width of one slot when flattening).
data.shape[1]

19

In [16]:
# Column labels after feature engineering.
data.columns

Index(['Unnamed: 0', 'hash', 'trajectory_id', 'time_entry', 'time_exit',
       'vmax', 'vmin', 'vmean', 'x_entry', 'y_entry', 'x_exit', 'y_exit',
       'duration', 'entry_hour', 'entry_minute', 'entry_second', 'exit_hour',
       'exit_minute', 'exit_second'],
      dtype='object')

In [17]:
# First five rows, now including the derived time features.
data.head(5)

Unnamed: 0.1,Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,duration,entry_hour,entry_minute,entry_second,exit_hour,exit_minute,exit_second
0,0,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_0,1900-01-01 07:04:31,1900-01-01 07:08:32,,,,3751014.0,-19093980.0,3750326.0,-19136340.0,241,7,4,31,7,8,32
1,1,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_1,1900-01-01 07:20:34,1900-01-01 07:25:42,,,,3743937.0,-19322470.0,3744975.0,-19319660.0,308,7,20,34,7,25,42
2,2,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_2,1900-01-01 07:53:32,1900-01-01 08:03:25,,,,3744868.0,-19293560.0,3744816.0,-19292840.0,593,7,53,32,8,3,25
3,3,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_3,1900-01-01 08:17:50,1900-01-01 08:37:23,,,,3744880.0,-19292290.0,3744809.0,-19290490.0,1173,8,17,50,8,37,23
4,4,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_4,1900-01-01 14:38:09,1900-01-01 14:38:09,,,,3744909.0,-19285580.0,3744909.0,-19285580.0,0,14,38,9,14,38,9


## 1.3 Concat Time Series

Concatenate each device's trajectories into a single row (20 trajectory slots per device, left-padded with NaN when a device has fewer trajectories).

In [11]:
# One group per device hash; each group holds that device's trajectories.
groups = data.groupby(by="hash")

In [13]:
# Flatten every device's trajectories into one fixed-width row:
# `max_traj` slots of `n_cols` values each, left-padded with NaN so that the
# device's last trajectory always occupies the final slot.
max_traj = 20                      # slots per row; must match the range(20) used for naming
n_cols = len(data.columns)         # width of one slot (loop-invariant, hoisted)
row_width = max_traj * n_cols

dependence = []
# The context manager closes the progress bar even if the loop raises.
with tqdm(total=len(groups)) as pbar:
    for name, g in groups:
        pbar.update(1)
        # Keep at most the last `max_traj` trajectories. Without this cap a
        # device with more rows would get a negative pad length (i.e. no
        # padding) and an over-long row, misaligning the wide table.
        flat = g.tail(max_traj).values.ravel()   # row-major: traj0 cols, traj1 cols, ...
        padding = np.full(row_width - flat.size, np.nan, dtype=object)
        dependence.append(pd.Series(np.concatenate([padding, flat])))

100%|██████████| 134063/134063 [04:34<00:00, 488.87it/s]


In [14]:
# Stack the per-device Series side by side, then flip so each device is one row.
data2 = pd.concat(dependence, axis=1).transpose()

In [None]:
# Name every wide column "<original column>_<slot>", slot-major to match the
# flattened layout (all columns of slot 0, then slot 1, ... slot 19).
wide_names = []
for slot in range(20):
    for col in data.columns:
        wide_names.append(col + "_" + str(slot))
data2.columns = wide_names

In [18]:
# First five rows of the wide table.
data2.head(5)

Unnamed: 0,Unnamed: 0_0,hash_0,trajectory_id_0,time_entry_0,time_exit_0,vmax_0,vmin_0,vmean_0,x_entry_0,y_entry_0,...,y_entry_19,x_exit_19,y_exit_19,duration_19,entry_hour_19,entry_minute_19,entry_second_19,exit_hour_19,exit_minute_19,exit_second_19
0,,,,,,,,,,,...,-19281800.0,3744790.0,-19281500.0,962,15,2,31,15,18,33
1,,,,,,,,,,,...,-19266100.0,3749610.0,-19265900.0,1756,15,0,32,15,29,48
2,,,,,,,,,,,...,-19375900.0,3769690.0,-19142600.0,2716,14,34,35,15,19,51
3,,,,,,,,,,,...,-19178000.0,3767870.0,-19178000.0,0,15,28,54,15,28,54
4,,,,,,,,,,,...,-19227000.0,3747640.0,-19227000.0,0,15,8,5,15,8,5


In [21]:
# Cache the wide table; the index is written too and comes back as the
# 'Unnamed: 0' column when the file is re-read.
data2.to_csv(path_or_buf='data_train_concat.csv')

In [20]:
# Reload the cached wide table instead of recomputing it.
# index_col=False keeps the saved index as an ordinary column.
data2 = pd.read_csv(filepath_or_buffer="data_train_concat.csv", index_col=False)

  interactivity=interactivity, compiler=compiler, result=result)


## 1.4 Train/Test Split

In [22]:
# Target: True when the last trajectory (slot 19) exits inside the rectangle
# given by the x and y bounds below (bounds inclusive, per Series.between).
x_inside = data2['x_exit_19'].between(3750901.5068, 3770901.5068)
y_inside = data2['y_exit_19'].between(-19268905.6133, -19208905.6133)
Y = x_inside & y_inside

In [26]:
# Build the feature matrix by dropping everything that is not a model input.
id_prefixes = ['Unnamed: 0_', 'hash_', 'trajectory_id_', 'time_entry_', 'time_exit_']
non_features = []
for slot in range(20):
    for prefix in id_prefixes:
        non_features.append(prefix + str(slot))   # identifier / raw-time columns of each slot
# Exit position of the last slot defines the target; its speed columns are dropped as well.
non_features = non_features + ['x_exit_19', 'y_exit_19', 'vmax_19', 'vmin_19', 'vmean_19']
non_features = non_features + ['Unnamed: 0']      # saved index column from the CSV round-trip
X = data2.drop(non_features, axis=1)
# NaN padding is left in place (a fillna(0) variant was tried and disabled):
# X = X.fillna(0)

In [27]:
from sklearn.model_selection import train_test_split

In [31]:
# First five rows of the feature matrix.
X.head(5)

Unnamed: 0,vmax_0,vmin_0,vmean_0,x_entry_0,y_entry_0,x_exit_0,y_exit_0,duration_0,entry_hour_0,entry_minute_0,...,exit_second_18,x_entry_19,y_entry_19,duration_19,entry_hour_19,entry_minute_19,entry_second_19,exit_hour_19,exit_minute_19,exit_second_19
0,,,,,,,,,,,...,9.0,3744945.0,-19281830.0,962,15,2,31,15,18,33
1,,,,,,,,,,,...,14.0,3749088.0,-19266050.0,1756,15,0,32,15,29,48
2,,,,,,,,,,,...,8.0,3758738.0,-19375940.0,2716,14,34,35,15,19,51
3,,,,,,,,,,,...,1.0,3767866.0,-19177970.0,0,15,28,54,15,28,54
4,,,,,,,,,,,...,54.0,3747641.0,-19226950.0,0,15,8,5,15,8,5


In [29]:
# Hold out 20% of the devices for validation; fixed seed for a repeatable split.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [30]:
# Shapes of the four split pieces, in the order they were unpacked.
tuple(part.shape for part in (train_x, test_x, train_y, test_y))

((107250, 275), (26813, 275), (107250,), (26813,))

# 2. Model Building

In [37]:
# Note from an earlier run: 488 rounds (0.982671) -- presumably the
# early-stopped round count and its validation AUC.
# n_estimators is set very high on purpose; fit() relies on early stopping.
models = xgb.XGBClassifier(
    n_estimators=20000,
    learning_rate=0.03,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=420,
    n_jobs=-1,
)
# Alternative configuration (shallower trees, L1 regularisation), kept for reference:
# models = xgb.XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=20000,
#                            subsample=0.8, colsample_bytree=0.9, reg_alpha=65, seed=420, n_jobs=-1)

In [38]:
# Track AUC on both splits; the last entry of the eval list drives early stopping.
# NOTE(review): the held-out split used for early stopping here is the same one
# scored for accuracy further down, so that accuracy is likely optimistic.
watchlist = [(train_x, train_y), (test_x, test_y)]
models.fit(
    train_x,
    train_y,
    eval_set=watchlist,
    eval_metric='auc',
    early_stopping_rounds=100,
)


[0]	validation_0-auc:0.981442	validation_1-auc:0.974972
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.981056	validation_1-auc:0.97317
[2]	validation_0-auc:0.983749	validation_1-auc:0.975911
[3]	validation_0-auc:0.985117	validation_1-auc:0.977088
[4]	validation_0-auc:0.985721	validation_1-auc:0.977639
[5]	validation_0-auc:0.986147	validation_1-auc:0.978158
[6]	validation_0-auc:0.986214	validation_1-auc:0.977672
[7]	validation_0-auc:0.986643	validation_1-auc:0.978128
[8]	validation_0-auc:0.986881	validation_1-auc:0.978554
[9]	validation_0-auc:0.986713	validation_1-auc:0.97786
[10]	validation_0-auc:0.987076	validation_1-auc:0.978197
[11]	validation_0-auc:0.987284	validation_1-auc:0.978506
[12]	validation_0-auc:0.987482	validation_1-auc:0.978766
[13]	validation_0-auc:0.987722	validation_1-auc:0.979002
[14]	validation_0-auc:0.987666	validation_1-auc:0.978589

[141]	validation_0-auc:0.995257	validation_1-auc:0.981956
[142]	validation_0-auc:0.995286	validation_1-auc:0.981971
[143]	validation_0-auc:0.995315	validation_1-auc:0.981981
[144]	validation_0-auc:0.995346	validation_1-auc:0.981999
[145]	validation_0-auc:0.995372	validation_1-auc:0.982018
[146]	validation_0-auc:0.995415	validation_1-auc:0.982027
[147]	validation_0-auc:0.995442	validation_1-auc:0.982026
[148]	validation_0-auc:0.995481	validation_1-auc:0.982053
[149]	validation_0-auc:0.995506	validation_1-auc:0.982083
[150]	validation_0-auc:0.99553	validation_1-auc:0.982084
[151]	validation_0-auc:0.995554	validation_1-auc:0.982095
[152]	validation_0-auc:0.995592	validation_1-auc:0.982097
[153]	validation_0-auc:0.995626	validation_1-auc:0.982091
[154]	validation_0-auc:0.995658	validation_1-auc:0.982092
[155]	validation_0-auc:0.995681	validation_1-auc:0.982086
[156]	validation_0-auc:0.995706	validation_1-auc:0.98211
[157]	validation_0-auc:0.995738	validation_1-auc:0.982102
[158]	validation

[283]	validation_0-auc:0.997916	validation_1-auc:0.982485
[284]	validation_0-auc:0.997923	validation_1-auc:0.982483
[285]	validation_0-auc:0.99794	validation_1-auc:0.982488
[286]	validation_0-auc:0.997955	validation_1-auc:0.982487
[287]	validation_0-auc:0.997972	validation_1-auc:0.982485
[288]	validation_0-auc:0.997991	validation_1-auc:0.982483
[289]	validation_0-auc:0.998019	validation_1-auc:0.982485
[290]	validation_0-auc:0.998038	validation_1-auc:0.982491
[291]	validation_0-auc:0.998042	validation_1-auc:0.982493
[292]	validation_0-auc:0.998051	validation_1-auc:0.982493
[293]	validation_0-auc:0.998057	validation_1-auc:0.982493
[294]	validation_0-auc:0.998076	validation_1-auc:0.982492
[295]	validation_0-auc:0.998092	validation_1-auc:0.982496
[296]	validation_0-auc:0.998097	validation_1-auc:0.982502
[297]	validation_0-auc:0.998109	validation_1-auc:0.9825
[298]	validation_0-auc:0.998118	validation_1-auc:0.982505
[299]	validation_0-auc:0.998125	validation_1-auc:0.982503
[300]	validation_

[425]	validation_0-auc:0.999208	validation_1-auc:0.982596
[426]	validation_0-auc:0.99921	validation_1-auc:0.982596
[427]	validation_0-auc:0.999221	validation_1-auc:0.982597
[428]	validation_0-auc:0.999224	validation_1-auc:0.982598
[429]	validation_0-auc:0.999228	validation_1-auc:0.982596
[430]	validation_0-auc:0.99923	validation_1-auc:0.982598
[431]	validation_0-auc:0.999236	validation_1-auc:0.982606
[432]	validation_0-auc:0.999243	validation_1-auc:0.982607
[433]	validation_0-auc:0.99925	validation_1-auc:0.982611
[434]	validation_0-auc:0.999254	validation_1-auc:0.982614
[435]	validation_0-auc:0.999265	validation_1-auc:0.982615
[436]	validation_0-auc:0.999269	validation_1-auc:0.982613
[437]	validation_0-auc:0.999277	validation_1-auc:0.982612
[438]	validation_0-auc:0.999281	validation_1-auc:0.982613
[439]	validation_0-auc:0.999284	validation_1-auc:0.982612
[440]	validation_0-auc:0.999289	validation_1-auc:0.982614
[441]	validation_0-auc:0.999291	validation_1-auc:0.982611
[442]	validation_

[567]	validation_0-auc:0.999753	validation_1-auc:0.982639
[568]	validation_0-auc:0.999754	validation_1-auc:0.982642
[569]	validation_0-auc:0.999756	validation_1-auc:0.982647
[570]	validation_0-auc:0.999759	validation_1-auc:0.982642
[571]	validation_0-auc:0.999762	validation_1-auc:0.982637
[572]	validation_0-auc:0.999763	validation_1-auc:0.982639
[573]	validation_0-auc:0.999767	validation_1-auc:0.982636
[574]	validation_0-auc:0.99977	validation_1-auc:0.982636
[575]	validation_0-auc:0.999771	validation_1-auc:0.982638
[576]	validation_0-auc:0.999772	validation_1-auc:0.982636
[577]	validation_0-auc:0.999774	validation_1-auc:0.982634
[578]	validation_0-auc:0.999775	validation_1-auc:0.982633
[579]	validation_0-auc:0.999776	validation_1-auc:0.982634
[580]	validation_0-auc:0.999777	validation_1-auc:0.982634
[581]	validation_0-auc:0.999778	validation_1-auc:0.982631
[582]	validation_0-auc:0.999779	validation_1-auc:0.982629
[583]	validation_0-auc:0.999781	validation_1-auc:0.982628
[584]	validatio

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=20000,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8)

In [36]:
# Pair each feature name with its importance and list the most important first.
importance_pairs = list(zip(train_x.columns.values, models.feature_importances_))
importance_pairs.sort(key=lambda pair: pair[1], reverse=True)
importance_pairs

[('y_entry_19', 0.11827931),
 ('x_entry_19', 0.045527466),
 ('entry_hour_19', 0.039579768),
 ('y_exit_18', 0.028969163),
 ('duration_19', 0.026214631),
 ('x_exit_18', 0.012290163),
 ('y_entry_18', 0.01015353),
 ('entry_minute_19', 0.006046941),
 ('y_entry_17', 0.005836972),
 ('y_exit_17', 0.0054184766),
 ('x_exit_3', 0.004950177),
 ('x_entry_18', 0.0048198737),
 ('vmin_7', 0.004452624),
 ('y_entry_16', 0.0043693576),
 ('x_entry_17', 0.0043582167),
 ('y_entry_13', 0.0043359944),
 ('y_exit_3', 0.004331405),
 ('exit_minute_19', 0.0043258774),
 ('vmax_5', 0.004317841),
 ('y_entry_15', 0.004277213),
 ('vmin_6', 0.0041670823),
 ('exit_hour_10', 0.0040657567),
 ('y_entry_12', 0.0040165023),
 ('x_exit_17', 0.0039712465),
 ('entry_minute_5', 0.0039380398),
 ('x_entry_2', 0.0039190133),
 ('vmean_17', 0.0038993075),
 ('vmean_16', 0.0038910958),
 ('vmean_4', 0.0038870217),
 ('y_entry_10', 0.00381014),
 ('vmean_7', 0.003795267),
 ('vmax_13', 0.0037929155),
 ('x_entry_16', 0.0037673796),
 ('y_exit_1

In [39]:
# Predict labels for the held-out split with the early-stopped model.
pred_y = models.predict(test_x)

  if diff:


In [40]:
# Accuracy on the held-out split: correct predictions over non-null labels.
n_correct = (pred_y == test_y).sum()
n_correct / test_y.count()

0.9411479506209675

In [63]:
# Final model for the submission: same hyper-parameters as `models`, but with a
# fixed 488 boosting rounds (the round count noted for the early-stopped run)
# because the refit on all data has no held-out split to stop on.
models2 = xgb.XGBClassifier(
    n_estimators=488,
    learning_rate=0.03,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=420,
    n_jobs=-1,
)

In [64]:
# Refit on every labelled row; the eval set is the training data itself, so the
# logged AUC is a training metric only.
full_watchlist = [(X, Y)]
models2.fit(X, Y, eval_set=full_watchlist, eval_metric='auc')


[0]	validation_0-auc:0.982064
[1]	validation_0-auc:0.984131
[2]	validation_0-auc:0.983687
[3]	validation_0-auc:0.983352
[4]	validation_0-auc:0.982387
[5]	validation_0-auc:0.983898
[6]	validation_0-auc:0.984237
[7]	validation_0-auc:0.98484
[8]	validation_0-auc:0.984562
[9]	validation_0-auc:0.985143
[10]	validation_0-auc:0.985611
[11]	validation_0-auc:0.986041
[12]	validation_0-auc:0.986351
[13]	validation_0-auc:0.986667
[14]	validation_0-auc:0.986602
[15]	validation_0-auc:0.986897
[16]	validation_0-auc:0.987095
[17]	validation_0-auc:0.987283
[18]	validation_0-auc:0.987361
[19]	validation_0-auc:0.987561
[20]	validation_0-auc:0.98759
[21]	validation_0-auc:0.987479
[22]	validation_0-auc:0.987663
[23]	validation_0-auc:0.98753
[24]	validation_0-auc:0.987672
[25]	validation_0-auc:0.987734
[26]	validation_0-auc:0.987657
[27]	validation_0-auc:0.987835
[28]	validation_0-auc:0.988052
[29]	validation_0-auc:0.988209
[30]	validation_0-auc:0.988252
[31]	validation_0-auc:0.988393
[32]	validation_0-auc

[261]	validation_0-auc:0.997115
[262]	validation_0-auc:0.99713
[263]	validation_0-auc:0.997142
[264]	validation_0-auc:0.997155
[265]	validation_0-auc:0.99716
[266]	validation_0-auc:0.997169
[267]	validation_0-auc:0.997178
[268]	validation_0-auc:0.997188
[269]	validation_0-auc:0.997206
[270]	validation_0-auc:0.997218
[271]	validation_0-auc:0.997225
[272]	validation_0-auc:0.997239
[273]	validation_0-auc:0.997257
[274]	validation_0-auc:0.997266
[275]	validation_0-auc:0.997271
[276]	validation_0-auc:0.997278
[277]	validation_0-auc:0.997283
[278]	validation_0-auc:0.997285
[279]	validation_0-auc:0.997304
[280]	validation_0-auc:0.997322
[281]	validation_0-auc:0.997338
[282]	validation_0-auc:0.997348
[283]	validation_0-auc:0.997359
[284]	validation_0-auc:0.997367
[285]	validation_0-auc:0.997381
[286]	validation_0-auc:0.997413
[287]	validation_0-auc:0.997429
[288]	validation_0-auc:0.997434
[289]	validation_0-auc:0.997442
[290]	validation_0-auc:0.99746
[291]	validation_0-auc:0.997473
[292]	valid

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.03, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=488,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=420, silent=True, subsample=0.8)

In [65]:
# First five rows of the training features, for comparison with the test features.
X.head(5)

Unnamed: 0,vmax_0,vmin_0,vmean_0,x_entry_0,y_entry_0,x_exit_0,y_exit_0,duration_0,entry_hour_0,entry_minute_0,...,exit_second_18,x_entry_19,y_entry_19,duration_19,entry_hour_19,entry_minute_19,entry_second_19,exit_hour_19,exit_minute_19,exit_second_19
0,,,,,,,,,,,...,9.0,3744945.0,-19281830.0,962,15,2,31,15,18,33
1,,,,,,,,,,,...,14.0,3749088.0,-19266050.0,1756,15,0,32,15,29,48
2,,,,,,,,,,,...,8.0,3758738.0,-19375940.0,2716,14,34,35,15,19,51
3,,,,,,,,,,,...,1.0,3767866.0,-19177970.0,0,15,28,54,15,28,54
4,,,,,,,,,,,...,54.0,3747641.0,-19226950.0,0,15,8,5,15,8,5


# 3. Make Prediction

In [41]:
# Load the raw test table from disk.
test = pd.read_csv(filepath_or_buffer="data/raw/data_test.csv")

In [42]:
# Same time parsing as for the training data (HH:MM:SS strings -> datetimes).
for time_col in ('time_entry', 'time_exit'):
    test[time_col] = pd.to_datetime(test[time_col], format='%H:%M:%S')

In [43]:
# Length of each test trajectory in whole seconds.
test_elapsed = test['time_exit'] - test['time_entry']
test['duration'] = test_elapsed.dt.total_seconds().astype(int)

In [44]:
# Hour / minute / second columns for both timestamps, created in the same
# order as for the training frame (entry_* first, then exit_*).
for side in ('entry', 'exit'):
    stamp = test['time_' + side]
    test[side + '_hour'] = stamp.dt.hour
    test[side + '_minute'] = stamp.dt.minute
    test[side + '_second'] = stamp.dt.second

In [45]:
# One group per device hash in the test set.
tgroups = test.groupby(by="hash")

In [46]:
# Flatten every test device's trajectories into one fixed-width row:
# `max_traj` slots of `n_test_cols` values each, left-padded with NaN so that
# the device's last trajectory always occupies the final slot.
max_traj = 20                        # slots per row; must match the range(20) used for naming
n_test_cols = len(test.columns)      # width of one slot (loop-invariant, hoisted)
test_row_width = max_traj * n_test_cols

tdependence = []
# The context manager closes the progress bar even if the loop raises.
with tqdm(total=len(tgroups)) as pbar:
    for name, g in tgroups:
        pbar.update(1)
        # Keep at most the last `max_traj` trajectories. Without this cap a
        # device with more rows would get a negative pad length (i.e. no
        # padding) and an over-long row, misaligning the wide table.
        flat = g.tail(max_traj).values.ravel()   # row-major: traj0 cols, traj1 cols, ...
        padding = np.full(test_row_width - flat.size, np.nan, dtype=object)
        tdependence.append(pd.Series(np.concatenate([padding, flat])))

100%|██████████| 33515/33515 [01:08<00:00, 490.47it/s]


In [47]:
# Stack the per-device Series side by side, then flip so each device is one row.
test2 = pd.concat(tdependence, axis=1).transpose()

In [48]:
# Name every wide column "<original column>_<slot>", slot-major to match the
# flattened layout (all columns of slot 0, then slot 1, ... slot 19).
test_wide_names = []
for slot in range(20):
    for col in test.columns:
        test_wide_names.append(col + "_" + str(slot))
test2.columns = test_wide_names

In [49]:
# First five rows of the wide test table.
test2.head(5)

Unnamed: 0,Unnamed: 0_0,hash_0,trajectory_id_0,time_entry_0,time_exit_0,vmax_0,vmin_0,vmean_0,x_entry_0,y_entry_0,...,y_entry_19,x_exit_19,y_exit_19,duration_19,entry_hour_19,entry_minute_19,entry_second_19,exit_hour_19,exit_minute_19,exit_second_19
0,,,,,,,,,,,...,-19144900.0,,,420,15,3,32,15,10,32
1,,,,,,,,,,,...,-19341400.0,,,107,15,29,9,15,30,56
2,,,,,,,,,,,...,-19238600.0,,,0,15,26,8,15,26,8
3,,,,,,,,,,,...,-19355000.0,,,407,15,35,18,15,42,5
4,,,,,,,,,,,...,-19170100.0,,,667,14,54,7,15,5,14


In [50]:
# Cache the wide test table; the index is written too and comes back as the
# 'Unnamed: 0' column when the file is re-read.
test2.to_csv(path_or_buf='data/interim/data_test_concat.csv')

In [66]:
# Reload the cached wide test table; index_col=False keeps the saved index as
# an ordinary column.
test2 = pd.read_csv(filepath_or_buffer="data/interim/data_test_concat.csv", index_col=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [67]:
# (rows, columns) of the reloaded wide test table.
test2.shape

(33515, 381)

In [68]:
# Trajectory id of the last slot; used as the `id` column of the submission.
tname = test2.loc[:, 'trajectory_id_19']

In [69]:
# Build the test feature matrix with exactly the same column drops as for X.
test_id_prefixes = ['Unnamed: 0_', 'hash_', 'trajectory_id_', 'time_entry_', 'time_exit_']
test_non_features = []
for slot in range(20):
    for prefix in test_id_prefixes:
        test_non_features.append(prefix + str(slot))   # identifier / raw-time columns of each slot
# Last-slot exit position and speed columns are excluded, matching the training features.
test_non_features = test_non_features + ['x_exit_19', 'y_exit_19', 'vmax_19', 'vmin_19', 'vmean_19']
test_non_features = test_non_features + ['Unnamed: 0']  # saved index column from the CSV round-trip
tX = test2.drop(test_non_features, axis=1)
# NaN padding is left in place (a fillna(0) variant was tried on X and disabled):
# X = X.fillna(0)

In [70]:
# Sanity check: train and test feature matrices should have the same number of columns.
tuple(frame.shape for frame in (X, tX))

((134063, 275), (33515, 275))

In [71]:
# Predict labels for the test devices with the model refit on all training rows.
pred_y = models2.predict(tX)

  if diff:


In [72]:
# Number of positive predictions and the shape of the prediction array.
n_positive, pred_shape = pred_y.sum(), pred_y.shape
print(n_positive, pred_shape)

8358 (33515,)


In [73]:
# Assemble the submission (trajectory id + integer 0/1 target) and write it
# without the index, columns in the order id, target.
submission_columns = ['id', 'target']
output = pd.DataFrame({'id': tname, 'target': pred_y.astype(int)})
output.to_csv('data/submissions/submission6.csv', columns=submission_columns, index=False)