In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Load the feature table from 'augmented.csv' and preview its first rows.
df = pd.read_csv('augmented.csv')
df.head()

Unnamed: 0,geohash6,day,timestamp,demand,lat,lat_err,long,long_err,h,m,dow,latlong_pca0,latlong_pca1,latlongerror_pca0,latlongerror_pca1,latlong_cluster,latlongerror_cluster,time_delta,time_delta_sin,hour_sin
0,212,18,20:0,0.020072,-5.353088,0.002747,90.653687,0.005493,20,0,4,45.348186,0.012163,-45.45809,-1.4e-05,3,1,4.833333,0.682671,0.25
1,212,19,5:30,0.512506,-5.353088,0.002747,90.653687,0.005493,5,30,5,45.348186,0.012163,-45.45809,-1.4e-05,3,1,5.229167,0.509349,0.37059
2,212,20,12:15,0.66893,-5.353088,0.002747,90.653687,0.005493,12,15,6,45.348186,0.012163,-45.45809,-1.4e-05,3,1,6.510417,0.047507,1.0
3,212,59,21:0,0.047361,-5.353088,0.002747,90.653687,0.005493,21,0,3,45.348186,0.012163,-45.45809,-1.4e-05,3,1,3.875,0.971942,0.146447
4,212,60,10:45,1.0,-5.353088,0.002747,90.653687,0.005493,10,45,4,45.348186,0.012163,-45.45809,-1.4e-05,3,1,4.447917,0.829673,0.933013


In [3]:
# Columns fed to the model. Of the columns shown in the df preview,
# 'timestamp' is left out and 'demand' is used as the target instead.
feature_columns = [
    'geohash6', 'day',
    'lat', 'lat_err', 'long', 'long_err',
    'h', 'm', 'dow',
    'latlong_pca0', 'latlong_pca1',
    'latlongerror_pca0', 'latlongerror_pca1',
    'latlong_cluster', 'latlongerror_cluster',
    'time_delta', 'time_delta_sin', 'hour_sin',
]

X = df[feature_columns]  # feature frame, same column order as listed above
Y = df['demand']         # regression target

In [4]:
# Preview the first rows of the selected feature columns.
X.head()

Unnamed: 0,geohash6,day,lat,lat_err,long,long_err,h,m,dow,latlong_pca0,latlong_pca1,latlongerror_pca0,latlongerror_pca1,latlong_cluster,latlongerror_cluster,time_delta,time_delta_sin,hour_sin
0,212,18,-5.353088,0.002747,90.653687,0.005493,20,0,4,45.348186,0.012163,-45.45809,-1.4e-05,3,1,4.833333,0.682671,0.25
1,212,19,-5.353088,0.002747,90.653687,0.005493,5,30,5,45.348186,0.012163,-45.45809,-1.4e-05,3,1,5.229167,0.509349,0.37059
2,212,20,-5.353088,0.002747,90.653687,0.005493,12,15,6,45.348186,0.012163,-45.45809,-1.4e-05,3,1,6.510417,0.047507,1.0
3,212,59,-5.353088,0.002747,90.653687,0.005493,21,0,3,45.348186,0.012163,-45.45809,-1.4e-05,3,1,3.875,0.971942,0.146447
4,212,60,-5.353088,0.002747,90.653687,0.005493,10,45,4,45.348186,0.012163,-45.45809,-1.4e-05,3,1,4.447917,0.829673,0.933013


In [5]:
# Print the row count, per-column dtypes and memory usage of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4206321 entries, 0 to 4206320
Data columns (total 18 columns):
geohash6                int64
day                     int64
lat                     float64
lat_err                 float64
long                    float64
long_err                float64
h                       int64
m                       int64
dow                     int64
latlong_pca0            float64
latlong_pca1            float64
latlongerror_pca0       float64
latlongerror_pca1       float64
latlong_cluster         int64
latlongerror_cluster    int64
time_delta              float64
time_delta_sin          float64
hour_sin                float64
dtypes: float64(11), int64(7)
memory usage: 577.7 MB


In [6]:
# XGBoost training parameters, assembled group by group.

# Boosting step size, tree depth and per-round row sampling.
params = {'eta': 0.037, 'max_depth': 5, 'subsample': 0.80}

# Learning objective and the metric reported for each evaluation set.
params.update({'objective': 'reg:linear', 'eval_metric': 'rmse'})

# L2 ('lambda') and L1 ('alpha') regularisation weights.
params.update({'lambda': 0.8, 'alpha': 0.4})

# Keep the library's own console messages off.
params['silent'] = 1

In [7]:
# Zero-filled accumulator with one slot per entry of Y; the fold loop adds
# each model's predictions into it.
Y_test = np.zeros(len(Y))

In [8]:
# Convert the pandas objects to plain NumPy arrays so they can be indexed
# with the integer index arrays produced by KFold.split.
# np.asarray is used instead of `.values` so the cell can be re-run safely:
# once X and Y are already ndarrays it returns them unchanged, whereas
# `.values` would raise AttributeError on the second run.
X = np.asarray(X)
Y = np.asarray(Y)

In [9]:
# DMatrix built from every row of X, without labels; the fold loop passes it
# to predict(). Mind the case: lowercase `x` is this DMatrix, uppercase `X`
# is the raw feature array.
x = xgb.DMatrix(X)

In [10]:
# Train one booster per cross-validation fold and sum each booster's
# predictions over the full data matrix `x` into Y_test.
n_splits = 4  # the later averaging step must divide by this same number
folds = KFold(n_splits=n_splits, shuffle=True, random_state=1989)

# Start from a clean accumulator so that re-running this cell does not stack
# a second round of predictions on top of an earlier run.
Y_test = np.zeros(len(Y))

for i, (train_ind, val_ind) in enumerate(folds.split(X)):
    print('Training model %d' % i)

    # Fold-specific training and validation matrices (features, labels).
    d_train = xgb.DMatrix(X[train_ind], Y[train_ind])
    d_valid = xgb.DMatrix(X[val_ind], Y[val_ind])
    # The last entry of the watchlist ('valid') drives early stopping.
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Up to 1000 boosting rounds; stop if the validation metric has not
    # improved for 100 rounds; log every 100 rounds.
    clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100,
                    verbose_eval=100)

    # NOTE(review): `x` holds every row of X, including the rows this fold's
    # booster was trained on, so the summed predictions are largely in-sample.
    # Confirm this is intended rather than out-of-fold or held-out prediction.
    # NOTE(review): predict() is called with default arguments, so the best
    # early-stopping iteration is not selected explicitly; check how the
    # installed XGBoost version handles that.
    Y_test += clf.predict(x)

Training model 0
[0]	train-rmse:0.411761	valid-rmse:0.411872
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.114934	valid-rmse:0.114728
[200]	train-rmse:0.100339	valid-rmse:0.100126
[300]	train-rmse:0.090145	valid-rmse:0.089968
[400]	train-rmse:0.083396	valid-rmse:0.083226
[500]	train-rmse:0.07746	valid-rmse:0.077316
[600]	train-rmse:0.072902	valid-rmse:0.072772
[700]	train-rmse:0.06937	valid-rmse:0.069238
[800]	train-rmse:0.066691	valid-rmse:0.066549
[900]	train-rmse:0.064798	valid-rmse:0.06466
[999]	train-rmse:0.063099	valid-rmse:0.062963
Training model 1
[0]	train-rmse:0.411816	valid-rmse:0.411827
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.113471	valid-rmse:0.113545
[200]	train-rmse:0.099716	valid-rmse:0.099848
[300]	train-rmse:0.089531	valid-rmse:

In [11]:
# Turn the summed per-fold predictions into their mean. The divisor 4 has to
# match the n_splits of the KFold used in the training loop.
# This divides in place, so running the cell twice divides twice.
Y_test /= 4

In [12]:
# Wrap the prediction array in a single-column frame named 'demand'.
df_sub = pd.DataFrame({'demand': Y_test})

In [13]:
# Write the predictions to 'xgb.csv', omitting the row index column.
df_sub.to_csv('xgb.csv',index = False)