## Contribution

For an introduction to the dataset, check out the [starting kit from Mahadir Ahmad](https://www.kaggle.com/mahadir/grab-traffic-demand-forecasting-starting-kit); it is an excellent overview.

## Required libraries

In [1]:
!pip3 install Geohash python-geohash pandas numpy

[33mYou are using pip version 18.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import Geohash
import pandas as pd
import numpy as np

In [3]:
# Load the raw training records; the relative path is resolved against the
# notebook's working directory.
training_csv_path = "Traffic Management/training.csv"
df = pd.read_csv(training_csv_path)

## Preprocessing (copied from the [starting kit from Mahadir Ahmad](https://www.kaggle.com/mahadir/grab-traffic-demand-forecasting-starting-kit))

In [4]:
# Count the rows recorded for each geohash6 value (non-null 'day' entries)
# and list the most frequently observed geohashes first.
rows_per_geohash = df.groupby('geohash6', as_index=False).agg({'day': 'count'})
geohashes_df = (
    rows_per_geohash
    .rename(columns={'day': 'count'})
    .sort_values(by='count', ascending=False)
)

In [5]:
# Decode every geohash6 string once. decode_exactly yields four values that
# are used here as (lat, long, lat_err, long_err), in that order.
# The decoded values are built in a single pass and aligned on geohashes_df's
# own index, so the result does not depend on the index labels being 0..n-1
# (the frame was sorted by count), and the new columns are float64 instead of
# object columns filled cell by cell.
decoded_df = pd.DataFrame(
    [Geohash.decode_exactly(code) for code in geohashes_df['geohash6']],
    columns=['lat', 'long', 'lat_err', 'long_err'],
    index=geohashes_df.index,
    dtype='float64',
)
# Append the columns in the same order as before: lat, lat_err, long, long_err.
for coord_col in ['lat', 'lat_err', 'long', 'long_err']:
    geohashes_df[coord_col] = decoded_df[coord_col]

In [6]:
# Attach the decoded coordinate columns to every training row. The per-geohash
# 'count' column is dropped first so that only the coordinates are added.
geohash_coords = geohashes_df.drop(columns=['count'])
df = pd.merge(df, geohash_coords, on='geohash6', how='inner')

In [7]:
# Make sure the four coordinate columns are stored as float64.
for coord_col in ('lat', 'long', 'lat_err', 'long_err'):
    df[coord_col] = df[coord_col].astype('float64')

In [8]:
# Preview the first rows to check the merged coordinate columns.
df.head()

Unnamed: 0,geohash6,day,timestamp,demand,lat,lat_err,long,long_err
0,qp03wc,18,20:0,0.020072,-5.353088,0.002747,90.653687,0.005493
1,qp03wc,19,5:30,0.512506,-5.353088,0.002747,90.653687,0.005493
2,qp03wc,20,12:15,0.66893,-5.353088,0.002747,90.653687,0.005493
3,qp03wc,59,21:0,0.047361,-5.353088,0.002747,90.653687,0.005493
4,qp03wc,60,10:45,1.0,-5.353088,0.002747,90.653687,0.005493


In [9]:
# 'timestamp' is an "hour:minute" string; split it once and store both parts
# as integers.
hour_minute = df['timestamp'].str.split(':', expand=True)
df['h'] = hour_minute[0].astype('int64')
df['m'] = hour_minute[1].astype('int64')
# Position of the day inside a 7-day cycle (0..6).
df['dow'] = df['day'] % 7

## Feature extraction

In [10]:
from sklearn.decomposition import PCA

#### PCA projection of the lat/long coordinates and their decoding errors

In [11]:
# Build one two-column matrix: the (lat, long) rows followed by the
# (lat_err, long_err) rows, then fit a PCA on it and project the same matrix.
coord_pairs = df[['lat', 'long']].values
error_pairs = df[['lat_err', 'long_err']].values
X = np.concatenate([coord_pairs, error_pairs], axis=0)
pca = PCA()
pca.fit(X)
X_pca = pca.transform(X)

In [12]:
# Project the coordinates and their error margins with the fitted PCA.
# Each input is transformed once (previously every projection was computed
# twice), and plain arrays are passed because `pca` was fitted on an array;
# this avoids scikit-learn's feature-name mismatch warning on newer versions.
latlong_proj = pca.transform(df[['lat', 'long']].values)
df['latlong_pca0'] = latlong_proj[:, 0]
df['latlong_pca1'] = latlong_proj[:, 1]

latlongerror_proj = pca.transform(df[['lat_err', 'long_err']].values)
df['latlongerror_pca0'] = latlongerror_proj[:, 0]
df['latlongerror_pca1'] = latlongerror_proj[:, 1]

In [13]:
# Preview the first rows to check the time columns and PCA features.
df.head()

Unnamed: 0,geohash6,day,timestamp,demand,lat,lat_err,long,long_err,h,m,dow,latlong_pca0,latlong_pca1,latlongerror_pca0,latlongerror_pca1
0,qp03wc,18,20:0,0.020072,-5.353088,0.002747,90.653687,0.005493,20,0,4,45.348186,0.012163,-45.45809,-1.4e-05
1,qp03wc,19,5:30,0.512506,-5.353088,0.002747,90.653687,0.005493,5,30,5,45.348186,0.012163,-45.45809,-1.4e-05
2,qp03wc,20,12:15,0.66893,-5.353088,0.002747,90.653687,0.005493,12,15,6,45.348186,0.012163,-45.45809,-1.4e-05
3,qp03wc,59,21:0,0.047361,-5.353088,0.002747,90.653687,0.005493,21,0,3,45.348186,0.012163,-45.45809,-1.4e-05
4,qp03wc,60,10:45,1.0,-5.353088,0.002747,90.653687,0.005493,10,45,4,45.348186,0.012163,-45.45809,-1.4e-05


#### Extra cluster features using k-means

In [14]:
from sklearn.cluster import MiniBatchKMeans

In [15]:
# 8**2 = 64 clusters, mini-batches of 32**3 = 32768 samples, fitted on the
# stacked matrix X.
# random_state is fixed so that the cluster ids written to the dataset are
# reproducible; without it each run may produce a different clustering.
kmeans = MiniBatchKMeans(n_clusters=8**2, batch_size=32**3, random_state=42).fit(X)

In [16]:
# Assign each row's coordinates and error margins to a k-means cluster.
# Plain arrays are passed because `kmeans` was fitted on an array; handing it
# DataFrames triggers scikit-learn's feature-name mismatch warning on newer
# versions. The predicted labels are the same either way.
df['latlong_cluster'] = kmeans.predict(df[['lat', 'long']].values)
df['latlongerror_cluster'] = kmeans.predict(df[['lat_err', 'long_err']].values)

In [17]:
# Preview the first rows to check the two cluster columns.
df.head()

Unnamed: 0,geohash6,day,timestamp,demand,lat,lat_err,long,long_err,h,m,dow,latlong_pca0,latlong_pca1,latlongerror_pca0,latlongerror_pca1,latlong_cluster,latlongerror_cluster
0,qp03wc,18,20:0,0.020072,-5.353088,0.002747,90.653687,0.005493,20,0,4,45.348186,0.012163,-45.45809,-1.4e-05,3,1
1,qp03wc,19,5:30,0.512506,-5.353088,0.002747,90.653687,0.005493,5,30,5,45.348186,0.012163,-45.45809,-1.4e-05,3,1
2,qp03wc,20,12:15,0.66893,-5.353088,0.002747,90.653687,0.005493,12,15,6,45.348186,0.012163,-45.45809,-1.4e-05,3,1
3,qp03wc,59,21:0,0.047361,-5.353088,0.002747,90.653687,0.005493,21,0,3,45.348186,0.012163,-45.45809,-1.4e-05,3,1
4,qp03wc,60,10:45,1.0,-5.353088,0.002747,90.653687,0.005493,10,45,4,45.348186,0.012163,-45.45809,-1.4e-05,3,1


In [18]:
# Position inside the 7-day cycle as a fractional day count:
# day-of-week plus the elapsed fraction of that day.
hours_into_day = df['h'] + (df['m'] / 60.0)
df['time_delta'] = df['dow'] + (hours_into_day / 24.0)

In [19]:
# Squared-sine encodings: sin^2(pi * t / period) repeats every `period`,
# so the weekly feature has period 7 (days) and the hourly one period 24.
weekly_phase = (df['time_delta'] / 7) * np.pi
df['time_delta_sin'] = np.sin(weekly_phase) ** 2

hourly_phase = (df['h'] / 24) * np.pi
df['hour_sin'] = np.sin(hourly_phase) ** 2

In [20]:
from sklearn.preprocessing import LabelEncoder

# Replace the geohash6 strings with integer labels (overwrites the column).
geohash_encoder = LabelEncoder()
df['geohash6'] = geohash_encoder.fit_transform(df['geohash6'])

In [21]:
# Preview the first rows of the fully augmented frame before exporting it.
df.head()

Unnamed: 0,geohash6,day,timestamp,demand,lat,lat_err,long,long_err,h,m,dow,latlong_pca0,latlong_pca1,latlongerror_pca0,latlongerror_pca1,latlong_cluster,latlongerror_cluster,time_delta,time_delta_sin,hour_sin
0,212,18,20:0,0.020072,-5.353088,0.002747,90.653687,0.005493,20,0,4,45.348186,0.012163,-45.45809,-1.4e-05,3,1,4.833333,0.682671,0.25
1,212,19,5:30,0.512506,-5.353088,0.002747,90.653687,0.005493,5,30,5,45.348186,0.012163,-45.45809,-1.4e-05,3,1,5.229167,0.509349,0.37059
2,212,20,12:15,0.66893,-5.353088,0.002747,90.653687,0.005493,12,15,6,45.348186,0.012163,-45.45809,-1.4e-05,3,1,6.510417,0.047507,1.0
3,212,59,21:0,0.047361,-5.353088,0.002747,90.653687,0.005493,21,0,3,45.348186,0.012163,-45.45809,-1.4e-05,3,1,3.875,0.971942,0.146447
4,212,60,10:45,1.0,-5.353088,0.002747,90.653687,0.005493,10,45,4,45.348186,0.012163,-45.45809,-1.4e-05,3,1,4.447917,0.829673,0.933013


In [22]:
# Write the augmented frame next to the notebook, without the index column.
augmented_csv_path = 'augmented.csv'
df.to_csv(augmented_csv_path, index=False)