For a given time $t$, we need a prediction over a horizon $T$ that describes all arrivals as `FlowsIn` and all departures as `FlowsOut`, where `FlowsIn[t+h,i]` and `FlowsOut[t+h,i]` are, respectively, the arrivals and departures $h$ time steps ahead of $t$ at station $i$.

The plan is as follows: for each 5-minute time bucket starting at $t$, the prediction includes all trips that start after time $t$.

In [1]:
import math
import numpy as np
import pandas as pd
import os
import tempfile

from scipy.spatial.distance import pdist, squareform
import scipy.io as sio

import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Load the raw order records; the column at position 6 is parsed as datetimes
# (presumably the 'timestamp' column used below -- TODO confirm against the CSV).
orderdf = pd.read_csv(
    'ignored_assets/orders.csv',
    parse_dates=[6],
)

In [65]:
# Distinct driver identifiers that appear in the orders.
driver_ids = orderdf['driver_id'].unique()

# Core stations: the distinct start districts, in sorted order.
core_stations = np.sort(orderdf['start_district_hash'].unique())

In [5]:
# Keep only orders with a non-null driver_id whose destination district is one
# of the core stations.
has_driver = orderdf['driver_id'].notnull()
dest_is_core = orderdf['dest_district_hash'].isin(core_stations)
orderdf_f = orderdf[has_driver & dest_is_core]

# Two-level (driver_id, timestamp) index built from the filtered columns.
myindex = pd.MultiIndex.from_arrays(
    [orderdf_f['driver_id'], orderdf_f['timestamp']],
    names=('driver_id', 'timestamp')
)

# Attach the index (the source columns stay available), then sort the rows by it.
orderdf2 = orderdf_f.set_index(myindex)
orderdf_s = orderdf2.sort_index()

In [34]:
# Bucket of the order itself: its timestamp rounded to the nearest 5 minutes.
orderdf_s['time_bucket'] = orderdf_s['timestamp'].dt.round('5min')

# Arrival time is the order timestamp plus expected_travel_time scaled by one
# minute (i.e. the column is treated as a number of minutes).
travel_duration = orderdf_s['expected_travel_time'] * pd.Timedelta('1 minute')
orderdf_s['pax_arrival_time'] = orderdf_s['timestamp'] + travel_duration

# Bucket of the arrival, rounded the same way as the order bucket.
orderdf_s['time_bucket_arrival'] = orderdf_s['pax_arrival_time'].dt.round('5min')

In [19]:
# Width of one time bucket.
stepsize = pd.Timedelta(minutes=5)
# Length of the look-ahead window covered by each prediction.
horizon = pd.Timedelta(hours=2)
# First instant for which a prediction is generated.
start = pd.Timestamp('2016-01-21 00:00:00')

In [28]:
# Number of stepsize-long intervals that fit in one day.
nsteps = int(pd.Timedelta(days=1) / stepsize)

In [87]:
# Build, for every step of the day, the ground-truth flow matrices over the
# look-ahead window that begins at that step:
#   flowsout[i] -- order counts per (time_bucket, start_district_hash)
#   flowsin[i]  -- order counts per (time_bucket_arrival, dest_district_hash)
# Both are stored in object arrays so each entry can hold its own 2-D matrix.
#
# Compatibility fixes relative to the original cell (results are unchanged):
#   * `np.object` was only an alias of the builtin `object` and has been
#     removed from NumPy; the builtin is used directly.
#   * `print` is called as a function, which behaves the same on Python 2 for
#     a single argument and is required on Python 3.
#   * `DataFrame.as_matrix()` has been removed from pandas; `.values` returns
#     the same ndarray.
#
# NOTE(review): `mask` selects buckets in (begin, end] while `base` is indexed
# on [begin, end). Because `base + counts` aligns on the union of labels, rows
# that exist only in the pivot table (bucket == end, or arrival buckets past
# the window) are appended, so the matrices can have more than
# horizon/stepsize rows (e.g. the (42, 66) shape shown for flowsin[100]).
# Confirm whether the consumer expects exactly horizon/stepsize rows.


def _flow_counts(trips, base, time_col, station_col):
    """Count orders per (time_col, station_col) and add them onto `base`.

    trips       -- DataFrame of the orders selected for the current window.
    base        -- all-zero DataFrame (time buckets x stations) that guarantees
                   every window bucket and every station is present.
    time_col    -- name of the column holding the time bucket (row labels).
    station_col -- name of the column holding the station (column labels).

    Returns a 2-D float ndarray; labels missing on either side count as 0.
    """
    counts = pd.pivot_table(trips, index=time_col,
                            columns=[station_col], aggfunc=len,
                            values='order_id', fill_value=0)
    return (base + counts).fillna(0).values


flowsout = np.zeros((nsteps,), dtype=object)
flowsin = np.zeros((nsteps,), dtype=object)
columns = core_stations
# Loop-invariant: number of buckets in one look-ahead window.
steps_per_horizon = int(horizon / stepsize)
for i in range(nsteps):
    begin = start + i*stepsize
    end = begin + horizon
    index = [begin + j*stepsize for j in range(steps_per_horizon)]
    base = pd.DataFrame(
        np.zeros((len(index), len(columns)), dtype=np.float64),
        index=index,
        columns=columns
    )
    # Orders whose (departure) bucket falls inside the window.
    mask = (orderdf_s['time_bucket'] > begin) & (orderdf_s['time_bucket'] <= end)
    print(np.sum(mask))
    # Filter once and reuse the selection for both matrices.
    window_trips = orderdf_s[mask]
    flowsout[i] = _flow_counts(window_trips, base,
                               'time_bucket', 'start_district_hash')
    flowsin[i] = _flow_counts(window_trips, base,
                              'time_bucket_arrival', 'dest_district_hash')

6488
6164
5911
5632
5388
5150
4927
4698
4519
4327
4134
3963
3799
3639
3478
3326
3169
3012
2885
2781
2661
2535
2442
2353
2253
2172
2079
1992
1911
1860
1798
1743
1675
1621
1578
1529
1479
1444
1398
1366
1345
1318
1291
1245
1243
1249
1219
1205
1205
1193
1192
1218
1233
1258
1416
1735
2042
2429
2858
3375
4063
4965
5975
7106
8483
9983
11747
13614
15572
17429
19636
21775
23986
26467
28822
31237
33807
36423
38937
41234
43603
45695
47860
49839
51575
53118
54561
55605
56314
56749
56725
56486
56141
55840
55155
54358
53446
52329
51235
49963
48575
47124
45601
44135
42634
41267
39772
38357
37035
35725
34398
33337
32336
31625
31163
30723
30286
29947
29664
29535
29422
29358
29364
29400
29513
29562
29659
29770
29805
29949
30137
30418
30691
30830
31028
31311
31582
31742
31845
32039
32167
32239
32365
32505
32590
32610
32587
32610
32537
32607
32599
32592
32683
32646
32603
32475
32406
32475
32534
32536
32563
32543
32626
32599
32686
32676
32651
32603
32806
33017
33303
33633
33934
34252
34717
35310
35819
3640

In [96]:
# Bundle both per-step flow arrays under the variable names used when saving.
predictions = {'FlowsIn': flowsin, 'FlowsOut': flowsout}

In [97]:
# Write the predictions dictionary to a MATLAB .mat file in the working directory.
sio.savemat(file_name='perfect_predictions.mat', mdict=predictions)

In [85]:
# Scratch check: rebuild the arrival matrix for the window left over from the
# last iteration of the loop above (relies on the `base` and `mask` variables
# that loop leaves behind).
arrival_counts = pd.pivot_table(
    orderdf_s[mask],
    index='time_bucket_arrival',
    columns=['dest_district_hash'],
    aggfunc=len,
    values='order_id',
    fill_value=0
)
# `.values` replaces DataFrame.as_matrix(), which has been removed from pandas;
# both return the same ndarray. Left as the last expression so it is displayed.
(base + arrival_counts).fillna(0).values

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  4.,  0.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [95]:
# Spot-check the dimensions of the arrival matrix stored for step 100.
np.shape(flowsin[100])

(42, 66)