In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import pickle
import skmob
from shapely.geometry import Point
from skmob.tessellation import tilers

In [4]:
# Notebook configuration.
# NOTE(review): CACHE is not read by any of the visible cells — confirm it is still needed.
CACHE = True
# Pickle file that the next cell loads into `df`.
filename_df = 'NYC150060min_df.pkl'

In [5]:
# Load the pickled object stored in `filename_df` into `df`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only open pickle files that come from a trusted source.
with open(filename_df, "rb") as input_file:
    df = pickle.load(input_file)

In [6]:
df

Unnamed: 0,starttime,start station latitude,start station longitude,end station latitude,end station longitude,origin,destination
0,2014-04-01 00:00:07,40.711174,-74.000165,40.705693,-74.016777,12.0,5.0
1,2014-04-01 00:00:20,40.718502,-73.983299,40.722055,-73.989111,20.0,20.0
2,2014-04-01 00:00:25,40.730287,-73.990765,40.742388,-73.997262,21.0,15.0
3,2014-04-01 00:00:44,40.715348,-73.960241,40.708273,-73.968341,,28.0
4,2014-04-01 00:01:29,40.746745,-74.007756,40.738274,-73.987520,9.0,22.0
...,...,...,...,...,...,...,...
953882,9/30/2014 23:58:42,40.726795,-73.996951,40.720828,-73.977932,14.0,29.0
953883,9/30/2014 23:59:13,40.750450,-73.994811,40.730473,-73.986724,16.0,21.0
953884,9/30/2014 23:59:29,40.711174,-74.000165,40.705693,-74.016777,12.0,5.0
953885,9/30/2014 23:59:58,40.760193,-73.991255,40.751873,-73.977706,24.0,32.0


In [7]:
# Build a "squared" tessellation over the Manhattan base shape with meters=1500.
# NOTE(review): presumably the base_shape string is resolved through an online
# geocoding lookup — confirm whether this cell needs network access.
tessellation = tilers.tiler.get("squared", base_shape="Manhattan, New York City, USA", meters=1500)

In [10]:
# Columns of `df` that are kept for the flow analysis.
list_features=["origin", "destination", "starttime"]
# Width of one time bin; passed as `freq` to pd.Grouper when the flows are aggregated.
sample_time="60min"
# Size of the last axis of X_dataset (channel 0 is filled with inflow, channel 1 with outflow).
flows=2

In [14]:
def get_xy_location(tessellation, decimals=None):
    """
    Map every tile of a tessellation to integer (x, y) positions on a grid.

    The distinct centroid x values (and, independently, the distinct centroid
    y values) are sorted in ascending order and numbered from 0. Each tile is
    then assigned the rank of its own centroid coordinates.

    Parameters
    ----------
    tessellation : mapping with a 'geometry' entry
        ``tessellation['geometry']`` must be an iterable of objects exposing
        ``.centroid.x`` and ``.centroid.y``.
    decimals : int, optional
        When given, centroid coordinates are rounded to this many decimals
        before they are compared, so that values differing only by
        floating-point noise share one grid index. The default (``None``)
        compares the raw coordinates exactly.

    Returns
    -------
    positions : list of (int, int)
        One ``(x_index, y_index)`` pair per geometry, in input order.
    x_size : int
        Number of distinct x coordinates.
    y_size : int
        Number of distinct y coordinates.
    """
    # Each centroid is computed once and reused for both the ranking and the lookup.
    centroids = [geometry.centroid for geometry in tessellation['geometry']]

    def _coordinate_key(value):
        # Exact comparison unless the caller asked for rounding.
        return value if decimals is None else round(value, decimals)

    x_keys = [_coordinate_key(centroid.x) for centroid in centroids]
    y_keys = [_coordinate_key(centroid.y) for centroid in centroids]

    # Rank of every distinct coordinate along each axis.
    x_rank = {coordinate: position for position, coordinate in enumerate(sorted(set(x_keys)))}
    y_rank = {coordinate: position for position, coordinate in enumerate(sorted(set(y_keys)))}

    positions = [(x_rank[x], y_rank[y]) for x, y in zip(x_keys, y_keys)]
    return positions, len(x_rank), len(y_rank)

def get_xy_map(df, m_shape):
    """
    Scatter aggregated flow values into a dense (time, x, y) array.

    Parameters
    ----------
    df : DataFrame-like
        ``df.iterrows()`` must yield ``(index, row)`` pairs where ``index[0]``
        is the timestamp, ``index[1]`` is an ``(x, y)`` grid position and
        ``row['flow']`` is the value to store.
    m_shape : sequence of int
        Shape of the array to create, ``[n_timestamps, x_size, y_size]``.

    Returns
    -------
    X : numpy.ndarray
        Array of zeros of shape ``m_shape`` with every flow value written at
        ``X[slot, x, y]``.
    time_samples : set
        The distinct timestamps found in ``df``. A set carries no order; sort
        it before pairing it with the first axis of ``X``.

    Notes
    -----
    A timestamp's slot along the first axis is its order of first appearance
    in ``df``. A timestamp that reappears later is written to its own slot,
    not to the slot of whichever timestamp was seen last. Timestamps absent
    from ``df`` get no slot, so slots are consecutive even if the underlying
    time range has gaps.
    """
    X = np.zeros(m_shape)
    slot_of_timestamp = {}
    for metadata, flow in df.iterrows():
        timestamp = metadata[0]
        # New timestamps get the next free slot; known ones reuse theirs.
        t = slot_of_timestamp.setdefault(timestamp, len(slot_of_timestamp))
        x = metadata[1][0]
        y = metadata[1][1]
        X[t, int(x), int(y)] = flow['flow']
    return X, set(slot_of_timestamp)

In [11]:
# Keep only complete records (some trips have no origin tile). Rebinding `df`
# instead of dropping in place leaves the originally loaded object untouched.
df = df.dropna()

# Build the per-trip table as a new frame. Assigning columns on a plain slice
# of `df` is chained assignment and triggers pandas' SettingWithCopyWarning.
fdf = df[list_features].assign(
    origin=lambda trips: trips['origin'].astype(int),
    destination=lambda trips: trips['destination'].astype(int),
    flow=1,  # every record counts as one trip
)

fdf = skmob.FlowDataFrame(fdf,
                        tessellation=tessellation,
                        origin="origin",
                        destination="destination",
                        tile_id='tile_ID')

# NOTE(review): the displayed `df` mixes two starttime formats
# ('2014-04-01 00:00:07' and '9/30/2014 23:58:42'); confirm pd.to_datetime
# parses both with the pandas version in use.
flow_df = fdf[['starttime','origin','destination', 'flow']]
flow_df['starttime'] = pd.to_datetime(flow_df.starttime)
flow_df['origin'] = flow_df['origin'].astype(int)
flow_df['destination'] = flow_df['destination'].astype(int)

In [13]:
flow_df

Unnamed: 0,starttime,origin,destination,flow
0,2014-04-01 00:00:07,12,5,1
1,2014-04-01 00:00:20,20,20,1
2,2014-04-01 00:00:25,21,15,1
4,2014-04-01 00:01:29,9,22,1
5,2014-04-01 00:01:53,33,24,1
...,...,...,...,...
953882,2014-09-30 23:58:42,14,29,1
953883,2014-09-30 23:59:13,16,21,1
953884,2014-09-30 23:59:29,12,5,1
953885,2014-09-30 23:59:58,24,32,1


In [15]:
print("Getting xy location")
tessellation['map_point'], x_size, y_size = get_xy_location(tessellation)

# Create a dict to map each tile_ID to a point of the xy_map
tile_to_xy = dict(zip(tessellation['tile_ID'], tessellation['map_point']))

# Transform the origin and the destination from tile_IDs to position of the xy_map.
# NOTE: this overwrites the tile IDs in place, so the cell cannot be re-run
# without first re-running the cell that builds flow_df.
flow_df['origin'] = [tile_to_xy[str(el)] for el in flow_df['origin']]
flow_df['destination'] = [tile_to_xy[str(el)] for el in flow_df['destination']]

n_timestamps = flow_df.groupby(pd.Grouper(key='starttime', freq=sample_time)).ngroups

# Aggregate only the 'flow' column. The other remaining column holds (x, y)
# tuples, and summing it is not meaningful.
f_out = flow_df.groupby([pd.Grouper(key='starttime', freq=sample_time),'origin'])[['flow']].sum()
f_in = flow_df.groupby([pd.Grouper(key='starttime', freq=sample_time),'destination'])[['flow']].sum()

Getting xy location


In [19]:
fdf.plot_tessellation(popup_features=['origin', 'flow'])

In [22]:
# Filling numpy array (zeros, so no channel is ever left uninitialised)
X_dataset = np.zeros([n_timestamps, x_size, y_size, flows])

print("Getting xy map")
X_dataset[:,:,:,0], _ = get_xy_map(f_in, [n_timestamps, x_size, y_size]) # Inflow
X_dataset[:,:,:,1], time_samples = get_xy_map(f_out, [n_timestamps, x_size, y_size]) # Outflow

# get_xy_map returns the timestamps as a set, which has no order. Sort them
# so that entry i of time_string labels slice i of X_dataset's first axis
# (the grouped frames are iterated in ascending time order).
time_samples = sorted(time_samples)

# Adapting the time_samples list: keep the YYYYMMDDHH digits, add 1, encode as bytes
time_string = [
    str(int(str(time_sample).replace("-", "").replace(" ","").replace(":", "")[:10])+1).encode('utf-8')
    for time_sample in time_samples
]

Getting xy map


In [26]:
X_dataset[0].shape

(21, 28, 2)

In [139]:
# NOTE(review): this overrides the flows=2 value set earlier in the notebook.
# With flows == 1 the array-filling cell would allocate a single channel and
# its write to channel index 1 would fail if that cell were run again.
flows = 1

In [140]:
X_dataset.shape

(4392, 21, 28, 2)

In [152]:
def remove_empty_rows(X_dataset, flows):
    """
    Crop grid rows and columns that never carry any flow.

    Parameters
    ----------
    X_dataset : array-like of shape (time, x, y, channels)
        Flow maps; only the first ``flows`` channels are considered and returned.
    flows : int
        Number of leading channels to keep.

    Returns
    -------
    numpy.ndarray
        New float array of shape ``(time, kept_x, kept_y, flows)``. A grid row
        (axis 1) or column (axis 2) is kept when at least one of the selected
        channels has a non-zero value in it at any time step.

    Notes
    -----
    The same row/column mask is applied to every channel. Separate per-channel
    masks can crop channels differently, which either fails with a shape
    mismatch or silently stacks different grid cells at the same index.
    """
    selected = np.asarray(X_dataset)[:, :, :, :flows]

    # occupied[x, y] is True when any time step of any selected channel is non-zero there.
    occupied = (selected != 0).any(axis=(0, 3))
    keep_rows = occupied.any(axis=1)
    keep_cols = occupied.any(axis=0)

    cropped = selected[:, keep_rows][:, :, keep_cols]
    # Always hand back a fresh float array, independent of the input buffer.
    return cropped.astype(np.float64)

In [153]:
# Crop every channel the array actually has, rather than a hard-coded count
# that can drift from the array's shape or from the mutable `flows` global.
X = remove_empty_rows(X_dataset, X_dataset.shape[-1])

In [126]:
# Split X_dataset into its two channels (index 0 was filled as inflow, index 1 as outflow).
X_in = X_dataset[:,:,:,0]
X_out = X_dataset[:,:,:,1]

In [127]:
# Sum each channel over its first (time) axis, leaving one total per grid cell.
X_in_sum = np.add.reduce(X_in)
X_out_sum = np.add.reduce(X_out)

In [128]:
# Drop the grid rows (axis 1) whose totals in X_in_sum are zero in every column ...
X_in = X_in[:,~(X_in_sum==0).all(1)]
# ... then drop the grid columns (axis 2) whose totals are zero in every row.
X_in = X_in[:,:,~(X_in_sum.T==0).all(1)]


In [131]:
X_in.shape[1]

8

In [123]:
# Same trimming for the outflow channel, using masks derived from X_out_sum.
# NOTE(review): these masks are computed independently of the inflow masks, so
# X_in and X_out only stay aligned if both have the same empty rows/columns.
X_out = X_out[:,~(X_out_sum==0).all(1)]
X_out = X_out[:,:,~(X_out_sum.T==0).all(1)]

In [124]:
# Recompute the per-cell totals from the current X_in / X_out arrays.
X_in_sum = np.add.reduce(X_in)
X_out_sum = np.add.reduce(X_out)

In [125]:
print(X_in_sum.shape)
print(X_out_sum.shape)


(8, 13)
(8, 13)


In [116]:
X_in_sum

array([[ 49659.,      0., 188026.,      0., 209318.,  74156.,      0.,
             0.,      0.,      0.,  30708.,      0.,      0.],
       [     0.,      0.,      0.,      0.,      0.,      0.,      0.,
         79409.,      0.,      0.,      0.,      0.,      0.],
       [     0.,  28935.,      0., 135835., 268121.,      0., 331398.,
             0., 442033., 267148.,      0., 146850.,  17371.],
       [     0.,      0.,      0.,  66080., 259889.,      0., 461431.,
             0., 371599., 318260.,      0., 252332., 135841.],
       [     0.,      0.,   9435.,      0.,  29237.,  55222.,      0.,
        170076.,      0.,      0.,      0., 165451.,  26345.],
       [     0.,      0.,      0.,      0.,      0.,      0.,      0.,
             0.,      0.,      0., 235388.,      0.,      0.],
       [     0.,      0.,      0.,      0.,      0.,      0.,      0.,
             0.,      0.,      0.,      0.,  35200.,      0.],
       [     0.,      0.,      0.,      0.,      0.,      0., 

In [117]:
X_in_sum.T[~(X_in_sum.T==0).all(1)]

array([[ 49659.,      0.,      0.,      0.,      0.,      0.,      0.,
             0.],
       [     0.,      0.,  28935.,      0.,      0.,      0.,      0.,
             0.],
       [188026.,      0.,      0.,      0.,   9435.,      0.,      0.,
             0.],
       [     0.,      0., 135835.,  66080.,      0.,      0.,      0.,
             0.],
       [209318.,      0., 268121., 259889.,  29237.,      0.,      0.,
             0.],
       [ 74156.,      0.,      0.,      0.,  55222.,      0.,      0.,
             0.],
       [     0.,      0., 331398., 461431.,      0.,      0.,      0.,
             0.],
       [     0.,  79409.,      0.,      0., 170076.,      0.,      0.,
             0.],
       [     0.,      0., 442033., 371599.,      0.,      0.,      0.,
             0.],
       [     0.,      0., 267148., 318260.,      0.,      0.,      0.,
          9067.],
       [ 30708.,      0.,      0.,      0.,      0., 235388.,      0.,
             0.],
       [     0.,     

In [45]:
print(X_in_sum.T == 0)

[[ True  True  True  True  True  True  True  True  True  True  True  True
   True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True  True  True  True  True  True  True  True
   True  True  True  True  True  True  True  True  True]
 [ True  True  True False  True  True  True  True  True  True  True  True
   True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True False  True  True  True  True  True  True
   True  True  True  True  True  True  True  True  True]
 [ True  True  True False  True  True  True False  True  True  True  True
   True  True  True  True  True  True  True  True  True]
 [ True  True  True  True  True False False  True  True  True  True  True
   True  True  True  True  True  True  True  True  True]
 [ True  True  True False  True False False False  True  True  True  True
   True  True  True  True  True  True  True  True  True]
 [ True  True  True False  True  True  True False  True  True  True  True
   True  

In [101]:
X_in_sum.shape

(8, 28)

In [38]:
# Print, for each row of X_in_sum, whether every entry in that row is zero.
for row in X_in_sum:
    print(np.all(row == 0))

True
True
True
False
False
False
False
False
False
True
False
False
True
True
True
True
True
True
True
True
True


In [42]:
# Same check on the transpose: one line per column of X_in_sum.
for row in X_in_sum.T:
    print(np.all(row == 0))

True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
True
True
True
True
True
True
True
True
True
True
