In [1]:
from shapely.geometry import Point, Polygon
from shapely import wkt
import skmob
from skmob.preprocessing import filtering, detection
from skmob.utils.plot import plot_gdf
from skmob.tessellation import tilers
import pandas as pd
import geopandas as gpd
import webbrowser
import numpy as np
import networkx as nx

pd.set_option('display.max_columns', 500)

In [2]:
meters = 500

# UTILITIES #

In [3]:
def wkt_loads(x):
    try:
        return wkt.loads(x)
    except Exception:
        return None

In [4]:
#REQUIRE a geodataframe with a column named 'geometry_source' and another named 'geometry_target'
#                   and with an unique row-identifier column named 'id'
#REQUIRE a tesselletion in OpenStreetMap format

def flow_from_gdf_in_tess(df_city, tess_city):
    gdf_1 = gpd.GeoDataFrame(df_city, crs="EPSG:4326", geometry=df_city['geometry_source']).drop(['geometry_target', 'geometry_source'],axis =1)
    
    res_merge_source = gpd.sjoin(tess_city, gdf_1, how='right', op='contains')
    res_merge_source.dropna(inplace=True)
    res_merge_source.reset_index(drop=True, inplace=True)
    
    print("res_merge_source")
    print(res_merge_source.head())
    
    gdf_2 = gpd.GeoDataFrame(df_city, crs="EPSG:4326", geometry=df_city['geometry_target']).drop(['geometry_target', 'geometry_source'],axis =1)

    res_merge_dest = gpd.sjoin(tess_city, gdf_2, how='right', op='contains')
    res_merge_dest.dropna(inplace=True)
    res_merge_dest.reset_index(drop=True,inplace=True)
    
    print("res_merge_dest")
    print(res_merge_dest.head())
    
    fdf = res_merge_dest.merge(res_merge_source, how='inner', on=['id'])
    fdf = fdf[['tile_ID_x','tile_ID_y']]
    
    print("fdf")
    print(fdf.head())
    
    flusso = fdf.groupby(["tile_ID_x", "tile_ID_y"]).size().reset_index(name="flow")
    flusso = flusso.rename(columns={'tile_ID_x': 'origin', 'tile_ID_y':'destination'})
    
    print(flusso.head())
    
    fdf_city = skmob.FlowDataFrame(data = flusso,tessellation=tess_city, tile_id='tile_ID', origin ='origin', 
                              destination = 'destination', flow = 'flow')
    
    return fdf_city
    

In [5]:
def to_Adj_Matrix(df):
    df_np = df.to_numpy().astype(int)
    n = max(max(origin, destination) for origin, destination, flow in df_np )
    
    matrix = np.zeros((n+1, n+1))
    for origin, destination, flow in df_np:
        matrix[origin][destination] = flow 
    return matrix

In [6]:
def pad_with_zeros(A, r, c):
   out = np.zeros((r, c))
   r_, c_ = np.shape(A)
   out[0:r_, 0:c_] = A
   return out

## NYC ##

In [7]:
tess_nyc = tilers.tiler.get("squared", meters=meters, base_shape="New York City, New York")

  return _prepare_from_string(" ".join(pjargs))


In [8]:
df_nyc = pd.read_csv('data/201802-citibike-tripdata.csv.zip')

In [9]:
df_nyc['id'] = df_nyc.apply(lambda x: hash(tuple(x)), axis = 1)

df_nyc.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,id
0,771,2018-02-01 00:14:16.4120,2018-02-01 00:27:08.2290,72,W 52 St & 11 Ave,40.767272,-73.993929,379,W 31 St & 7 Ave,40.749156,-73.9916,14536,Subscriber,1952,1,-2972913605051034564
1,264,2018-02-01 05:14:45.1790,2018-02-01 05:19:09.6860,72,W 52 St & 11 Ave,40.767272,-73.993929,478,11 Ave & W 41 St,40.760301,-73.998842,32820,Subscriber,1965,1,-6099263867002803392
2,819,2018-02-01 06:48:55.2290,2018-02-01 07:02:35.0290,72,W 52 St & 11 Ave,40.767272,-73.993929,405,Washington St & Gansevoort St,40.739323,-74.008119,16131,Subscriber,1968,1,-8094522964782754194
3,646,2018-02-01 07:12:50.1740,2018-02-01 07:23:36.5280,72,W 52 St & 11 Ave,40.767272,-73.993929,2006,Central Park S & 6 Ave,40.765909,-73.976342,20831,Subscriber,1990,2,2005487222453508178
4,1312,2018-02-01 07:46:48.8750,2018-02-01 08:08:41.5430,72,W 52 St & 11 Ave,40.767272,-73.993929,435,W 21 St & 6 Ave,40.74174,-73.994156,15899,Subscriber,1957,1,-1502784927976863954


In [10]:

df_nyc['geometry_source'] = [Point(xy) for xy in zip(df_nyc['start station longitude'],df_nyc['start station latitude'])]
df_nyc['geometry_target'] = [Point(xy) for xy in zip(df_nyc['end station longitude'],df_nyc['end station latitude'])]



columns = ['tripduration', 'start station name', 'start station latitude', 'start station longitude', 'end station name', 'end station latitude', 'end station longitude', 'usertype', 'birth year', 'gender', 'starttime', 'stoptime', 'bikeid']
df_nyc = df_nyc.drop(columns, axis = 1) 

df_nyc.head()


Unnamed: 0,start station id,end station id,id,geometry_source,geometry_target
0,72,379,-2972913605051034564,POINT (-73.99392888 40.76727216),POINT (-73.99160000000001 40.749156)
1,72,478,-6099263867002803392,POINT (-73.99392888 40.76727216),POINT (-73.99884222 40.76030096)
2,72,405,-8094522964782754194,POINT (-73.99392888 40.76727216),POINT (-74.00811899999999 40.739323)
3,72,2006,2005487222453508178,POINT (-73.99392888 40.76727216),POINT (-73.97634151 40.76590936)
4,72,435,-1502784927976863954,POINT (-73.99392888 40.76727216),POINT (-73.99415556 40.74173969)


In [11]:
fdf_nyc = flow_from_gdf_in_tess(df_nyc, tess_nyc)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +init=epsg:4326 +type=crs
Right CRS: EPSG:4326

  


res_merge_source
   index_left tile_ID  start station id  end station id                   id  \
0      2740.0    2740                72             379 -2972913605051034564   
1      2740.0    2740                72             478 -6099263867002803392   
2      2740.0    2740                72             405 -8094522964782754194   
3      2740.0    2740                72            2006  2005487222453508178   
4      2740.0    2740                72             435 -1502784927976863954   

                     geometry  
0  POINT (-73.99393 40.76727)  
1  POINT (-73.99393 40.76727)  
2  POINT (-73.99393 40.76727)  
3  POINT (-73.99393 40.76727)  
4  POINT (-73.99393 40.76727)  


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +init=epsg:4326 +type=crs
Right CRS: EPSG:4326



res_merge_dest
   index_left tile_ID  start station id  end station id                   id  \
0      2734.0    2734                72             379 -2972913605051034564   
1      2579.0    2579                72             478 -6099263867002803392   
2      2423.0    2423                72             405 -8094522964782754194   
3      2985.0    2985                72            2006  2005487222453508178   
4      2652.0    2652                72             435 -1502784927976863954   

                     geometry  
0  POINT (-73.99160 40.74916)  
1  POINT (-73.99884 40.76030)  
2  POINT (-74.00812 40.73932)  
3  POINT (-73.97634 40.76591)  
4  POINT (-73.99416 40.74174)  
fdf
  tile_ID_x tile_ID_y
0      2734      2740
1      2579      2740
2      2423      2740
3      2985      2740
4      2652      2740
  origin destination  flow
0   2254        2254    40
1   2271        2271    55
2   2271        2272    44
3   2271        2274    93
4   2271        2339    51


In [12]:
fdf_nyc.head()

Unnamed: 0,origin,destination,flow
0,2254,2254,40
1,2271,2271,55
2,2271,2272,44
3,2271,2274,93
4,2271,2339,51


In [13]:
fdf_nyc.to_csv("fdf_nyc_2.csv")

In [14]:
fdf_nyc_np = to_Adj_Matrix(fdf_nyc)

fdf_nyc_np

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 9.]])

In [15]:
fdf_nyc_np[2254,2254]

40.0

# PORTO #

In [16]:
tess_por = tilers.tiler.get("squared", meters=meters, base_shape="Porto, Área Metropolitana do Porto, North, Portugal")

  return _prepare_from_string(" ".join(pjargs))


In [17]:
df_por = pd.read_csv('data/porto_trajectories_all.csv')
df_por = df_por.drop_duplicates()


mask = df_por.trajectory_id.duplicated(keep=False)
prova = df_por[mask]
prova = df_por.drop_duplicates(subset=['trajectory_id'])
df_por = prova



df_por['geometry_source'] = df_por.source_point.apply(wkt_loads)
df_por['geometry_target'] = df_por.target_point.apply(wkt_loads)


In [18]:
df_por.head()

Unnamed: 0,taxi_id,trajectory_id,timestamp,source_point,target_point,geometry_source,geometry_target
0,20000589,1372636858620000589,2013-07-01 00:00:58,POINT(-8.618643 41.141412),POINT(-8.630838 41.154489),POINT (-8.618643 41.141412),POINT (-8.630838000000001 41.154489)
1,20000596,1372637303620000596,2013-07-01 00:08:23,POINT(-8.639847 41.159826),POINT(-8.66574 41.170671),POINT (-8.639847 41.159826),POINT (-8.66574 41.170671)
2,20000320,1372636951620000320,2013-07-01 00:02:31,POINT(-8.612964 41.140359),POINT(-8.61597 41.14053),POINT (-8.612964 41.140359),POINT (-8.615970000000001 41.14053)
3,20000520,1372636854620000520,2013-07-01 00:00:54,POINT(-8.574678 41.151951),POINT(-8.607996 41.142915),POINT (-8.574678 41.151951),POINT (-8.607996 41.142915)
4,20000337,1372637091620000337,2013-07-01 00:04:51,POINT(-8.645994 41.18049),POINT(-8.687268 41.178087),POINT (-8.645994 41.18049),POINT (-8.687268 41.178087)


In [19]:
df_por['id'] = df_por['trajectory_id']
df_por.head()

Unnamed: 0,taxi_id,trajectory_id,timestamp,source_point,target_point,geometry_source,geometry_target,id
0,20000589,1372636858620000589,2013-07-01 00:00:58,POINT(-8.618643 41.141412),POINT(-8.630838 41.154489),POINT (-8.618643 41.141412),POINT (-8.630838000000001 41.154489),1372636858620000589
1,20000596,1372637303620000596,2013-07-01 00:08:23,POINT(-8.639847 41.159826),POINT(-8.66574 41.170671),POINT (-8.639847 41.159826),POINT (-8.66574 41.170671),1372637303620000596
2,20000320,1372636951620000320,2013-07-01 00:02:31,POINT(-8.612964 41.140359),POINT(-8.61597 41.14053),POINT (-8.612964 41.140359),POINT (-8.615970000000001 41.14053),1372636951620000320
3,20000520,1372636854620000520,2013-07-01 00:00:54,POINT(-8.574678 41.151951),POINT(-8.607996 41.142915),POINT (-8.574678 41.151951),POINT (-8.607996 41.142915),1372636854620000520
4,20000337,1372637091620000337,2013-07-01 00:04:51,POINT(-8.645994 41.18049),POINT(-8.687268 41.178087),POINT (-8.645994 41.18049),POINT (-8.687268 41.178087),1372637091620000337


In [20]:
columns = ['taxi_id', 'trajectory_id', 'timestamp', 'source_point', 'target_point']
df_por = df_por.drop(columns, axis = 1) 
df_por.head()

Unnamed: 0,geometry_source,geometry_target,id
0,POINT (-8.618643 41.141412),POINT (-8.630838000000001 41.154489),1372636858620000589
1,POINT (-8.639847 41.159826),POINT (-8.66574 41.170671),1372637303620000596
2,POINT (-8.612964 41.140359),POINT (-8.615970000000001 41.14053),1372636951620000320
3,POINT (-8.574678 41.151951),POINT (-8.607996 41.142915),1372636854620000520
4,POINT (-8.645994 41.18049),POINT (-8.687268 41.178087),1372637091620000337


In [21]:
df_por.size

5022447

In [22]:
fdf_por = flow_from_gdf_in_tess(df_por, tess_por)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +init=epsg:4326 +type=crs
Right CRS: EPSG:4326

  


res_merge_source
   index_left tile_ID                   id                   geometry
0       170.0     170  1372636858620000589  POINT (-8.61864 41.14141)
1       114.0     114  1372637303620000596  POINT (-8.63985 41.15983)
2       184.0     184  1372636951620000320  POINT (-8.61296 41.14036)
3       298.0     298  1372636854620000520  POINT (-8.57468 41.15195)
4       108.0     108  1372637091620000337  POINT (-8.64599 41.18049)


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +init=epsg:4326 +type=crs
Right CRS: EPSG:4326



res_merge_dest
   index_left tile_ID                   id                   geometry
0       136.0     136  1372636858620000589  POINT (-8.63084 41.15449)
1        47.0      47  1372637303620000596  POINT (-8.66574 41.17067)
2       170.0     170  1372636951620000320  POINT (-8.61597 41.14053)
3       199.0     199  1372636854620000520  POINT (-8.60800 41.14292)
4       300.0     300  1372636965620000231  POINT (-8.57822 41.16072)
fdf
  tile_ID_x tile_ID_y
0       136       170
1        47       114
2       170       184
3       199       298
4       300       170
  origin destination  flow
0      0           1     1
1      0         114     5
2      0         134     1
3      0         136     1
4      0         149     1


In [23]:
fdf_por.head()

Unnamed: 0,origin,destination,flow
0,0,1,1
1,0,114,5
2,0,134,1
3,0,136,1
4,0,149,1


In [24]:
fdf_por.to_csv("fdf_por_2.csv")

In [25]:
fdf_por_np = to_Adj_Matrix(fdf_por)

fdf_por_np

array([[ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0., 54.,  7., ...,  0.,  0.,  0.],
       [ 0.,  3.,  8., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [26]:
fdf_por_np[0,114]

5.0