In [1]:
import fnmatch
import geopandas as gpd
import numpy as np
import pandas as pd
import skmob
from shapely.geometry import Point
from skmob.tessellation import tilers
from utils.config import Config
from zipfile import ZipFile

In [2]:
dataset_file = "data/BikeNYC/BikeNYC.zip"

## Reading the whole NYC Dataset

In [3]:
if dataset_file.endswith('.zip'):
    with ZipFile(dataset_file) as zipfiles:
        file_list = zipfiles.namelist()
        
        #get only the csv files
        csv_files = fnmatch.filter(file_list, "*.csv")
        
        #iterate with a list comprehension to get the individual dataframes
        data = [pd.read_csv(zipfiles.open(file_name)) for file_name in csv_files]
        df = pd.concat(data)
else:
    df = pd.read_csv(dataset_file, sep=',')

In [4]:
df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,558,2014-04-01 00:00:07,2014-04-01 00:09:25,82,St James Pl & Pearl St,40.711174,-74.000165,2008,Little West St & 1 Pl,40.705693,-74.016777,21062,Subscriber,1982,1
1,882,2014-04-01 00:00:20,2014-04-01 00:15:02,349,Rivington St & Ridge St,40.718502,-73.983299,312,Allen St & E Houston St,40.722055,-73.989111,20229,Subscriber,1988,1
2,587,2014-04-01 00:00:25,2014-04-01 00:10:12,293,Lafayette St & E 8 St,40.730287,-73.990765,334,W 20 St & 7 Ave,40.742388,-73.997262,20922,Subscriber,1959,1
3,355,2014-04-01 00:00:44,2014-04-01 00:06:39,539,Metropolitan Ave & Bedford Ave,40.715348,-73.960241,282,Kent Ave & S 11 St,40.708273,-73.968341,20914,Subscriber,1981,1
4,524,2014-04-01 00:01:29,2014-04-01 00:10:13,459,W 20 St & 11 Ave,40.746745,-74.007756,503,E 20 St & Park Ave,40.738274,-73.98752,21051,Subscriber,1964,1


In [5]:
len(df)

5359995

## Selecting relevant features

We can observe that a bike ending its route in a place, it will start the next journey in the same place. So it is not necessary to duplicate the record. Moreover we can drop the end station data.

In [6]:
df[df['bikeid']==21062]

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,558,2014-04-01 00:00:07,2014-04-01 00:09:25,82,St James Pl & Pearl St,40.711174,-74.000165,2008,Little West St & 1 Pl,40.705693,-74.016777,21062,Subscriber,1982,1
1625,398,2014-04-01 07:41:30,2014-04-01 07:48:08,2008,Little West St & 1 Pl,40.705693,-74.016777,224,Spruce St & Nassau St,40.711464,-74.005524,21062,Subscriber,1995,1
4167,228,2014-04-01 08:54:32,2014-04-01 08:58:20,224,Spruce St & Nassau St,40.711464,-74.005524,360,William St & Pine St,40.707179,-74.008873,21062,Subscriber,1967,1
34357,440,2014-04-02 15:25:04,2014-04-02 15:32:24,306,Cliff St & Fulton St,40.708235,-74.005301,147,Greenwich St & Warren St,40.715422,-74.011220,21062,Subscriber,1973,1
37430,955,2014-04-02 17:22:38,2014-04-02 17:38:33,147,Greenwich St & Warren St,40.715422,-74.011220,463,9 Ave & W 16 St,40.742065,-74.004432,21062,Subscriber,1990,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437355,1992,9/14/2014 18:28:55,9/14/2014 19:02:07,420,Clermont Ave & Lafayette Ave,40.687645,-73.969689,217,Old Fulton St,40.702772,-73.993836,21062,Customer,,0
438730,1568,9/14/2014 19:04:35,9/14/2014 19:30:43,217,Old Fulton St,40.702772,-73.993836,387,Centre St & Chambers St,40.712733,-74.004607,21062,Customer,,0
591390,383,9/19/2014 12:17:16,9/19/2014 12:23:39,307,Canal St & Rutgers St,40.714275,-73.989900,307,Canal St & Rutgers St,40.714275,-73.989900,21062,Subscriber,1988,1
591858,399,9/19/2014 12:34:38,9/19/2014 12:41:17,307,Canal St & Rutgers St,40.714275,-73.989900,295,Pike St & E Broadway,40.714067,-73.992939,21062,Subscriber,1997,1


In [7]:
relevant_features = ['starttime', 'start station latitude', 'start station longitude', 'bikeid']

In [8]:
df = df[relevant_features]

## Transforming to a Trajectory DataFrame

In [47]:
tile_size = 1500
sample_time = "60min"

In [48]:
tessellation = tilers.tiler.get("squared", base_shape="New York City, USA", meters=tile_size)

In [11]:
tdf = skmob.TrajDataFrame(df, latitude='start station latitude', longitude='start station longitude', datetime='starttime', user_id='bikeid')

In [12]:
tdf.head()

Unnamed: 0,datetime,lat,lng,uid
0,2014-04-01 00:00:07,40.711174,-74.000165,21062
1,2014-04-01 00:00:20,40.718502,-73.983299,20229
2,2014-04-01 00:00:25,40.730287,-73.990765,20922
3,2014-04-01 00:00:44,40.715348,-73.960241,20914
4,2014-04-01 00:01:29,40.746745,-74.007756,21051


In [13]:
tdf_mapped = tdf.mapping(tessellation)

In [14]:
tdf_mapped.head()

Unnamed: 0,datetime,lat,lng,uid,tile_ID
0,2014-04-01 00:00:07,40.711174,-74.000165,21062,12
0,2014-04-01 00:00:07,40.711174,-74.000165,21062,21
0,2014-04-01 00:00:07,40.711174,-74.000165,21062,7
0,2014-04-01 00:00:07,40.711174,-74.000165,21062,31
0,2014-04-01 00:00:07,40.711174,-74.000165,21062,15


In [38]:
fdf = tdf.to_flowdataframe(tessellation=tessellation)

In [49]:
fdf.head()

Unnamed: 0,origin,destination,flow
0,11,11,1877
1,11,12,5823
2,11,13,10927
3,11,14,13701
4,11,15,17447


In [46]:
m = fdf.plot_tessellation()
fdf.plot_flows(flow_color='red', map_f=m)

# Try using as base_shape New York City

In [50]:
tessellation = tilers.tiler.get("squared", base_shape="New York City, USA", meters=tile_size)

In [51]:
fdf = tdf.to_flowdataframe(tessellation=tessellation)

In [52]:
m = fdf.plot_tessellation()
fdf.plot_flows(flow_color='red', map_f=m)