In [1]:
import skmob
from skmob.utils.plot import plot_gdf
from skmob.tessellation import tilers
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)

In [2]:
def toAdj(fdf, tess):
    """Turn a flow table into a square origin/destination matrix.

    Parameters
    ----------
    fdf : DataFrame-like
        Table with ``origin``, ``destination`` and ``flow`` columns; one row
        per (origin, destination) pair.
    tess : DataFrame-like
        Table with a ``tile_ID`` column. Its values, in order, become both
        the row labels and the column labels of the result.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by origin tile (rows) and destination tile (columns),
        labelled with the string form of each tile ID. A cell holds the flow
        for that pair, or NaN when ``fdf`` has no row for it.
    """
    # Origins/destinations are looked up by their string form below, so the
    # labels must be strings as well. With non-string tile IDs the lookups
    # would never match and `.at` would append new rows/columns instead of
    # filling the existing ones, producing a matrix of the wrong size.
    names = [str(tile_id) for tile_id in tess.tile_ID.values]
    adj = pd.DataFrame(columns=names, index=names)
    for row in fdf.itertuples(index=False):
        adj.at[str(row.origin), str(row.destination)] = row.flow
    return adj

In [3]:
def filter_tessellation_land(tessellation, shape_file_land):
    """Split tessellation tiles by whether they intersect the land shapes.

    Parameters
    ----------
    tessellation : geopandas.GeoDataFrame
        Tiles with at least ``tile_ID`` and ``geometry`` columns.
    shape_file_land : geopandas.GeoDataFrame
        Land polygons the tiles are tested against.

    Returns
    -------
    dict
        ``{"land": GeoDataFrame, "water": GeoDataFrame}``, each holding the
        ``tile_ID`` and ``geometry`` columns. "land" contains the tiles that
        intersect at least one land shape, "water" the remaining tiles.
    """
    # 'intersects' is sjoin's default predicate. Relying on the default keeps
    # the call working across geopandas versions, where the keyword was
    # renamed from `op` to `predicate`.
    tiles_in_land = gpd.sjoin(tessellation, shape_file_land, how='left')
    # A tile touching several land shapes appears once per match; keep one row per tile.
    tiles_in_land = tiles_in_land.groupby(['tile_ID'], sort=False, as_index=False).first()

    # Classify on the join result itself. A plain dropna() would also discard
    # matched tiles whose joined attribute columns contain a null, leaving
    # them in neither "land" nor "water".
    matched = tiles_in_land['index_right'].notnull()
    land = tiles_in_land[matched][['tile_ID', 'geometry']]
    water = tiles_in_land[~matched][['tile_ID', 'geometry']]

    # The output geometries are the tessellation's own (left join), so reuse
    # its CRS; fall back to WGS84 only when the tessellation carries none.
    crs = getattr(tessellation, 'crs', None)
    if crs is None:
        crs = {'init': 'epsg:4326'}
    land = gpd.GeoDataFrame(land, crs=crs, geometry='geometry')
    water = gpd.GeoDataFrame(water, crs=crs, geometry='geometry')
    return {"land": land, "water": water}

In [4]:
%%time
for i in range (1,13):
    print(i)
    for year in range (2018,2020):
        df = pd.read_csv('./data/BikeNYC/' +str(year) + "%.2d" % i +'-citibike-tripdata.csv')
        df['start_date'] =  pd.to_datetime(df['starttime']).dt.strftime('%Y-%m-%d')
        df['stop_date'] =  pd.to_datetime(df['stoptime']).dt.strftime('%Y-%m-%d')
        df = df[df['start_date'] == df['stop_date']]

        df['date'] = df['start_date']
        df = df[['date','start station latitude', 'start station longitude', 'end station latitude', 'end station longitude']]

        df_start = df[['date', 'start station latitude', 'start station longitude']]
        df_start['ind'] = df_start.index
        df_start['lat'] = df['start station latitude']
        df_start['lon'] = df['start station longitude']
        df_start = df_start[['date', 'lat', 'lon', 'ind']]


        df_end =  df[['date', 'end station latitude', 'end station longitude']]
        df_end['ind'] = df_end.index
        df_end['lat'] = df['end station latitude']
        df_end['lon'] = df['end station longitude']
        df_end = df_end[['date', 'lat', 'lon', 'ind']]

        result = pd.concat([df_start, df_end])
        result = result.sort_values(by=['ind'])

        data = result.groupby('date')

        for row,group in data:
            p = os.path.join('./Filtered/BikeNYC', "{}.csv".format(row))
            group.to_csv(p, index=False)

1
2
3
4
5
6
7
8
9
10
11
12
Wall time: 22min 18s


# From TrajDataFrame to FlowDataFrame to Adjacency Matrix #

In [5]:
# Squared tessellation over New York City, built with meters=1840.
meters = 1840
tesselletion = tilers.tiler.get(
    'squared',
    meters=meters,
    base_shape='New York City',
)

shape_file_land = gpd.read_file('NYC_shapeNoWaterArea.geojson')

# Keep only the feature at position 4 of the land file.
# NOTE(review): presumably Manhattan, judging by the `_MAN` suffix; confirm against the GeoJSON.
shape_file_land_MAN = shape_file_land.iloc[[4]]

# Retain the tiles that intersect the selected land shape.
res_inter = filter_tessellation_land(tesselletion, shape_file_land_MAN)
tess_nyc = res_inter['land']

In [6]:
# Visual check of the land-only tessellation.
plot_gdf(
    tess_nyc,
    style_func_args={
        'fillColor': 'gray',
        'color': 'black',
        'opacity': 0.2,
    },
    zoom=9,
)

# Reshaping #

In [None]:
%%time
directory = './Filtered/BikeNYC/'
i = 1
l = []

for filename in os.listdir(directory):
    
    base= os.path.basename(filename)
    name = os.path.splitext(base)[0]
    print(i)
    
    df = pd.read_csv(directory+filename)
    tdf = skmob.TrajDataFrame(df, latitude='lat', longitude='lon', 
                               user_id='ind', datetime='date')
    fdf = tdf.to_flowdataframe(tess_nyc, self_loops=True)

    adj = toAdj(fdf, tess_nyc)
    adj = adj.fillna(0)
    arr = adj.to_numpy()

    np.save("../adj/BikeNYC/" +name +'.npy', arr)    
    i += 1
