In [1]:
import skmob
from skmob.utils.plot import plot_gdf
from skmob.tessellation import tilers
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)

In [2]:
def toAdj(fdf, tess):
    """Build a square origin/destination matrix from a flow table.

    Parameters
    ----------
    fdf : DataFrame-like
        Flow table exposing ``origin``, ``destination`` and ``flow`` columns,
        one row per origin/destination pair. If a pair occurs more than once,
        the last row wins.
    tess : DataFrame-like
        Tessellation exposing a ``tile_ID`` column. Its values, in order,
        define both the row and the column labels of the matrix.

    Returns
    -------
    pandas.DataFrame
        Rows are origin tiles, columns are destination tiles, both labelled
        with the string form of the tile IDs. Cells with no matching flow row
        are left as NaN.
    """
    # Flows are written with str(...) keys below, so the labels have to be
    # strings too. With non-string tile IDs `.at` would not find the label and
    # would silently append new rows/columns, producing a non-square matrix.
    names = [str(tile_id) for tile_id in tess.tile_ID.values]
    adj = pd.DataFrame(columns=names, index=names)
    for row in fdf.itertuples():
        adj.at[str(row.origin), str(row.destination)] = row.flow
    return adj

In [3]:
def filter_tessellation_land(tessellation, shape_file_land):
    """Split a tessellation into tiles that intersect land and tiles that do not.

    Parameters
    ----------
    tessellation : geopandas.GeoDataFrame
        Tiles with at least a ``tile_ID`` and a ``geometry`` column.
    shape_file_land : geopandas.GeoDataFrame
        Polygons describing the land area the tiles are tested against.

    Returns
    -------
    dict
        ``{"land": GeoDataFrame, "water": GeoDataFrame}``; each frame holds
        the ``tile_ID`` and ``geometry`` columns. Every tile ends up in
        exactly one of the two frames.
    """
    # Left join keeps every tile; a tile that intersects no land polygon gets
    # a null `index_right`.
    # NOTE(review): newer geopandas releases deprecate `op` in favour of
    # `predicate`; switch the keyword when upgrading geopandas.
    tiles_in_land = gpd.sjoin(tessellation, shape_file_land, how='left', op='intersects')
    # A tile intersecting several polygons appears once per match: keep one row per tile.
    tiles_in_land = tiles_in_land.groupby(['tile_ID'], sort=False, as_index=False).first()

    # Classify on the join result only. A blanket dropna() would also discard
    # land tiles whose matched polygon merely has an empty attribute, leaving
    # them in neither output.
    on_land = tiles_in_land['index_right'].notnull()
    land = tiles_in_land[on_land][['tile_ID', 'geometry']]
    water = tiles_in_land[~on_land][['tile_ID', 'geometry']]

    crs = {'init': 'epsg:4326'}
    land = gpd.GeoDataFrame(land, crs=crs, geometry='geometry')
    water = gpd.GeoDataFrame(water, crs=crs, geometry='geometry')
    return {"land": land, "water": water}

In [7]:
# Load the taxi-zone polygons, keep the Manhattan zones and attach the
# longitude/latitude of each zone centroid.
fp = "./taxi_zones/taxi_zones.shp"
shape = gpd.read_file(fp).to_crs("EPSG:4326")

# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the full frame (chained assignment).
shape = shape[shape['borough'] == 'Manhattan'].copy()

# NOTE(review): centroids are computed on lon/lat degrees, which geopandas
# flags as approximate; reproject to a projected CRS first if exact centroids
# matter.
shape['centroid'] = shape['geometry'].centroid
shape['lon'] = shape['centroid'].x
shape['lat'] = shape['centroid'].y
shape.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry,centroid,lon,lat
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725...",POINT (-73.97697 40.72375),-73.976968,40.723752
11,12,0.036661,4.2e-05,Battery Park,12,Manhattan,"POLYGON ((-74.01566 40.70483, -74.01540 40.704...",POINT (-74.01556 40.70295),-74.015564,40.702946
12,13,0.050281,0.000149,Battery Park City,13,Manhattan,"POLYGON ((-74.01244 40.71906, -74.01282 40.717...",POINT (-74.01608 40.71204),-74.016079,40.712038
23,24,0.047,6.1e-05,Bloomingdale,24,Manhattan,"POLYGON ((-73.95954 40.79872, -73.96004 40.798...",POINT (-73.96548 40.80197),-73.96548,40.80197
40,41,0.052793,0.000143,Central Harlem,41,Manhattan,"POLYGON ((-73.94774 40.80960, -73.94506 40.808...",POINT (-73.95129 40.80433),-73.951292,40.804334


In [8]:
# Lookup tables: zone OBJECTID -> centroid longitude / latitude, plus the list
# of zone IDs that are accepted when filtering trips.
d_lon = shape.set_index('OBJECTID')['lon'].to_dict()
d_lat = shape.set_index('OBJECTID')['lat'].to_dict()
goodID = shape['OBJECTID'].tolist()

In [9]:
%%time
for i in range (1,13):
    print(i)
    for year in range (2018,2020):
        
        df = pd.read_csv('data/TaxiNYC/yellow_tripdata_' +str(year) + "-%.2d" % i +'.csv')        
        
        print("letto")
        
        df = df.query('PULocationID in @goodID')
        df = df.query('DOLocationID in @goodID')
        
        df = df[['VendorID', 'trip_distance', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID']]
        df = df[df['trip_distance'] >=1]
        
        df['start_date'] =  pd.to_datetime(df['tpep_pickup_datetime']).dt.strftime('%Y-%m-%d')
        df['stop_date']  =  pd.to_datetime(df['tpep_dropoff_datetime']).dt.strftime('%Y-%m-%d')
        df = df[df['start_date'] == df['stop_date']]
        
        df['date'] = df['start_date']
        df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
        df = df[pd.to_datetime(df['date']).dt.year.isin([year])& 
                pd.to_datetime(df['date']).dt.month.isin([i]) ]
        print("checked")
        
    
        df['start_lat'] = df['PULocationID'].map(d_lat) 
        df['start_lon'] = df['PULocationID'].map(d_lon) 

        df['end_lat'] = df['DOLocationID'].map(d_lat) 
        df['end_lon'] = df['DOLocationID'].map(d_lon) 

        df = df[['date','start_lat', 'start_lon', 'end_lat', 'end_lon']]
        df.dropna(subset = ["start_lat", 'start_lon', 'end_lat', 'end_lon'], inplace=True)


        df_start = df[['date', 'start_lat', 'start_lon']]
        df_start['ind'] = df_start.index
        df_start['lat'] = df['start_lat']
        df_start['lon'] = df['start_lon']
        df_start = df_start[['date', 'lat', 'lon', 'ind']]

        df_end =  df[['date', 'end_lat', 'end_lon']]
        df_end['ind'] = df_end.index
        df_end['lat'] = df['end_lat']
        df_end['lon'] = df['end_lon']
        df_end = df_end[['date', 'lat', 'lon', 'ind']]
        
        result = pd.concat([df_start, df_end])
        result = result.sort_values(by=['ind'])

        data = result.groupby('date')

        for row,group in data:
            p = os.path.join('./Filtered/TaxiNYC', "{}.csv".format(row))
            group.to_csv(p, index=False)


1
letto
checked
letto
checked
2
letto
checked
letto
checked
3
letto
checked
letto
checked
4
letto
checked
letto
checked
5
letto
checked
letto
checked
6
letto
checked
letto
checked
7
letto
checked
letto
checked
8
letto
checked
letto
checked
9
letto
checked
letto
checked
10
letto
checked
letto
checked
11
letto
checked
letto
checked
12
letto
checked
letto
checked
Wall time: 1h 48min 21s


# From TrajDataFrame to FlowDataFrame to Adjacency Matrix #

In [4]:
# Squared tessellation over New York City; `meters` is handed to the tiler
# (presumably the side length of a tile - confirm in the skmob docs).
meters = 1840
tesselletion = tilers.tiler.get(
    "squared",
    meters=meters,
    base_shape="New York City",
)

# Only the polygon at position 4 is used for the land filter.
# NOTE(review): presumably the Manhattan polygon - confirm against the geojson.
shape_file_land = gpd.read_file("NYC_shapeNoWaterArea.geojson")
shape_file_land_MAN = shape_file_land.iloc[[4]]

# Keep the tiles that intersect the selected land polygon.
res_inter = filter_tessellation_land(tesselletion, shape_file_land_MAN)
tess_nyc = res_inter['land']

In [5]:
# Draw the retained tiles; clicking a tile shows its tile_ID.
plot_gdf(
    tess_nyc,
    style_func_args={'fillColor': 'gray', 'color': 'black', 'opacity': 0.2},
    zoom=12,
    popup_features=['tile_ID'],
)

# Reshaping #

In [None]:
%%time
directory = './Filtered/TaxiNYC/'
i = 1
l = []

for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        print(filename)
        base= os.path.basename(filename)
        name = os.path.splitext(base)[0]
        print(i)

        df = pd.read_csv(directory+filename)
        tdf = skmob.TrajDataFrame(df, latitude='lat', longitude='lon', 
                                   user_id='ind', datetime='date')
        fdf = tdf.to_flowdataframe(tess_nyc, self_loops=True)

        adj = toAdj(fdf, tess_nyc)
        adj = adj.fillna(0)
        arr = adj.to_numpy()

        np.save("../adj/TaxiNYC/" +name +'.npy', arr)    
        i += 1
