In [6]:
import pandas as pd
import geopandas as gpd

# Load bixi stations
bixi_stations_gdf = gpd.read_file('../data//curated/bixi-stations.json')

# Filter stations that are within the city area
bixi_stations_gdf = bixi_stations_gdf[bixi_stations_gdf['WITHIN_CITY_AREA']]

# Load bixi data
bixi = pd.read_csv('../data/raw/bixi/bixi-2024.csv')

# Filter bixi data to only include stations that are within the city area
bixi = bixi[bixi['STARTSTATIONNAME'].isin(bixi_stations_gdf['STATIONNAME'])]
bixi = bixi[bixi['ENDSTATIONNAME'].isin(bixi_stations_gdf['STATIONNAME'])]

In [9]:
# Transform STARTTIMEMS	and ENDTIMEMS into datetime
bixi['STARTTIMEMS'] = pd.to_datetime(bixi['STARTTIMEMS'], unit='ms')
bixi['ENDTIMEMS'] = pd.to_datetime(bixi['ENDTIMEMS'], unit='ms')

bixi[['STARTTIMEMS', 'ENDTIMEMS']].head()

# Get hour of day
bixi['STARTHOUR'] = bixi['STARTTIMEMS'].dt.hour
bixi['ENDHOUR'] = bixi['ENDTIMEMS'].dt.hour

bixi[['STARTHOUR', 'ENDHOUR']].head()

Unnamed: 0,STARTHOUR,ENDHOUR
0,21,21
1,22,22
2,23,23
3,21,22
4,1,1


In [10]:
# Group bixi data by 10 minutes intervals
bixi['STARTTIME10'] = bixi['STARTTIMEMS'].dt.floor('10min').dt.time
bixi['ENDTIME10'] = bixi['ENDTIMEMS'].dt.floor('10min').dt.time

bixi[['STARTTIME10', 'ENDTIME10']].head()

Unnamed: 0,STARTTIME10,ENDTIME10
0,21:20:00,21:30:00
1,22:00:00,22:00:00
2,23:20:00,23:40:00
3,21:50:00,22:00:00
4,01:00:00,01:10:00


In [17]:
# Group by station name and start time
# Retrieve unique start locations and keep first combination of coordinates
stations = bixi[['STARTSTATIONNAME']].drop_duplicates().rename(columns={'STARTSTATIONNAME': 'STATIONNAME'})

# generate 10 minute interval between 0 and 24 hours
time_intervals = pd.date_range('2024-01-01', periods=144, freq='10min').time

# Create a dataframe with all possible combinations of stations and time intervals
stations_10min = pd.DataFrame()
for time in time_intervals:
    stations['TIME'] = time
    stations_10min = pd.concat([stations_10min, stations])

# Sort by station name and time
stations_10min = stations_10min.sort_values(['STATIONNAME', 'TIME']).reset_index(drop=True)

stations_10min.head()

Unnamed: 0,STATIONNAME,TIME
0,10e avenue / Masson,00:00:00
1,10e avenue / Masson,00:10:00
2,10e avenue / Masson,00:20:00
3,10e avenue / Masson,00:30:00
4,10e avenue / Masson,00:40:00


In [15]:
# Add station coordinates to the dataframe
stations_10min = stations_10min.merge(bixi_stations_gdf[['STATIONNAME', 'STATIONLATITUDE', 'STATIONLONGITUDE', 'geometry']], on='STATIONNAME', how='left')

stations_10min.head()

Unnamed: 0,STATIONNAME,TIME,STATIONLATITUDE,STATIONLONGITUDE,geometry
0,10e avenue / Masson,00:00:00,45.550377,-73.573734,POINT (-8190190.557 5708587.191)
1,10e avenue / Masson,00:10:00,45.550377,-73.573734,POINT (-8190190.557 5708587.191)
2,10e avenue / Masson,00:20:00,45.550377,-73.573734,POINT (-8190190.557 5708587.191)
3,10e avenue / Masson,00:30:00,45.550377,-73.573734,POINT (-8190190.557 5708587.191)
4,10e avenue / Masson,00:40:00,45.550377,-73.573734,POINT (-8190190.557 5708587.191)


In [18]:
# Add start and end counts to the stations_10min dataframe
start_counts = bixi.groupby(['STARTSTATIONNAME', 'STARTTIME10']).size().reset_index(name='STARTCOUNT')
end_counts = bixi.groupby(['ENDSTATIONNAME', 'ENDTIME10']).size().reset_index(name='ENDCOUNT')

stations_10min = stations_10min.merge(start_counts, left_on=['STATIONNAME', 'TIME'], right_on=['STARTSTATIONNAME', 'STARTTIME10'], how='left')
stations_10min = stations_10min.merge(end_counts, left_on=['STATIONNAME', 'TIME'], right_on=['ENDSTATIONNAME', 'ENDTIME10'], how='left')

# Fill NaN values with 0
stations_10min = stations_10min.fillna(0)

# Delete rows with invalid station names
stations_10min = stations_10min[stations_10min['STATIONNAME'] != 0]

stations_10min.head()

Unnamed: 0,STATIONNAME,TIME,STARTSTATIONNAME,STARTTIME10,STARTCOUNT,ENDSTATIONNAME,ENDTIME10,ENDCOUNT
0,10e avenue / Masson,00:00:00,10e avenue / Masson,00:00:00,259.0,10e avenue / Masson,00:00:00,289.0
1,10e avenue / Masson,00:10:00,10e avenue / Masson,00:10:00,255.0,10e avenue / Masson,00:10:00,257.0
2,10e avenue / Masson,00:20:00,10e avenue / Masson,00:20:00,218.0,10e avenue / Masson,00:20:00,231.0
3,10e avenue / Masson,00:30:00,10e avenue / Masson,00:30:00,241.0,10e avenue / Masson,00:30:00,223.0
4,10e avenue / Masson,00:40:00,10e avenue / Masson,00:40:00,213.0,10e avenue / Masson,00:40:00,210.0


In [19]:
# Keep relevant columns and add a geometry column
stations_10min = stations_10min[['STATIONNAME', 'LATITUDE', 'LONGITUDE', 'TIME', 'STARTCOUNT', 'ENDCOUNT']]
stations_10min = gpd.GeoDataFrame(
    stations_10min,
    geometry=gpd.points_from_xy(stations_10min['LONGITUDE'], stations_10min['LATITUDE']),
    crs='EPSG:4326'
)

stations_10min.head()

KeyError: "['LATITUDE', 'LONGITUDE'] not in index"