In [3]:
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt

## Load in 2023 and 2024 Data


In [4]:
data_23 = pd.read_csv("./Data/DonneesOuvertes2023.csv", sep=",")

In [5]:
data_24 = pd.read_csv("./Data/DonneesOuvertes2024.csv", sep=",")

In [6]:
data_23.head()

Unnamed: 0,STARTSTATIONNAME,STARTSTATIONARRONDISSEMENT,STARTSTATIONLATITUDE,STARTSTATIONLONGITUDE,ENDSTATIONNAME,ENDSTATIONARRONDISSEMENT,ENDSTATIONLATITUDE,ENDSTATIONLONGITUDE,STARTTIMEMS,ENDTIMEMS
0,Métro Mont-Royal (Utilités publiques / Rivard),Le Plateau-Mont-Royal,45.524236,-73.581552,Chabot / Mont-Royal,Le Plateau-Mont-Royal,45.534134,-73.573524,1698266696468,1698267000000.0
1,Rielle / Wellington,Verdun,45.460156,-73.567001,St-Jacques / McGill,Ville-Marie,45.501441,-73.560144,1698270535502,1698272000000.0
2,Ste-Catherine / Drummond,Ville-Marie,45.498588,-73.574278,Peel / Ottawa,Le Sud-Ouest,45.4942,-73.559985,1698285705974,1698286000000.0
3,Boyer / du Mont-Royal,Le Plateau-Mont-Royal,45.527432,-73.579917,de l'Hôtel-de-Ville / Rachel,Le Plateau-Mont-Royal,45.519897,-73.580106,1698269489932,1698270000000.0
4,Clark / Ontario,Ville-Marie,45.510625,-73.566903,Wolfe / Robin,Ville-Marie,45.519581,-73.560116,1698270006571,1698270000000.0


In [33]:
data_24.head()

Unnamed: 0,STARTSTATIONNAME,STARTSTATIONARRONDISSEMENT,STARTSTATIONLATITUDE,STARTSTATIONLONGITUDE,ENDSTATIONNAME,ENDSTATIONARRONDISSEMENT,ENDSTATIONLATITUDE,ENDSTATIONLONGITUDE,STARTTIMEMS,ENDTIMEMS
0,Lajeunesse / Villeray,Villeray - Saint-Michel - Parc-Extension,45.542119,-73.622547,Lajeunesse / Villeray,Villeray - Saint-Michel - Parc-Extension,45.542119,-73.622547,1709250944343,1709251000000.0
1,Marché Jean-Talon (Casgrain / Shamrock),Rosemont - La Petite-Patrie,45.53519,-73.615482,6e avenue / Villeray,Villeray - Saint-Michel - Parc-Extension,45.55874,-73.60589,1709244998673,1709249000000.0
2,Square Viger (Berri / Viger),Ville-Marie,45.512212,-73.554629,BAnQ (Berri),Ville-Marie,45.516014,-73.56292,1709251175931,1709252000000.0
3,Métro Laurier (Berri / St-Joseph),Le Plateau-Mont-Royal,45.527231,-73.586726,Berri / Rachel,Le Plateau-Mont-Royal,45.522719,-73.577204,1709226336066,1709227000000.0
4,St-Grégoire / Christophe-Colomb,Le Plateau-Mont-Royal,45.532597,-73.590111,St-Hubert / du Mont-Royal,Le Plateau-Mont-Royal,45.525711,-73.581284,1709220412969,1709221000000.0


# The 2023-2024 Winter Cycle

The BIXI Winter Season runs from November 16 until docks are brought out again sometime in the spring. In the past, this has been mid-April - May 1. This cycle, the docks were brought out the earliest they have ever been due to a rather mild winter. The Winter cycle ran from November 16, 2023 (1700092800000 in Unix time) - April 2, 2024.

At this moment in time, the only public riding data available is up until the end of Feburary 2024. The only data that exists in March is membership data (monthly purchases and one-way/occassionel trips). This data may be incorporated later into the study.

For simplicities sake, we will be making use of the 2024 data that exists up until this point and merging it with the 2023 data that exists after November 16, 2023.

A continuous assumption that we will be making is the following: data used from the 2023 Season, until November 15, 2023, was used in the decision making process for the BIXI docks/stations chosen in the 2023-2024 Winter Pilot.


In [34]:
# sort 2023 data by time
data_23 = data_23.sort_values(by="STARTTIMEMS", ascending=True)

In [35]:
# getting the data November 16 onward from the 2023 data and combining it with 2024 data to get the full data for the winter season
winter_23 = data_23[data_23["STARTTIMEMS"] >= 1700092800000].reset_index(drop=True)
winter_data = pd.concat([winter_23, data_24.reset_index(drop=True)])
winter_data.reset_index(drop=True)

# filter out only start-station related columns
winter_data = winter_data[
    [
        "STARTSTATIONNAME",
        "STARTSTATIONARRONDISSEMENT",
        "STARTSTATIONLATITUDE",
        "STARTSTATIONLONGITUDE",
    ]
]

## Combining unlike data

BIXI renames their docks from season to season. Based on the 2024 dock maps, which is how I initially gathered the necessary data through matching the green dots with their counterparts in the app, the data I have is different from the data in the 2023 station name columns. Their lat/lon coordinates probably differ as well. I will now rename the 2023 columns from the winter data to match the names in the 2024 data.


In [36]:
map_docks = pd.read_csv("./Data/StationsWinter2024RealFromMap.csv")
diff = []
# convert data to list to compare properly, had issues comparing df columns to each other
for station_name in winter_23["STARTSTATIONNAME"].to_list():
    if (
        station_name not in map_docks["STATION_NAME"].to_list()
        and station_name not in diff
    ):
        diff.append(station_name)
diff_df = pd.DataFrame(diff)
diff_df = diff_df.dropna()
diff = diff_df[0].to_list()
diff_df.to_csv("./Data/dock_name_differences.csv")

## Misleading data

Filtering the station names in the 2023 data reveals my main concern: that dock names have their cross streets mixed. If this were the only case, it would be an easy fix.

However, this data filtering revealed something that I had not expected to find in the data, but that I experienced myself on an occasionnel ride I took around the end of September from Jeanne-Mance / Laurier to Aylmer / Sherbrooke.

BIXI states the following in their blog post promoting the year long pilot
"Stations outside this pilot zone will no longer be available after 11:59pm on November 15. However, to the delight of BIXISTS, some stations on the outskirts of the pilot zone may remain available longer than usual, until they are withdrawn."

In my experience, the stations I used were well within the pilot limits. To ensure the data is as accurate as possible, I will filter out the rides that **do not** match with the stations in the 2024 winter/year long map.

Later on, I will swap the station names and coordinates so that the data can be as cohesive as possible.


In [37]:
# separate station names into tuples
# the stations here are the differences between the latter 2023 data and current 2024 data
station_streets_separated = []
for idx, station_name in enumerate(diff):
    if station_name in [
        "Lapierre",
        "Casino de Montréal",
        "Parc de Dieppe",
        "Place du Commerce",
        "Métro Longueuil - Université de Sherbrooke",
    ]:
        station_streets_separated.append([station_name, " "])
    elif "(" in station_name:
        if "sud" in station_name or "Dutrisac" in station_name:
            if "/" in station_name:
                sub_1, sub_2 = station_name.split(sep="/")
                station_streets_separated.append([sub_1.strip(), sub_2.strip()])
                continue
            sub = station_name[0 : station_name.find("(")]
            sub = sub.strip()
            station_streets_separated.append([sub, " "])
            continue
        sub = station_name[station_name.find("(") + 1 : station_name.rfind(")")]
        sub = sub.strip()
        if sub in [
            "Lapierre",
            "Chemin du Chenal le Moyne",
            "Parc de Dieppe",
            "Léo-Lacombe",
            "Parc St-Laurent",
        ]:
            station_streets_separated.append([sub, " "])
        else:
            sub_1, sub_2 = sub.split(sep="/")
            sub_1 = sub_1.strip()
            sub_2 = sub_2.strip()
            station_streets_separated.append([sub_1, sub_2])
    else:
        station_streets_separated.append(station_name.split(sep="/"))

In [38]:
pd.DataFrame(station_streets_separated).to_csv("./Data/dock_name_differences.csv")

In [39]:
contains_street_names = []
for street_names in station_streets_separated:
    if len(street_names) == 3:
        street_1, street_2 = street_names[0].strip(), street_names[2].strip()
        mask = winter_data["STARTSTATIONNAME"].str.contains(
            street_1, na=False
        ) | winter_data["STARTSTATIONNAME"].str.contains(street_2, na=False)
        contains_street_names.append(mask)
    else:
        street_1 = street_names[0].strip()
        mask = winter_data["STARTSTATIONNAME"].str.contains(street_1, na=False)
        contains_street_names.append(mask)

In [40]:
filtered_winter_data_individualized = []
for mask in contains_street_names:
    winter_copy = winter_data.copy()
    masked = winter_copy[mask]
    filtered_winter_data_individualized.append(masked)

In [41]:
# apply filter to new winter data objects then merge all together
only_map_data = pd.concat(filtered_winter_data_individualized, axis=0)
only_map_data = only_map_data.drop_duplicates().reset_index()
# TODO get rid of this when finished checking
only_map_data.to_csv("./Data/dock_names_might_have_same_crossstreet.csv")

In [104]:
def split_station_names(station: str) -> list:
    if ("(" and "/" in station) and (station.find("(") < station.find("/")):
        left_par_idx = station.find("(")
        cross_streets = station[left_par_idx + 1 : len(station)]
        streets_split = cross_streets.split("/")
        street_1, street_2 = streets_split[0].strip(), streets_split[1].strip()
        if ")" in street_2:
            street_2 = street_2[: len(street_2) - 1]
        return [street_1, street_2]
    elif "/" in station:
        station_split = station.split("/")
        street_1, street_2 = station_split[0].strip(), station_split[1].strip()
        return [street_1, street_2]
    else:
        return [station]

In [116]:
map_docks_street_pairs = {}
for idx, station in map_docks.iterrows():
    streets = split_station_names(str(station["STATION_NAME"]))
    for street in streets:
        if street not in map_docks_street_pairs:
            map_docks_street_pairs[street] = []
        map_docks_street_pairs[street].append(streets)

In [125]:
def reorder_street_names(station: str, reordered_streets: list) -> str:
    if ("(" and "/" in station) and (station.find("(") < station.find("/")):
        left_par_idx = station.find("(")
        tmp = station[:left_par_idx]
        tmp += "("
        tmp += "/".join(reordered_streets)
        tmp += ")"
        return tmp
    elif "/" in station:
        return "/".join(reordered_streets)
    else:
        return station

In [126]:
# remove these stations from the data
# check if 2023 station cross-streets need to be swapped to match 2024 data
for idx, station in only_map_data.iterrows():
    streets_split = split_station_names(station["STARTSTATIONNAME"])

    if len(streets_split) == 2:
        # print("BEFORE:", streets_split)
        street_1, street_2 = streets_split
        if street_1 in map_docks_street_pairs:
            for street_pairs in map_docks_street_pairs[street_1]:
                if street_1 and street_2 in street_pairs:
                    if streets_split != street_pairs:
                        streets_split = [street_2, street_1]
                        # print("AFTER:", " / ".join(streets_split))
                        only_map_data.iat[idx, 0] = reorder_street_names(
                            station["STARTSTATIONNAME"], streets_split
                        )
        # elif street_2 in map_docks_street_pairs:
        #     if map_docks_street_pairs[street_2] != streets_split:
        #         streets_split = [street_1, street_2]
        #         print(" / ".join(streets_split))
only_map_data.to_csv("./Data/reordered")

ValueError: Must have equal len keys and value when setting with an iterable