This notebook was used to load the station data, modify some of its fields, add stations that appear only in the historical trip files, and write the result out as JSON in a format suitable for our MongoDB database.

In [None]:
import pandas as pd
import json

In [None]:
# Load the current station list; the first line of the file is skipped so
# that the following line is used as the header row.
df = pd.read_csv(
    "./bikes_data/station_data/current_bluebikes_stations.csv",
    skiprows=[0],
)
# Store the dock counts as integers
df = df.astype({"Total Docks": int})
df.info()

In [None]:
# Station records keyed by station name
stations_dict = {}

# Build one JSON-friendly record per row of the current-stations table
for _, row in df.iterrows():
    temp_name = row["NAME"]
    stations_dict[temp_name] = {
        "name": temp_name,
        # Stored as [longitude, latitude]
        "coordinates": [row["Long"], row["Lat"]],
        "municipality": row["Municipality"],
        # Cast to a built-in int: pandas can hand back a NumPy integer for
        # this column, and json.dump() rejects those with
        # "Object of type int64 is not JSON serializable".
        "total_docks": int(row["Total Docks"]),
        "seasonal_status": row["Seasonal Status"],
    }

In [None]:
def add_missing_stations(df, start_station_name, end_station_name, start_lat, start_lon, end_lat, end_lon, stations_dict):
    """Add stations found in a trip file that are not yet in ``stations_dict``.

    Every trip row names a start and an end station. Any station name that is
    not already a key of ``stations_dict`` is added with a minimal record
    (``name`` and ``coordinates``) taken from the first row in which it
    appears. Existing entries are never overwritten.

    Args:
        df: Trip DataFrame for one file.
        start_station_name: Column holding the start station name.
        end_station_name: Column holding the end station name.
        start_lat: Column holding the start station latitude.
        start_lon: Column holding the start station longitude.
        end_lat: Column holding the end station latitude.
        end_lon: Column holding the end station longitude.
        stations_dict: Mapping of station name -> station record. Updated in
            place.

    Returns:
        The same ``stations_dict`` object, for convenient reassignment.
    """

    def _register(name, lon, lat):
        # Rows without a station name cannot be keyed; skip them rather than
        # storing records under NaN keys.
        if pd.isna(name) or name in stations_dict:
            return
        # Coordinates are stored as [longitude, latitude]
        stations_dict[name] = {"name": name, "coordinates": [lon, lat]}

    # Walk the six relevant columns in lockstep; this keeps the original
    # row order (start before end within each row) without the cost of
    # building a Series per row.
    trips = zip(
        df[start_station_name], df[start_lon], df[start_lat],
        df[end_station_name], df[end_lon], df[end_lat],
    )
    for s_name, s_lon, s_lat, e_name, e_lon, e_lat in trips:
        _register(s_name, s_lon, s_lat)
        # The end station must be checked under its own name: stations that
        # only ever appear as a destination would otherwise be dropped.
        _register(e_name, e_lon, e_lat)

    return stations_dict


In [None]:
""" Reading in 2024 data """
# 2024 trip files: January through March
for month in range(1, 4):
    # File names carry a zero-padded, two-digit month
    idx = f"{month:02d}"
    # Load that month's trips
    df = pd.read_csv(f"./bikes_data/trips_data/2024{idx}-bluebikes-tripdata.csv")
    print(idx)
    # Fold any stations not yet recorded into stations_dict
    stations_dict = add_missing_stations(
        df=df,
        start_station_name="start_station_name",
        end_station_name="end_station_name",
        start_lat="start_lat",
        start_lon="start_lng",
        end_lat="end_lat",
        end_lon="end_lng",
        stations_dict=stations_dict,
    )


""" Reading in April - December 2023 data """
# 2023 trip files: April through December
for month in range(4, 13):
    # File names carry a zero-padded, two-digit month
    idx = f"{month:02d}"
    # Load that month's trips
    df = pd.read_csv(f"./bikes_data/trips_data/2023{idx}-bluebikes-tripdata.csv")
    print(idx)
    # Fold any stations not yet recorded into stations_dict
    stations_dict = add_missing_stations(
        df=df,
        start_station_name="start_station_name",
        end_station_name="end_station_name",
        start_lat="start_lat",
        start_lon="start_lng",
        end_lat="end_lat",
        end_lon="end_lng",
        stations_dict=stations_dict,
    )


In [None]:
""" Reading in station data back to 2019 """
# Column names used by the trip files up to and including March 2023
legacy_columns = {
    "start_station_name": "start station name",
    "end_station_name": "end station name",
    "start_lat": "start station latitude",
    "start_lon": "start station longitude",
    "end_lat": "end station latitude",
    "end_lon": "end station longitude",
}

for year in range(2019, 2024):
    for idx in range(1, 13):
        # Headers change starting in 04/2023, so stop before that file
        if (year, idx) == (2023, 4):
            break
        # File names carry a zero-padded, two-digit month
        idx = f"{idx:02d}"
        # Load that month's trips
        df = pd.read_csv(f"./bikes_data/trips_data/{year}{idx}-bluebikes-tripdata.csv")
        # Fold any stations not yet recorded into stations_dict
        stations_dict = add_missing_stations(df=df, stations_dict=stations_dict, **legacy_columns)

In [None]:
# Serialize the name -> station record mapping to a JSON file
with open("stations_2.json", "w") as out_file:
    json.dump(obj=stations_dict, fp=out_file)