This file reads in the 15 million rows of CSV trip data and creates JSONs to be inserted into the MongoDB database.

In [None]:
import pandas as pd
import json 

In [13]:
# Create station object list from our stations.json file
stations = {}
with open("stations.json", "r") as file:
    stations = json.load(file)

In [14]:
""" Adds a month of rides to the JSON file """
def add_month_of_rides(df, rides, subscriber, start_time, end_time, start_station, end_station, 
                       subscribe_convention, bike_id=None, bike_type=None):
    # Cast start and end times to datetimes for easy accessibility
    df[start_time] = pd.to_datetime(df[start_time], format="%Y-%m-%d %H:%M:%S")
    df[end_time] = pd.to_datetime(df[end_time], format="%Y-%m-%d %H:%M:%S")

    # Drop rows with null station values and cast subscriber status to binary feature
    df = df.dropna(subset=[start_station, end_station])
    df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no

    # For every ride
    for i in range(len(df)):
        row = df.iloc[i]

        # Validate that start_station and end_station are accounted for
        if row[start_station] not in stations:
            continue
        elif row[end_station] not in stations:
            continue
        
        # Extract relevant, consistent features
        temp = {
            "subscribed": row[subscriber],
            "start_time":str(row[start_time]),
            "end_time":str(row[end_time]),
            "start_station": stations[row[start_station]],
            "end_station": stations[row[end_station]]
        }

        # In certain months, extra features are available
        # Extract those if they exist
        if bike_id is not None:
            temp["bike_id"] = float(row[bike_id])
        if bike_type is not None:
            temp["bike_type"] = row[bike_type]
        
        rides.append(temp)

    return rides

In [15]:
rides = []
""" Store 2024 ride data """
year = 2024
for idx in list(range(1,4)):
    if idx < 10:
            idx = "0" + str(idx)
    else:
        idx = str(idx)
    # Read in CSV file as DataFrame
    df = pd.read_csv(f"./bikes_data/trips_data/{year}{idx}-bluebikes-tripdata.csv")
    # Extract rides
    rides = add_month_of_rides(df=df, rides=rides, subscriber="member_casual",
                            start_time="started_at", end_time="ended_at",
                            start_station="start_station_name",
                            end_station="end_station_name",
                            subscribe_convention="member",
                            idx=idx, bike_id=None, bike_type="rideable_type")
    print("finised file: 2024" , str(idx))

print("finished year, length:", str(len(rides)))

# Write 2024 ride JSON file
with open("rides2024.json", "w") as file:
    json.dump(rides, file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 01


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 02


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 03
finished year, length: 658550


In [17]:
""" Store ride data from 2019 to April 2023 (architecture changes 04/2023)"""
rides = []
for year in [2019, 2020, 2021, 2022, 2023]:
    # For each month of the year
    for idx in list(range(1,13)):
        if year == 2023 and idx == 4:  # Headers change starting in 04/2023
            break
        if idx < 10:
            idx = "0" + str(idx)
        else:
            idx = str(idx)
        # Read in CSV file as DataFrame
        df = pd.read_csv(f"./bikes_data/trips_data/{year}{idx}-bluebikes-tripdata.csv")
        # Extract ride data
        rides = add_month_of_rides(df=df, rides=rides, subscriber="usertype",
                                start_time="starttime", end_time="stoptime",
                                start_station="start station name",
                                end_station="end station name",
                                subscribe_convention="Subscriber",
                                idx=idx, bike_id="bikeid")
        print("finised file:", str(year) , str(idx))
        
    print("finished year, length:", str(len(rides)))
    # Write {year} ride JSON file
    with open("rides"+str(year)+".json", "w") as file:
        json.dump(rides, file)
    rides=[]
        

""" Read in remaining 2023 data (post-April architecture change) """
year = 2023
# For each month April - December
for idx in list(range(4,13)):
    if idx < 10:
            idx = "0" + str(idx)
    else:
        idx = str(idx)
    # Read in CSV file as DataFrame
    df = pd.read_csv(f"./bikes_data/trips_data/{year}{idx}-bluebikes-tripdata.csv")
    # Extract ride data
    rides = add_month_of_rides(df=df, rides=rides, subscriber="member_casual",
                            start_time="started_at", end_time="ended_at",
                            start_station="start_station_name",
                            end_station="end_station_name",
                            subscribe_convention="member",
                            idx=idx, bike_id=None, bike_type="rideable_type")
    print("finised file: 2024" , str(idx))

print("finished year, length:", str(len(rides)))
# Write 2023 part 2 ride JSON file
with open("rides2023_2.json", "w") as file:
    json.dump(rides, file)
rides=[]

finised file: 2019 01
finised file: 2019 02
finised file: 2019 03
finised file: 2019 04
finised file: 2019 05
finised file: 2019 06
finised file: 2019 07
finised file: 2019 08
finised file: 2019 09
finised file: 2019 10
finised file: 2019 11
finised file: 2019 12
finished year, length: 2522769
finised file: 2020 01
finised file: 2020 02
finised file: 2020 03
finised file: 2020 04
finised file: 2020 05
finised file: 2020 06
finised file: 2020 07
finised file: 2020 08
finised file: 2020 09
finised file: 2020 10
finised file: 2020 11
finised file: 2020 12
finished year, length: 2073447
finised file: 2021 01
finised file: 2021 02
finised file: 2021 03
finised file: 2021 04
finised file: 2021 05
finised file: 2021 06
finised file: 2021 07
finised file: 2021 08
finised file: 2021 09
finised file: 2021 10
finised file: 2021 11
finised file: 2021 12
finished year, length: 2934378
finised file: 2022 01
finised file: 2022 02
finised file: 2022 03
finised file: 2022 04
finised file: 2022 05
finis

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 04


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 05


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 06


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 07


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 08


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 09


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[subscriber] = df[subscriber].apply(lambda sub: sub == subscribe_convention).astype(int).astype(float)  # Binary: subbed or no? 1=yes, 0=no


finised file: 2024 12
finished year, length: 3183627
