In [2]:
import pandas as pd
import numpy as np

df = pd.read_parquet('../data/filtered/filtered_yellow_tripdata_2022-12.parquet')

In [3]:
df

Unnamed: 0,pickup_datetime,PULocationID
0,2022-12-01 00:37:35,170
1,2022-12-01 00:34:35,138
2,2022-12-01 00:33:26,140
3,2022-12-01 00:45:51,141
4,2022-12-01 00:49:49,261
...,...,...
3399544,2022-12-31 23:46:00,16
3399545,2022-12-31 23:13:24,75
3399546,2022-12-31 23:00:49,168
3399547,2022-12-31 23:02:50,238


In [18]:
# Add a column for the pickup hour
df['pickup_hour'] = df['pickup_datetime'].dt.floor('h')

In [23]:
# Group by pickup hour and count the number of trips
df_grouped = df.groupby(['pickup_hour', 'PULocationID']).size().reset_index(name='rides')
df_grouped

Unnamed: 0,pickup_hour,PULocationID,rides
0,2022-12-01 00:00:00,4,2
1,2022-12-01 00:00:00,7,2
2,2022-12-01 00:00:00,10,1
3,2022-12-01 00:00:00,13,1
4,2022-12-01 00:00:00,20,1
...,...,...,...
75715,2022-12-31 23:00:00,261,9
75716,2022-12-31 23:00:00,262,35
75717,2022-12-31 23:00:00,263,98
75718,2022-12-31 23:00:00,264,57


In [39]:
from tqdm import tqdm

# function that add missing slots

def add_missing_slots(df_grouped) -> pd.DataFrame:
    location_ids = df_grouped['PULocationID'].unique()
    full_range = pd.date_range(
        start=df_grouped['pickup_hour'].min(), end=df_grouped['pickup_hour'].max(), freq='h'
    )
    
    output_list = []  # Use a list for better performance
    
    for location_id in tqdm(location_ids):
        # Filter only rides for this location
        df_location = df_grouped.loc[df_grouped['PULocationID'] == location_id, ['pickup_hour', 'rides']]
        
        # Add missing dates with 0 in rides
        df_location = df_location.set_index('pickup_hour').reindex(full_range).fillna(0).reset_index()
        df_location = df_location.rename(columns={'index': 'pickup_hour'})  # Rename the reindexed column
        
        # Add the location ID back
        df_location['PULocationID'] = location_id
        
        # Append to the list instead of concatenating in each iteration
        output_list.append(df_location)
    
    # Concatenate all at once
    output = pd.concat(output_list, ignore_index=True)

    output = output.reset_index(drop=True).rename(columns={'index': 'pickup_hour'})
    
    return output


In [40]:
complete_df_grouped = add_missing_slots(df_grouped)

100%|██████████| 260/260 [00:00<00:00, 518.54it/s]


In [41]:
complete_df_grouped

Unnamed: 0,pickup_hour,rides,PULocationID
0,2022-12-01 00:00:00,0,1
1,2022-12-01 01:00:00,0,1
2,2022-12-01 02:00:00,0,1
3,2022-12-01 03:00:00,0,1
4,2022-12-01 04:00:00,1,1
...,...,...,...
193435,2022-12-31 19:00:00,2,265
193436,2022-12-31 20:00:00,2,265
193437,2022-12-31 21:00:00,7,265
193438,2022-12-31 22:00:00,3,265


In [43]:
# plot the number of trips per hour
import matplotlib.pyplot as plt
import plotly.express as px
from typing import Optional, List

def plot_rides(
        rides: pd.DataFrame,
        locations: Optional[List[int]] = None
        ):
    """
    Plot the number of rides per hour for the specified locations.
    
    Parameters
    ----------
    rides : pd.DataFrame
        DataFrame with the number of rides per hour and location.
    locations : List[int], optional
        List of locations to plot. If None, plot all locations.

    Returns
    -------
    None
    """
    rides_to_plot = rides.copy()
    rides_to_plot = rides_to_plot[rides_to_plot['PULocationID'].isin(locations)] if locations else rides_to_plot

    fig = px.line(
        rides_to_plot,
        x='pickup_hour',
        y='rides',
        color='PULocationID',
        title='Rides per hour'
        )
    
    fig.show()


In [44]:
plot_rides(complete_df_grouped, locations=[43])