# Instruction
I have a typical project of predicting the NYC uber/lyft trip demand. The dataset is available from Jan2022 to March 2023. The area is already divided into different locations. and I want the predicted demand for each location every 15 mins
## Problem statment
The goal of this project is to predict the demand for Uber/Lyft trips in different locations of NYC every 15 minutes, using a dataset spanning from January 2022 to March 2023. The dataset includes information such as the dispatching base number, pickup datetime, drop-off datetime, pickup location ID, drop-off location ID, SR_Flag, and affiliated base number

In [1]:
import pandas as pd
import glob
import tqdm
import pandas as pd
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from dateutil.relativedelta import relativedelta
import numpy as np
from pmdarima import auto_arima

In [2]:
data_list_path = glob.glob('Datasets/fhv_tripdata_2022-2023_in_csv/*.csv')

list_df = []
for path in data_list_path:
    print(path)
    # Step 1: Preprocess the Dataset
    df = pd.read_csv(path)
    list_df.append(df)
    
df =  pd.concat(list_df)

interested_features = ['pickup_datetime','PUlocationID']
df = df[interested_features]



Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-09.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-02.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-04.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-07.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-01.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-06.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-08.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-03.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-11.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-12.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-02.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-03.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-01.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-05.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-10.csv


In [5]:
import pandas as pd
import pmdarima as pm
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

print('Number of Rows Before Removing NaN:', df.shape[0])
removed_nan_df = df.dropna()
print('Number of Rows After Removing NaN:', removed_nan_df.shape[0])


Number of Rows Before Removing NaN: 17712727
Number of Rows After Removing NaN: 4164902


In [8]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from prophet import Prophet

print('Number of Rows Before Removing NaN:', df.shape[0])
removed_nan_df = df.dropna()
print('Number of Rows After Removing NaN:', removed_nan_df.shape[0])

location_ids = removed_nan_df['PUlocationID'].unique().tolist()

loop_count = 0
for lc_id in location_ids:
    print('Location ID:', lc_id)
    df_subset = removed_nan_df[removed_nan_df['PUlocationID'] == lc_id]
    df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime'])
    df_subset = df_subset.sort_values('pickup_datetime')
    df_subset = df_subset.set_index('pickup_datetime')
    df_subset = df_subset['PUlocationID'].resample('1H').count()
    df_subset = df_subset.reset_index()

    # Split data into training and testing sets
    train_size = int(len(df_subset) * 0.95)
    train_data = df_subset[:train_size]
    test_data = df_subset[train_size:]

    # Prepare data for Prophet model
    prophet_train_data = train_data.rename(columns={'pickup_datetime': 'ds', 'PUlocationID': 'y'})

    # Create and fit the Prophet model
    model = Prophet(
        seasonality_mode='additive',
        daily_seasonality=True,  # Disable daily seasonality
        weekly_seasonality=True,  # Enable weekly seasonality
        yearly_seasonality=False,  # Disable yearly seasonality
    )
    model.fit(prophet_train_data)

    # Generate future dates for prediction
    future_dates = model.make_future_dataframe(periods=len(test_data), freq='H')

    # Make predictions
    forecast = model.predict(future_dates)
    forecast = forecast[['ds', 'yhat']][-len(test_data):]

    # Plotting
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=prophet_train_data['ds'], y=prophet_train_data['y'], mode='lines+markers', name='Training Data'))
    fig.add_trace(go.Scatter(x=test_data['pickup_datetime'], y=test_data['PUlocationID'], mode='lines+markers', name='Testing Data'))
    fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], mode='lines+markers', name='Prophet Forecast'))
    fig.update_layout(title=f'PickLocation ID: {lc_id} - Facebook Prophet', xaxis_title='Time', yaxis_title='Number Drives')
    fig.show()

    loop_count += 1
    if loop_count > 5:
        break


Number of Rows Before Removing NaN: 17712727
Number of Rows After Removing NaN: 4164902
Location ID: 12.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

11:58:53 - cmdstanpy - INFO - Chain [1] start processing
11:58:58 - cmdstanpy - INFO - Chain [1] done processing


Location ID: 89.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

11:59:14 - cmdstanpy - INFO - Chain [1] start processing
11:59:17 - cmdstanpy - INFO - Chain [1] done processing


Location ID: 87.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

11:59:35 - cmdstanpy - INFO - Chain [1] start processing
11:59:39 - cmdstanpy - INFO - Chain [1] done processing


Location ID: 230.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

11:59:54 - cmdstanpy - INFO - Chain [1] start processing
11:59:59 - cmdstanpy - INFO - Chain [1] done processing


Location ID: 73.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

12:00:13 - cmdstanpy - INFO - Chain [1] start processing
12:00:15 - cmdstanpy - INFO - Chain [1] done processing


Location ID: 93.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

12:00:21 - cmdstanpy - INFO - Chain [1] start processing
12:00:25 - cmdstanpy - INFO - Chain [1] done processing


In [None]:
# df_subset.values

In [None]:
# df_subset

In [None]:
# df = df_subset

# df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
# df = df.set_index('pickup_datetime')

# df['pickups_per_hour'] = df['PUlocationID'].resample('3H').count()
# df

In [None]:
df

In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt

# # Step 1: Preprocess the Dataset
# df = pd.read_csv('Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-01.csv')
# df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
# df['dropOff_datetime'] = pd.to_datetime(df['dropOff_datetime'])
# df.set_index('pickup_datetime', inplace=True)

# # Step 2: Resample the Dataset
# demand_15_mints = df[['PUlocationID', 'DOlocationID']].resample('15T').size()
# demand_30_mints = df[['PUlocationID', 'DOlocationID']].resample('30T').size()
# demand_60_mints = df[['PUlocationID', 'DOlocationID']].resample('1h').size()

# # Step 3: Predict the Demand (using your preferred model)

# # Step 4: Visualize the Demand
# demand_15_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()


In [None]:
# # Step 4: Visualize the Demand
# demand_30_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()


In [None]:
# # Step 4: Visualize the Demand
# demand_60_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()


In [None]:
# demand_30_mints