In [282]:
# Section 1: Deep Learning (Keras)
# from keras.models import Sequential
# from keras.layers import Dense, LSTM
from math import sqrt

# Section 2: Time Series Forecasting (Prophet, ARIMA and SARIMAX)
from statsmodels.tsa.arima.model import ARIMA
from pmdarima.arima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from prophet import Prophet

# Section 3: Utility Libraries
import itertools
from itertools import product
import numpy as np
import pandas as pd
import locale
from math import sqrt
from IPython.display import display, HTML

# Section 4: Data Transformation
from scipy.stats import boxcox
from sklearn.preprocessing import MinMaxScaler

# Section 5: Performance Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.stats.diagnostic import normal_ad
from statsmodels.stats.diagnostic import het_breuschpagan
from sklearn.model_selection import ParameterGrid


# Section 6: Plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns

# ORganize
from statsmodels.tsa.api import ExponentialSmoothing, SARIMAX
from sklearn.metrics import mean_squared_error
from prophet import Prophet
from pmdarima import auto_arima
import pandas as pd
from math import sqrt
import itertools
import pandas as pd
from sklearn.metrics import mean_squared_error
from prophet import Prophet
from math import sqrt
from sklearn.model_selection import ParameterGrid


In [283]:
display(HTML("<style>div.output_scroll { height: 10em; }</style>"))
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


## project: medallions
- goal: predict number of rides per class

**data prep**

In [544]:
data_reports_monthly = pd.read_csv('data_reports_monthly.csv')
data_reports_monthly.columns = data_reports_monthly.columns.str.strip()


In [545]:
# Make month/year index
data_reports_monthly['Month/Year'] = pd.to_datetime(data_reports_monthly['Month/Year'])
data_reports_monthly.set_index('Month/Year', inplace=True)

# Replace commas
data_reports_monthly = data_reports_monthly.replace({',': ''}, regex=True)

# Convert numeric columns to numeric
cols = data_reports_monthly.columns.drop(['License Class'])
data_reports_monthly[cols] = data_reports_monthly[cols].apply(pd.to_numeric, errors='coerce')

# Calculate Avg Trips per Vehicle
data_reports_monthly['Avg Trips per Vehicle'] = data_reports_monthly['Trips Per Day'] / data_reports_monthly['Vehicles Per Day']


In [546]:
# Create a new DataFrame with relevant columns
df = data_reports_monthly[['License Class', 'Trips Per Day']].copy()

# Calculate total NYC rides per Month/Year and assign the values to 'Total NYC Rides' column
df['Total NYC Rides'] = df.groupby('Month/Year')['Trips Per Day'].sum()

# Calculate the percentage of total rides
df['Percent of Total'] = df['Trips Per Day'] / df['Total NYC Rides']

# Get a list of unique license classes
license_classes = df['License Class'].unique()



In [1]:
# Sort Index
index = pd.to_datetime(df.index)
df = df.sort_index()
print(df.index.is_monotonic_increasing)


NameError: name 'pd' is not defined

# Number of Rides per Class

In [288]:
license_classes = df['License Class'].unique()


In [291]:
# ARIMA 5,1,0 # Test RMSE for Yellow: 11201.132
# ARIMA 3,1,3 # Test RMSE for Yellow: 9333.896039
# AutoArima AIC # Test RMSE for Yellow: 5764.404


In [293]:
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Make a deep copy of df to ensure the original df isn't modified
df_copy = df.copy()

results = []
detailed_results = {}

for license_class in license_classes:
    # Get the time series for this license class
    series = df_copy[df_copy['License Class'] == license_class]['Trips Per Day']
    
    # Split into train and test sets
    train, test = series[:-10], series[-10:]
    
    # Fit model with automatic order selection
    model = auto_arima(train, seasonal=True, trace=False)
    
    # Make predictions
    predictions = model.predict(n_periods=len(test))
    
    # Ensure the test and predictions are of the same size before proceeding.
    if len(test) != len(predictions):
        print(f"Length mismatch for {license_class}, skipping...")
        continue
    
    # Calculate RMSE, MAE, MAPE, Degree of Accuracy
    rmse = sqrt(mean_squared_error(test, predictions))
    mae = mean_absolute_error(test, predictions)
    mape = (abs((test - predictions) / test)).mean() * 100
    degree_of_accuracy = (rmse / test.mean()) * 100

    # Store the results in a dictionary
    result = {
        'License Class': license_class,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'Degree of Accuracy': degree_of_accuracy
    }
    
    # Append the dictionary to the results list
    results.append(result)

    # Create a detailed results DataFrame for this license class
    detailed_df = pd.DataFrame({
        'Predicted': predictions,
        'Actual': test.values,
    })
    detailed_df['Difference'] = detailed_df['Actual'] - detailed_df['Predicted']
    detailed_df['Difference (%)'] = (detailed_df['Difference'] / detailed_df['Actual']) * 100
    detailed_results[license_class] = detailed_df

# Convert the results list to a dataframe
results_df = pd.DataFrame(results)



In [294]:
# Print the summary dataframe
results_df


Unnamed: 0,License Class,RMSE,MAE,MAPE,Degree of Accuracy
0,Yellow,5764.404096,4532.230389,4.368371,5.430944
1,Green,589.261573,466.256284,20.606361,26.271136
2,FHV - Lux Limo,161.507895,148.6,13.85454,14.461667
3,FHV - Livery,3720.427126,3510.4,18.895934,19.517916
4,FHV - High Volume,38866.322578,32347.041311,5.212886,6.41407
5,FHV - Black Car,3404.23468,2962.283349,17.115208,20.346261


In [260]:
# Print the detailed results for each license class
for license_class, detailed_df in detailed_results.items():
    print(f'\nDetailed results for {license_class}:')
    print(detailed_df)



Detailed results for Green:
              Predicted  Actual   Difference  Difference (%)
2022-06-01  2638.590579    2451  -187.590579       -7.653634
2022-07-01  2066.936212    2070     3.063788        0.148009
2022-08-01  2018.986939    2126   107.013061        5.033540
2022-09-01  2231.640092    2300    68.359908        2.972170
2022-10-01  1728.232249    2236   507.767751       22.708755
2022-11-01  1607.044115    2076   468.955885       22.589397
2022-12-01  1830.666623    2336   505.333377       21.632422
2023-01-01  1390.593628    2199   808.406372       36.762454
2023-02-01  1204.668034    2313  1108.331966       47.917508
2023-03-01  1425.259847    2323   897.740153       38.645723

Detailed results for FHV - Livery:
            Predicted  Actual  Difference  Difference (%)
2022-06-01    22572.0   21637      -935.0       -4.321301
2022-07-01    22572.0   20318     -2254.0      -11.093612
2022-08-01    22572.0   18714     -3858.0      -20.615582
2022-09-01    22572.0   20048   

In [None]:
# Why is the model predicting the same value for certain classes

## forecasting subsequent months

In [12]:
# With Test Data

for license_class in license_classes:
    # Get the time series for this license class
    series = df[df['License Class'] == license_class]['Trips Per Day']
    
    # Split into train and test sets
    train, test = series[:-10], series[-10:]
    
    # Set the frequency of the time series
    train.index.freq = 'MS'
    
    # Fit model
    model = ARIMA(train, order=(5,1,0))
    model_fit = model.fit()

    # Make predictions for the next 10 months
    forecast = model_fit.forecast(steps=10)
    
    # Print the forecasted values
    print(f'Forecast for {license_class}:')
    for i, value in enumerate(forecast):
        month = test.index[-1] + pd.DateOffset(months=i+1)
        print(f'{month}: {value:.3f}')
    print()


Forecast for Yellow:
2023-04-01 00:00:00: 114971.043
2023-05-01 00:00:00: 116202.222
2023-06-01 00:00:00: 116135.774
2023-07-01 00:00:00: 116269.654
2023-08-01 00:00:00: 116192.583
2023-09-01 00:00:00: 116144.388
2023-10-01 00:00:00: 116211.349
2023-11-01 00:00:00: 116194.954
2023-12-01 00:00:00: 116197.553
2024-01-01 00:00:00: 116200.727

Forecast for Green:
2023-04-01 00:00:00: 2515.615
2023-05-01 00:00:00: 2495.399
2023-06-01 00:00:00: 2494.431
2023-07-01 00:00:00: 2490.446
2023-08-01 00:00:00: 2490.561
2023-09-01 00:00:00: 2488.207
2023-10-01 00:00:00: 2488.187
2023-11-01 00:00:00: 2487.640
2023-12-01 00:00:00: 2487.538
2024-01-01 00:00:00: 2487.363

Forecast for FHV - Lux Limo:
2023-04-01 00:00:00: 1239.430
2023-05-01 00:00:00: 1272.060
2023-06-01 00:00:00: 1338.819
2023-07-01 00:00:00: 1304.332
2023-08-01 00:00:00: 1289.702
2023-09-01 00:00:00: 1273.675
2023-10-01 00:00:00: 1295.787
2023-11-01 00:00:00: 1297.711
2023-12-01 00:00:00: 1299.481
2024-01-01 00:00:00: 1289.482

Forecas

In [13]:
# No test data

for license_class in license_classes:
    # Get the time series for this license class
    series = df[df['License Class'] == license_class]['Trips Per Day']
    
    # Use the entire series as the training data
    train = series
    
    # Set the frequency of the time series
    train.index.freq = 'MS'
    
    # Fit model
    model = ARIMA(train, order=(5,1,0))
    model_fit = model.fit()

    # Make predictions for the next 10 months
    forecast = model_fit.forecast(steps=10)
    
    # Print the forecasted values
    print(f'Forecast for {license_class}:')
    for i, value in enumerate(forecast):
        month = train.index[-1] + pd.DateOffset(months=i+1)
        print(f'{month}: {value:.3f}')
    print()


Forecast for Yellow:
2023-04-01 00:00:00: 107253.887
2023-05-01 00:00:00: 107889.603
2023-06-01 00:00:00: 107743.346
2023-07-01 00:00:00: 107658.066
2023-08-01 00:00:00: 107983.023
2023-09-01 00:00:00: 107848.175
2023-10-01 00:00:00: 107871.955
2023-11-01 00:00:00: 107893.538
2023-12-01 00:00:00: 107869.692
2024-01-01 00:00:00: 107888.982

Forecast for Green:
2023-04-01 00:00:00: 2353.794
2023-05-01 00:00:00: 2358.614
2023-06-01 00:00:00: 2373.209
2023-07-01 00:00:00: 2375.311
2023-08-01 00:00:00: 2379.341
2023-09-01 00:00:00: 2380.413
2023-10-01 00:00:00: 2381.648
2023-11-01 00:00:00: 2381.877
2023-12-01 00:00:00: 2382.225
2024-01-01 00:00:00: 2382.283

Forecast for FHV - Lux Limo:
2023-04-01 00:00:00: 1158.733
2023-05-01 00:00:00: 1146.277
2023-06-01 00:00:00: 1122.119
2023-07-01 00:00:00: 1130.357
2023-08-01 00:00:00: 1137.287
2023-09-01 00:00:00: 1143.397
2023-10-01 00:00:00: 1136.943
2023-11-01 00:00:00: 1135.167
2023-12-01 00:00:00: 1134.429
2024-01-01 00:00:00: 1137.568

Forecas