In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time
from pathlib import Path
import os
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
def get_parquet():
    cwd = os.getcwd()
    print( "Current Path:", cwd )
    os.chdir('../data')
    data_dir = Path(os.getcwd() +'/parquet/')

    # 2023 trip data dataframe
    tripdata_2023_df = pd.concat(
        pd.read_parquet(parquet_file)
        for parquet_file in data_dir.glob('*.parquet')
    )

    return tripdata_2023_df 

In [16]:
def perform_cleanup(df):
    print(df.isnull().sum())

    

    return df

In [3]:
def perform_timeseries_forecasting(df):
    # Preprocess the data
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df.set_index('tpep_pickup_datetime', inplace=True)

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    print(numeric_columns)
    df[numeric_columns] = df[numeric_columns].resample('1H').mean().fillna(0)
    
    #df = df.resample('1H').mean().fillna(0)  # Resample to hourly and fill NaN values

    # Splitting data into train and test sets
    train_data, test_data = train_test_split(df, test_size=0.2, shuffle=False)

    # Define SARIMA parameters
    #order = (2, 1, 1)  # ARIMA(p, d, q)
    #seasonal_order = (1, 0, 1, 24)  # SARIMA(P, D, Q, m)
    order = (1, 1, 1)  # ARIMA(p, d, q)
    seasonal_order = (0, 1, 1, 12)  # SARIMA(P, D, Q, m)


    # Fit SARIMA model
    sarima_model = SARIMAX(train_data['fare_amount'], order=order, seasonal_order=seasonal_order, enforce_stationarity=False, enforce_invertibility=False)
    sarima_result = sarima_model.fit()
    sarima_result

    # Make predictions
    predicted = sarima_result.predict(start=test_data.index[0], end=test_data.index[-1], dynamic=True)
    predicted

    # Evaluate the model
    mse = mean_squared_error(test_data['fare_amount'], predicted)
    print(f'Mean Squared Error: {mse}')

    # Plot actual vs. predicted fares
    plt.figure(figsize=(12, 6))
    plt.plot(test_data.index, test_data['fare_amount'], label='Actual')
    plt.plot(test_data.index, predicted, label='Predicted', color='red')
    plt.title('Actual vs. Predicted Fare Amount (SARIMA)')
    plt.xlabel('Date')
    plt.ylabel('Fare Amount')
    plt.legend()
    plt.show()


In [8]:
def main():
    # Get parquet
    print("#1 Read parquet")
    taxi_df = get_parquet()
    print(taxi_df.info())

    # Perform cleanup
    print("#2 Perform cleanup")
    cleaned_df = perform_cleanup(taxi_df)

    # Perform time series forecasting
    #print("#3 Perform time series forecasting")
    #perform_timeseries_forecasting(cleaned_df)

In [11]:
main()

#1 Read parquet
Current Path: C:\Users\denni\Documents\Lambton\2nd term\BDM 3014 - Introduction to AI\project\nyc-taxi-fare-prediction\scripts
<class 'pandas.core.frame.DataFrame'>
Index: 38310226 entries, 0 to 3376566
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_

In [4]:
 taxi_df = get_parquet()

Current Path: C:\Users\denni\Documents\Lambton\2nd term\BDM 3014 - Introduction to AI\project\nyc-taxi-fare-prediction\scripts


In [5]:
taxi_df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.30,1.00,0.5,0.00,0.0,1.0,14.30,2.5,0.00
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.10,1.0,N,43,237,1,7.90,1.00,0.5,4.00,0.0,1.0,16.90,2.5,0.00
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.90,1.00,0.5,15.00,0.0,1.0,34.90,2.5,0.00
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.90,1.0,N,138,7,1,12.10,7.25,0.5,0.00,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.40,1.00,0.5,3.28,0.0,1.0,19.68,2.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066761,2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,15.80,0.00,0.5,3.96,0.0,1.0,23.76,,
3066762,2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.80,,,112,75,0,22.43,0.00,0.5,2.64,0.0,1.0,29.07,,
3066763,2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,17.61,0.00,0.5,5.32,0.0,1.0,26.93,,
3066764,2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,18.15,0.00,0.5,4.43,0.0,1.0,26.58,,


In [16]:
new_df = perform_cleanup(taxi_df)
#new_df['fare_amount_numeric'] = pd.to_numeric(new_df['fare_amount'], errors='coerce')

#filtered_df = new_df[new_df['fare_amount_numeric'].isna()]
#filtered_df

#print(new_df['tpep_pickup_datetime'].dtype) 

NameError: name 'perform_cleanup' is not defined

In [6]:
#taxi_df = get_parquet()
perform_timeseries_forecasting(taxi_df)

Index(['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  warn('Too few observations to estimate starting parameters%s.'


MemoryError: Unable to allocate 13.3 GiB for an array with shape (27, 27, 2453413) and data type float64

In [6]:
#new_df['tpep_pickup_datetime'].dtype
#taxi_df[taxi_df['fare_amount']=='NNNNN']
#taxi_df
taxi_df['tpep_pickup_datetime']

0         2023-01-01 00:32:10
1         2023-01-01 00:55:08
2         2023-01-01 00:25:04
3         2023-01-01 00:03:48
4         2023-01-01 00:10:29
                  ...        
3403761   2023-03-31 23:24:25
3403762   2023-03-31 23:24:50
3403763   2023-03-31 23:26:31
3403764   2023-03-31 23:07:51
3403765   2023-03-31 23:26:12
Name: tpep_pickup_datetime, Length: 9384487, dtype: datetime64[us]

In [4]:
import pandas as pd

# Assuming df is your DataFrame
# Check data types of columns
column_data_types = taxi_df.dtypes

# Filter columns with non-numeric data types (e.g., object, string)
non_numeric_columns = column_data_types[column_data_types == 'object'].index

# Check for non-numeric values in non-numeric columns
for col in non_numeric_columns:
    non_numeric_values = taxi_df[col].loc[~pd.to_numeric(taxi_df[col], errors='coerce').notna()]
    if not non_numeric_values.empty:
        print(f'Column "{col}" contains non-numeric values:')
        print(non_numeric_values)


Column "store_and_fwd_flag" contains non-numeric values:
0             N
1             N
2             N
3             N
4             N
           ... 
3376562    None
3376563    None
3376564    None
3376565    None
3376566    None
Name: store_and_fwd_flag, Length: 38310226, dtype: object
