In [26]:
import pandas as pd
from glob import glob
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import numpy as np

Functions for the full deployment:

In [None]:
def _iso_week(dt_index: pd.DatetimeIndex):
    """Return the ISO week number for every timestamp in ``dt_index`` (vectorized)."""
    week = dt_index.isocalendar()['week']
    if week.isna().any():
        # Missing timestamps: keep pandas' nullable integers so they show up as <NA>.
        # `.array` drops the datetime index so the values are assigned by position.
        return week.array
    # No missing values: plain int64, the dtype the previous row-wise apply produced.
    return week.to_numpy(dtype='int64')


def adding_extra_features(df: pd.DataFrame, pickup_date_column: str, dropoff_date_column: str,
                          distance_column: str = 'trip_distance') -> pd.DataFrame:
    """
    Add calendar, duration and average-speed features to a trip DataFrame.

    The new columns are written onto ``df`` itself and that same object is returned,
    so code that keeps using the original variable sees the new columns as well.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing taxi trip data.
    - pickup_date_column (str): The name of the column containing pickup timestamps.
    - dropoff_date_column (str): The name of the column containing dropoff timestamps.
    - distance_column (str): The name of the column containing the trip distance.
      Default is 'trip_distance'.

    Returns:
    - df (pd.DataFrame): The same DataFrame object with the added features.

    Additional Features Added:
    - Time-related features for both pickup and dropoff timestamps
      (year, ISO week, month, day of week, hour, minute, day of year).
    - duration_minutes: dropoff minus pickup, in minutes.
    - veloc: distance divided by duration in hours, i.e. distance units per hour.
      NOTE(review): this is km/h only if the distance column is in km - confirm the
      unit of the source data.

    Notes:
    - Trips with a duration of zero produce inf (or NaN for a zero distance) in 'veloc'.
    - Timestamps may be datetime values or parseable strings; missing timestamps yield
      missing values in the derived columns instead of raising.
    """
    # Parse both timestamp columns once; every feature below is derived from these.
    pickup_dt_index = pd.DatetimeIndex(df[pickup_date_column])
    dropoff_dt_index = pd.DatetimeIndex(df[dropoff_date_column])

    # Time features of the pickup timestamp
    df['year_pickup'] = pickup_dt_index.year
    df['week_pickup'] = _iso_week(pickup_dt_index)
    df['month_pickup'] = pickup_dt_index.month
    df['day_of_week_pickup'] = pickup_dt_index.weekday
    df['hour_pickup'] = pickup_dt_index.hour
    df['minute_pickup'] = pickup_dt_index.minute
    df['dayofyear_pickup'] = pickup_dt_index.dayofyear

    # Time features of the dropoff timestamp
    df['year_dropoff'] = dropoff_dt_index.year
    df['week_dropoff'] = _iso_week(dropoff_dt_index)
    df['month_dropoff'] = dropoff_dt_index.month
    df['day_of_week_dropoff'] = dropoff_dt_index.weekday
    df['hour_dropoff'] = dropoff_dt_index.hour
    df['minute_dropoff'] = dropoff_dt_index.minute

    # Duration of the trip in minutes. Computed once, at this position, so the
    # resulting column order stays the same as before.
    df['duration_minutes'] = (dropoff_dt_index - pickup_dt_index).total_seconds() / 60
    df['dayofyear_dropoff'] = dropoff_dt_index.dayofyear

    # Average speed: distance per hour of trip duration
    df['veloc'] = df[distance_column] / (df['duration_minutes'] / 60)

    return df



def drop_columns(df: pd.DataFrame, columns_to_drop: list=None) -> pd.DataFrame:
    """
    Return ``df`` without the listed columns.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - columns_to_drop (list): Names of the columns to remove. Default is None.

    Returns:
    - pd.DataFrame: ``df`` itself when ``columns_to_drop`` is None or empty,
      otherwise a new DataFrame without those columns. Names that are not
      present in ``df`` are skipped silently.
    """
    # Nothing requested: hand back the input object untouched.
    if not columns_to_drop:
        return df

    # errors='ignore' keeps the call from failing on already-missing columns.
    return df.drop(columns=columns_to_drop, errors='ignore')

In [7]:
# Load every monthly 2022 file and combine them into a single DataFrame.
# NOTE(review): the pattern also matches names such as
# 'yellow_tripdata_2022-01 (1).parquet' - make sure no duplicate downloads sit
# in the folder, or that month is loaded twice.
parquet_file_pattern = 'yellow_tripdata_2022-*.parquet'

# glob() gives no ordering guarantee; sort so the months are stacked in name order.
parquet_file_list = sorted(glob(parquet_file_pattern))

# Read all files first and concatenate once. Calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration.
monthly_frames = [
    pd.read_parquet(parquet_file, engine='pyarrow')
    for parquet_file in parquet_file_list
]

# pd.concat raises on an empty list, so keep the empty-DataFrame result when
# no file matches the pattern.
all_data = pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()

KeyboardInterrupt: 

In [14]:
# Load only the January 2022 trips into df.
df = pd.read_parquet(
    'yellow_tripdata_2022-01 (1).parquet',
    engine='pyarrow',
)

In [5]:
# Preview the first rows of the loaded trip data.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [15]:

# Enrich the trips with time, duration and speed features.
# adding_extra_features writes the columns onto df and returns that same
# object, so df_extra_features and df refer to one DataFrame.
df_extra_features = adding_extra_features(
    df,
    pickup_date_column='tpep_pickup_datetime',
    dropoff_date_column='tpep_dropoff_datetime',
)

In [16]:
# df already shows the new feature columns: adding_extra_features adds them
# to the frame it is given and returns that same frame.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,dayofyear_pickup,year_dropoff,week_dropoff,month_dropoff,day_of_week_dropoff,hour_dropoff,minute_dropoff,duration_minutes,dayofyear_dropoff,veloc
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,...,1,2022,52,1,5,0,53,17.816667,1,12.797007
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,...,1,2022,52,1,5,0,42,8.4,1,15.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,...,1,2022,52,1,5,1,2,8.966667,1,6.490706
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,...,1,2022,52,1,5,0,35,10.033333,1,6.518272
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,...,1,2022,52,1,5,1,14,37.533333,1,6.87389


Adding the weather features

In [23]:
# Weather data for January 2022 only.
weather = pd.read_csv('New york 2022-01-01 to 2022-01-31.csv')

# Keep only the weather columns used downstream and parse the date in the same
# step, so no column is assigned onto a column-subset frame
# (avoids SettingWithCopyWarning).
weather = weather[['precip', 'snow', 'tempmin', 'temp', 'datetime']].assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

# Join key: pickup timestamp truncated to midnight. normalize() stays vectorized
# (no round trip through Python date objects); the cast makes the key dtype
# identical to the weather key. The key is written on df_extra_features, the
# frame that is actually merged, instead of relying on df being the same object.
df_extra_features['datetime'] = (
    pd.to_datetime(df_extra_features['tpep_pickup_datetime'])
    .dt.normalize()
    .astype(weather['datetime'].dtype)
)

# Merge the weather onto the trips. The inner join drops trips whose pickup date
# has no weather row. validate='many_to_one' raises if a date appears more than
# once in the weather data, which would otherwise silently duplicate trips.
df_extra_features_weather = pd.merge(
    df_extra_features,
    weather,
    how='inner',
    on='datetime',
    validate='many_to_one',
)


In [None]:
# Remove the columns that are not needed for the later steps.
columns_to_drop = [
    # rate / payment metadata
    'RatecodeID',
    'store_and_fwd_flag',
    'payment_type',
    # fare components
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
    'airport_fee',
    # raw timestamps and vendor id
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'VendorID',
]
# NOTE(review): this drops from df, not from df_extra_features_weather, so the
# weather columns produced by the merge are not part of the result - confirm
# which frame is intended here.
df = drop_columns(df, columns_to_drop)

In [None]:
# Preview df after the column drop.
df.head()