In [27]:
import pandas as pd
from glob import glob
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import cross_validate,cross_val_score,train_test_split,KFold
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error

import optuna


Functions for the full deployment:

In [4]:
def adding_extra_features(df: pd.DataFrame, pickup_date_column: str, dropoff_date_column: str) -> pd.DataFrame:
    """
    This function takes a DataFrame and adds extra features related to time, duration, and velocity.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing taxi trip data.
    - pickup_date_column (str): The name of the column containing pickup timestamps.
    - dropoff_date_column (str): The name of the column containing dropoff timestamps.

    Returns:
    - df (pd.DataFrame): The DataFrame with added features.

    Additional Features Added:
    - Time-related features for both pickup and dropoff timestamps (year, week, month, day of week, hour, minute, day of year).
    - Duration of the trip in minutes.
    - Velocity  of the trip in km per hour.
    """
    # Convert the pickup date column to a DatetimeIndex
    pickup_dt_index = pd.DatetimeIndex(df[pickup_date_column])
    
    # Convert the dropoff date column to a DatetimeIndex
    dropoff_dt_index = pd.DatetimeIndex(df[dropoff_date_column])
    
    # Add new columns to the DataFrame

    #adding the time features of the pickup and dropoff timestamp
    df['year_pickup'] = pickup_dt_index.year
    df['week_pickup'] = df[pickup_date_column].apply(lambda x: x.isocalendar()[1])
    df['month_pickup'] = pickup_dt_index.month
    df['day_of_week_pickup'] = pickup_dt_index.weekday
    df['hour_pickup'] = pickup_dt_index.hour
    df['minute_pickup'] = pickup_dt_index.minute
    df['dayofyear_pickup'] = pickup_dt_index.dayofyear

    df['year_dropoff'] = dropoff_dt_index.year
    df['week_dropoff'] = df[dropoff_date_column].apply(lambda x: x.isocalendar()[1])
    df['month_dropoff'] = dropoff_dt_index.month
    df['day_of_week_dropoff'] = dropoff_dt_index.weekday
    df['hour_dropoff'] = dropoff_dt_index.hour
    df['minute_dropoff'] = dropoff_dt_index.minute
    df['duration_minutes'] = (df[dropoff_date_column] - df[pickup_date_column]).dt.total_seconds() / 60
    df['dayofyear_dropoff'] = dropoff_dt_index.dayofyear

    #adding the duration of the trip
    df['duration_minutes'] = (dropoff_dt_index - pickup_dt_index).total_seconds() / 60

    #adding the velocity in km/h 
    df['veloc']=df['trip_distance']/(df['duration_minutes']/60)
    
    return df



def drop_columns(df: pd.DataFrame, columns_to_drop: list=None) -> pd.DataFrame:
    """
    Drops specified columns from a DataFrame.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - columns_to_drop (list): A list of column names to be dropped. Default is None.

    Returns:
    - pd.DataFrame: The DataFrame with specified columns dropped.
    """
    
    df = df.drop(columns=columns_to_drop, errors='ignore')

    return df

In [7]:
# getting all 2022 data and merge it

parquet_file_pattern = 'yellow_tripdata_2022-*.parquet'


parquet_file_list = glob(parquet_file_pattern)


all_data = pd.DataFrame()


for parquet_file in parquet_file_list:
    df = pd.read_parquet(parquet_file, engine='pyarrow')
    all_data = pd.concat([all_data, df], ignore_index=True)

KeyboardInterrupt: 

In [39]:
# getting just the jan data 
df = pd.read_parquet('yellow_tripdata_2022-01 (1).parquet', engine='pyarrow')

In [40]:

df_extra_features=adding_extra_features(df,'tpep_pickup_datetime','tpep_dropoff_datetime')
df_extra_features=df_extra_features[(df_extra_features['veloc']>0) & (df_extra_features['veloc']<=70)]

Adding the weather features

In [7]:
#weather data only of jan 2022
weather= pd.read_csv('New york 2022-01-01 to 2022-01-31.csv')

# only take a subset of the weather data:
weather=weather[['precip','snow','tempmin','temp','datetime']]
df['datetime'] = pd.to_datetime(df['tpep_pickup_datetime']).dt.date
df['datetime'] = pd.to_datetime(df['datetime'])
weather['datetime'] = pd.to_datetime(weather['datetime'])

#merge it with the whole dataset
df_extra_features_weather=pd.merge(df_extra_features,weather,how='inner', on='datetime')


In [41]:
# deleting features we dont need 
columns_to_drop = [
    'RatecodeID', 'store_and_fwd_flag', 'payment_type', 'fare_amount', 
    'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 
    'total_amount', 'congestion_surcharge', 'airport_fee', 
    'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'VendorID','datetime',
   'month_dropoff','dayofyear_dropoff','year_dropoff','week_dropoff'
]
df=drop_columns(df_extra_features,columns_to_drop)

In [42]:
df.head()

Unnamed: 0,passenger_count,trip_distance,PULocationID,DOLocationID,year_pickup,week_pickup,month_pickup,day_of_week_pickup,hour_pickup,minute_pickup,dayofyear_pickup,day_of_week_dropoff,hour_dropoff,minute_dropoff,duration_minutes,veloc
0,2.0,3.8,142,236,2022,52,1,5,0,35,1,5,0,53,17.816667,12.797007
1,1.0,2.1,236,42,2022,52,1,5,0,33,1,5,0,42,8.4,15.0
2,1.0,0.97,166,166,2022,52,1,5,0,53,1,5,1,2,8.966667,6.490706
3,1.0,1.09,114,68,2022,52,1,5,0,25,1,5,0,35,10.033333,6.518272
4,1.0,4.3,68,163,2022,52,1,5,0,36,1,5,1,14,37.533333,6.87389


In [43]:
def change_to_categorical(df: pd.DataFrame, columns_to_change: list=None) -> pd.DataFrame:
    """
    Changes specified columns to categorical type.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - columns_to_change (list): A list of column names to be changed to categorical type. Default is None.

    Returns:
    - pd.DataFrame: The DataFrame with specified columns changed to categorical type.
    """
    
    df[columns_to_change] = df[columns_to_change].astype('category')

    return df


def train_test_split_x_y(df: pd.DataFrame, test_size: float=0.2, target_column: str='duration_minutes') -> (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series):
    """
    Splits data into train and test sets and also split the respective dataframes into x and y.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - test_size (float): The proportion of the dataset to include in the test split. Default is 0.2.
    - target_column (str): The name of the target column (in our case the default is duration_minutes).

    Returns:
    - x_train (pd.DataFrame): The DataFrame containing all features for the training set.
    - x_test (pd.DataFrame): The DataFrame containing all features for the test set.
    - y_train (pd.Series): The Series containing the target column for the training set.
    - y_test (pd.Series): The Series containing the target column for the test set.
    """
    # split the data into x and y and drop the target column from x 
    x= df.drop(columns=target_column)
    y = df[target_column]
    # Split the data into train and test sets
    # we dont shuffle because we have time series data!
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, shuffle=False)
    
    return x_train, x_test, y_train, y_test

In [44]:
df1=change_to_categorical(df,['PULocationID','DOLocationID','year_pickup','week_pickup','month_pickup','day_of_week_pickup','hour_pickup','minute_pickup','dayofyear_pickup','day_of_week_dropoff','hour_dropoff'])

In [45]:
df1.dtypes

passenger_count         float64
trip_distance           float64
PULocationID           category
DOLocationID           category
year_pickup            category
week_pickup            category
month_pickup           category
day_of_week_pickup     category
hour_pickup            category
minute_pickup          category
dayofyear_pickup       category
day_of_week_dropoff    category
hour_dropoff           category
minute_dropoff         category
duration_minutes        float64
veloc                   float64
dtype: object

In [46]:
x_train, x_test, y_train, y_test=train_test_split_x_y(df1,0.2,'duration_minutes')

In [63]:
def xgboost_model() -> xgb.XGBRegressor:
    """
    Sets up an XGBoost model, the model is not trained yet. The different parameters are meant to be ajusted inside the function. We take the best results from our optimization function.

    Returns:
    - model (xgb.XGBRegressor): The untrained XGBoost model.
    """
    # Create the model

    parameters = {
              'objective': 'reg:squarederror',
              'learning_rate': 0.05,
              'max_depth': 10,
              'min_child_weight': 1,
              'subsample': 0.8,
              'colsample_bytree': 0.7,
              'n_estimators': 500,
              'reg_alpha': 0.1,
              'seed': 1337}
                  

    model = xgb.XGBRegressor(**parameters,enable_categorical=True)
    
    return model

In [66]:
def train_and_evaluate( x_train: pd.DataFrame, y_train: pd.Series) -> dict:
    """
    Trains a model and evaluates it with cv on the train set.

    Parameters:
    - model (xgb.XGBRegressor): The untrained XGBoost model.
    - x_train (pd.DataFrame): The DataFrame containing all features for the training set.
    - y_train (pd.Series): The Series containing the target column for the training set.
 

    Returns:
    - scores (dict of float arrays): Array of scores of the estimator for each run of the cross validation. A dict of arrays containing the score/time arrays for each scorer is returned.
    """
    # Define the 5-fold cross-validation, but again without shuffling because we have time series data!
    kfold = KFold(n_splits=5, shuffle=False)
    # create the model instance
    model = xgboost_model()

       

    # Define the different scoring metrics
    scoring = {'mse': make_scorer(mean_squared_error,greater_is_better=False),
           'mae': make_scorer(mean_absolute_error,greater_is_better=False)}

    # Calculate the cross-validation scores
    scores = cross_validate(model, x_train, y_train, cv=kfold, scoring=scoring)
       # take  abs to get the real value of the error
    scores['test_mse'] = abs(scores['test_mse'])

    scores['test_mae'] = abs(scores['test_mae'])


    return scores

In [None]:
a=train_and_evaluate(x_train[:1000],y_train[:1000])

{'fit_time': array([2.24756289, 1.89795852, 1.80282283, 1.71757746, 2.02229261]), 'score_time': array([0.03228879, 0.01562119, 0.01562619, 0.03440571, 0.03885794]), 'test_mse': array([7700.74984792,  734.09375577, 7966.72292196, 3019.08900987,
       1572.51455312]), 'test_mae': array([13.0586932 ,  5.38195719, 10.03008315,  7.55106469,  6.31821431])}


In [37]:
def hyperparameter_optimization(x_train: pd.DataFrame, y_train: pd.Series)-> dict:
    """
    Optimizes the hyperparameters of an XGBoost model, with a internal cross validation.
    
    Parameters:
    - x_train (pd.DataFrame): The DataFrame containing all features for the training set.
    - y_train (pd.Series): The Series containing the target column for the training set.
    
    
    Returns:
    - study.best_params (dict): A dict with the best parameters for the model.
    """
    


    # the objective function for the hyperparameter optimization
    def objective(trial, x_train, y_train):
    # Define the hyperparameters space
        parameter = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_uniform('gamma', 0, 0.5),
            'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 1.0),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 1.0),
    
        }

        # Create the XGBoost model
        model = xgb.XGBRegressor(**parameter,enable_categorical=True, random_state=42)

        # Perform 5-fold cross-validation
        kfold = KFold(n_splits=5, shuffle=False)
        scores = cross_val_score(model, x_train, y_train, cv=kfold, scoring='neg_mean_squared_error')

        # Return the mean score
        return scores.mean()
    # create the optuna study
    study = optuna.create_study(direction='minimize')
    # optimize the study
    study.optimize(lambda trial: objective(trial, x_train, y_train), n_trials=100)

    return study.best_params


    

In [47]:
hyperparameter_optimization(x_train, y_train)

[I 2023-12-28 17:25:47,927] A new study created in memory with name: no-name-d7a13bf8-035e-48ae-8702-6cd1afb8ae2e
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'gamma': trial.suggest_uniform('gamma', 0, 0.5),
  'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 1.0),
  'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 1.0),
[W 2023-12-28 17:26:43,828] Trial 0 failed with parameters: {'n_estimators': 874, 'max_depth': 3, 'learning_rate': 0.0802182722185934, 'subsample': 0.7406021127521999, 'colsample_bytree': 0.9018974572166406, 'gamma': 0.2978557113830622, 'reg_alpha': 0.07027311587120322, 'reg_lambda': 0.21572182257505002} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\scj41115\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", 

KeyboardInterrupt: 