In [1]:
import pandas as pd
from glob import glob
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import cross_validate,cross_val_score,train_test_split,KFold
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error,r2_score
import lightgbm as lgb
import optuna


  from .autonotebook import tqdm as notebook_tqdm


Functions for the full deployment:

In [None]:

# Load the already-cleaned data set from the Kaggle input directory.
# The parquet file is read with the pyarrow engine.
df = pd.read_parquet(
    '/kaggle/input/cleaneddatataxi/cleaned_data_all.parquet',
    engine='pyarrow',
)

In [43]:


def drop_columns(df: pd.DataFrame, columns_to_drop: list=None) -> pd.DataFrame:
    """
    Drops specified columns from a DataFrame.

    Names in ``columns_to_drop`` that do not exist in ``df`` are skipped silently
    (``errors='ignore'``), so a misspelled column name does NOT raise an error.

    Parameters:
    - df (pd.DataFrame): The input DataFrame. It is not modified.
    - columns_to_drop (list): A list of column names to be dropped. Default is None,
      which means "drop nothing".

    Returns:
    - pd.DataFrame: A new DataFrame with specified columns dropped.
    """
    if columns_to_drop is None:
        # DataFrame.drop(columns=None) raises ValueError, so the documented default
        # has to be handled explicitly. Return a copy to keep the "new frame" contract.
        return df.copy()

    return df.drop(columns=columns_to_drop, errors='ignore')


def change_to_categorical(df: pd.DataFrame, columns_to_change: list=None) -> pd.DataFrame:
    """
    Changes specified columns to categorical type.

    The conversion is done on a copy: the DataFrame passed in keeps its original
    dtypes, so re-running a cell that calls this function is safe.

    Parameters:
    - df (pd.DataFrame): The input DataFrame. It is not modified.
    - columns_to_change (list): A list of column names (or a single column name) to be
      changed to categorical type. Default is None, which means "change nothing".

    Returns:
    - pd.DataFrame: A new DataFrame with specified columns changed to categorical type.

    Raises:
    - KeyError: If one of the given column names is not present in ``df``.
    """
    if columns_to_change is None:
        # df[None] would raise a KeyError; treat the documented default as a no-op.
        return df.copy()

    if isinstance(columns_to_change, str):
        # A bare string would otherwise be iterated character by character below.
        columns_to_change = [columns_to_change]

    # astype with a {column: dtype} mapping returns a new frame and leaves `df` untouched.
    return df.astype({column: 'category' for column in columns_to_change})


def train_test_split_x_y(df: pd.DataFrame, test_size: float=0.2, target_column: str='duration_minutes') -> (pd.DataFrame, pd.DataFrame, pd.Series, pd.Series):
    """
    Separates features from the target and splits both into train and test parts.

    Parameters:
    - df (pd.DataFrame): The input DataFrame (features plus the target column).
    - test_size (float): The proportion of the dataset to include in the test split. Default is 0.2.
    - target_column (str): The name of the target column. Default is 'duration_minutes'.

    Returns:
    - x_train (pd.DataFrame): All feature columns for the training set.
    - x_test (pd.DataFrame): All feature columns for the test set.
    - y_train (pd.Series): The target column for the training set.
    - y_test (pd.Series): The target column for the test set.
    """
    # Everything except the target column is a feature.
    features = df.drop(columns=target_column)
    target = df[target_column]

    # shuffle=False keeps the rows in their existing order: the data is treated as a
    # time series, so the split must not mix rows across the cut point.
    train_features, test_features, train_target, test_target = train_test_split(
        features,
        target,
        test_size=test_size,
        shuffle=False,
    )

    return train_features, test_features, train_target, test_target

In [44]:
# Columns removed before modelling.
# NOTE(review): presumably these are dropped because they are either unrelated to the
# trip duration or only known once the trip has ended (target leakage) — confirm.
# drop_columns() ignores names that do not exist, so every entry must be spelled
# exactly: a previous 'dropoff_month' entry carried a trailing tab character and was
# therefore silently never dropped.
columns_to_drop = [
    # fare / payment / vendor information
    'RatecodeID', 'store_and_fwd_flag', 'payment_type', 'fare_amount',
    'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
    'total_amount', 'congestion_surcharge', 'airport_fee', 'VendorID',
    # raw timestamps
    'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'datetime',
    # features derived from the drop-off time
    'dropoff_year', 'dropoff_month', 'dropoff_week', 'dropoff_day', 'dropoff_hour',
    'dropoff_day_of_week', 'dropoff_day_of_year', 'dropoff_is_weekend', 'dropoff_is_holiday',
    'velocity',
]
df=drop_columns(df,columns_to_drop)



In [46]:
# Columns that are converted to pandas' categorical dtype.
cat_feat = [
    'pickup_is_holiday',
    'pickup_is_weekend',
    'PULocationID',
    'DOLocationID',
]
df = change_to_categorical(df, cat_feat)

# pickup_week is cast to an integer dtype.
df['pickup_week'] = df['pickup_week'].astype('int')

In [63]:
def lightgbm_model_trained(train_x,train_y,test_x,test_y) -> "lgb.LGBMRegressor":
    """
    Creates the LightGBM regressor with the project's fixed hyperparameters and fits it.

    Parameters:
    - train_x: The features of the training set.
    - train_y: The target values of the training set.
    - test_x: The features of the evaluation set.
    - test_y: The target values of the evaluation set.

    Returns:
    - model (lgb.LGBMRegressor): The fitted model.
    """
    # Build the estimator through the shared factory so the hyperparameters are
    # defined in exactly one place and cannot drift between the two functions.
    model = lightgbm_model_untrained()

    # The evaluation set is scored with the 'l1' metric during training. No
    # early-stopping argument or callback is passed here.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        eval_metric='l1',
    )

    return model
    

def lightgbm_model_untrained() -> callable:
    """
    Creates an unfitted LightGBM regressor with fixed hyperparameters.

    Returns:
    - model: A new, unfitted lgb.LGBMRegressor instance.
    """
    # NOTE(review): these values look like the result of a hyperparameter search
    # (see hyperparameter_optimization) — confirm where they came from.
    return lgb.LGBMRegressor(
        n_estimators=459,
        max_depth=6,
        learning_rate=0.29226849730411003,
        lambda_l1=0.11355922503965646,
    )

In [66]:
def train_and_evaluate(df: pd.DataFrame, target_column: str='duration_minutes', n_splits: int=5) -> dict:
    """
    Cross-validates the (unfitted) LightGBM model on the given data and returns the scores.

    Parameters:
      - df (pd.DataFrame): The input DataFrame (features plus the target column).
      - target_column (str): The name of the target column. Default is 'duration_minutes'.
      - n_splits (int): Number of folds. Must be at least 2. Default is 5.

    Returns:
    - scores (dict of float arrays): The dict returned by cross_validate, with one array
      entry per fold for each scorer ('test_mse', 'test_mae', 'test_r2') plus the timing
      arrays. 'test_mse' and 'test_mae' are converted back to positive error values.
    """
    model = lightgbm_model_untrained()

    # Folds are contiguous, unshuffled blocks of rows.
    # NOTE(review): with plain KFold the earlier folds are still validated by models
    # that were trained on later rows; if strictly forward-looking validation is needed
    # for this time-ordered data, consider TimeSeriesSplit.
    kfold = KFold(n_splits=n_splits, shuffle=False)

    # Split the frame into features and target.
    x = df.drop(columns=target_column)
    y = df[target_column]

    # Error metrics are registered with greater_is_better=False, so cross_validate
    # reports them negated.
    scoring = {
        'mse': make_scorer(mean_squared_error, greater_is_better=False),
        'mae': make_scorer(mean_absolute_error, greater_is_better=False),
        'r2': make_scorer(r2_score),
    }

    scores = cross_validate(model, x, y, cv=kfold, scoring=scoring)

    # Undo the sign flip so the caller sees the real (positive) error values.
    for key in ('test_mse', 'test_mae'):
        scores[key] = abs(scores[key])

    return scores

In [1]:
def hyperparameter_optimization(x_train: pd.DataFrame, y_train: pd.Series, x_test,y_test, n_trials: int=20, seed: int=None)-> dict:
    """
    Optimizes the hyperparameters of a LightGBM model with Optuna.

    No cross-validation is used, for computational reasons: every trial fits one model
    on the training set and is scored by its mean squared error on the test set.
    Because the test set is used for model selection, the test error of the selected
    parameters is an optimistic estimate.

    Parameters:
    - x_train (pd.DataFrame): The DataFrame containing all features for the training set.
    - y_train (pd.Series): The Series containing the target column for the training set.
    - x_test (pd.DataFrame): The DataFrame containing all features for the test set.
    - y_test (pd.Series): The Series containing the target column for the test set.
    - n_trials (int): Number of Optuna trials to run. Default is 20.
    - seed (int): Seed for the TPE sampler, to make the search reproducible.
      Default is None (unseeded).

    Returns:
    - study.best_params (dict): A dict with the best parameters for the model.
    """

    def objective(trial) -> float:
        """Fits one LightGBM model for the sampled parameters and returns its test MSE."""
        # Hyperparameter search space.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
            'lambda_l1': trial.suggest_float('lambda_l1', 0.01, 0.2),
        }

        # Create and fit the LightGBM model for this trial.
        model = lgb.LGBMRegressor(**params)
        model.fit(x_train, y_train)

        preds = model.predict(x_test)
        return mean_squared_error(y_test, preds)

    # TPESampler is what create_study uses when no sampler is given; it is passed
    # explicitly only so that the seed can be set.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=n_trials)

    return study.best_params

    

NameError: name 'pd' is not defined

In [47]:
# Run the hyperparameter search; `a` holds the dict of best parameters it returns.
# NOTE(review): x_train, y_train, x_test and y_test are not defined in any visible cell —
# presumably they come from train_test_split_x_y(df); confirm this runs on a fresh kernel.
a = hyperparameter_optimization(x_train, y_train, x_test, y_test)

[I 2023-12-28 17:25:47,927] A new study created in memory with name: no-name-d7a13bf8-035e-48ae-8702-6cd1afb8ae2e
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'gamma': trial.suggest_uniform('gamma', 0, 0.5),
  'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 1.0),
  'reg_lambda': trial.suggest_uniform('reg_lambda', 0, 1.0),
[W 2023-12-28 17:26:43,828] Trial 0 failed with parameters: {'n_estimators': 874, 'max_depth': 3, 'learning_rate': 0.0802182722185934, 'subsample': 0.7406021127521999, 'colsample_bytree': 0.9018974572166406, 'gamma': 0.2978557113830622, 'reg_alpha': 0.07027311587120322, 'reg_lambda': 0.21572182257505002} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\scj41115\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", 

KeyboardInterrupt: 