# Packages

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from prettytable import PrettyTable
import joblib
import os

In [2]:
# sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from scipy.stats import mode
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
import warnings
warnings.filterwarnings('ignore')

# Initialisation

In [4]:
naive_sub= pd.read_parquet('taxi/naive_submission.parquet')
df= pd.read_parquet('taxi/train.parquet')
test= pd.read_parquet('taxi/test.parquet')

# Préparation des données

In [5]:
columns = list(df.columns)
# Variable qualitative ou quantitative
quantitative_vars = ['passenger_count','trip_distance', 'PU_location_lat', 'PU_location_lon', 'DO_location_lat', 'DO_location_lon', 'fare_amount', 'tolls_amount', 'extra']
qualitative_vars = ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'RatecodeID', 'store_and_fwd_flag', 'payment_type', 'mta_tax', 'improvement_surcharge', 'congestion_surcharge', 'Airport_fee']
variable_cible = 'tip_amount'


# Supprime observation nombre de passagers = 8 de nos données
# Nous définissons un nouveau dataframe sur lequel nous faisons des transformations
df_filtered = df.copy()
index_8 = df_filtered[df_filtered['passenger_count']==7].index
df_filtered = df_filtered.drop(index_8, axis=0)

# Convertir en numérique les colonnes qui devraient l'être
for col in quantitative_vars:
    df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')

for col in qualitative_vars:
    df_filtered[col] = df_filtered[col].astype('category')

# Variable binaire
for col in ['store_and_fwd_flag','VendorID', 'mta_tax', 'congestion_surcharge', 'Airport_fee']:
    df_filtered[col] = pd.factorize(df_filtered[col])[0] + 1

# Variable improvement surcharge
df_filtered['improvement_surcharge'] = pd.factorize(df_filtered['improvement_surcharge'])[0] + 1

# RatecodeID en entiers
df_filtered['RatecodeID'] = df_filtered['RatecodeID'].astype(int)

# Convertir les autres colonnes en type "category" pour les colonnes catégorielles
for col in qualitative_vars:
    df_filtered[col] = df_filtered[col].astype('category')

# Convertion en format data heure
df_filtered['tpep_pickup_datetime'] = pd.to_datetime(df_filtered['tpep_pickup_datetime'])
df_filtered['tpep_dropoff_datetime'] = pd.to_datetime(df_filtered['tpep_dropoff_datetime'])

df_filtered = df_filtered.reset_index(drop=True)

# Feature Engineering: À Lancer!!

In [8]:
def time_of_day(hour):
    if 4 <= hour < 12:
        return 0
    elif 12 <= hour < 18:
        return 1
    else:
        return 2
    
# Clusterisation des quartiers/zones pour point de départ et arrivée
# Appliquer KMeans pour 10 clusters
kmeans_pickup = KMeans(n_clusters=10, random_state=42)
kmeans_pickup.fit_predict(df_filtered[['PU_location_lat', 'PU_location_lon']])

kmeans_dropoff = KMeans(n_clusters=10, random_state=42)
kmeans_dropoff.fit_predict(df_filtered[['DO_location_lat', 'DO_location_lon']])

# Transformation du nouveau dataframe
def transform_df_fe(df, km_pickup, km_dropoff):
    # Supprime cette observation de nos données
    # Nous définissons un nouveau dataframe sur lequel nous faisons des transformations
    df_filtered = df.copy()

    # Convertir en numérique les colonnes qui devraient l'être
    for col in quantitative_vars:
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')

    # Variable catégorielles
    for col in ['store_and_fwd_flag','VendorID', 'mta_tax', 'congestion_surcharge', 'Airport_fee', 'improvement_surcharge']:
        to_map = np.unique(df_filtered[col])
        df_filtered[col] = df_filtered[col].map({to_map[0]: 0, to_map[1]: 1})

    # RatecodeID en entiers
    df_filtered['RatecodeID'] = df_filtered['RatecodeID'].astype(int)

    # Convertir les autres colonnes en type "category" pour les colonnes catégorielles
    for col in qualitative_vars:
        df_filtered[col] = df_filtered[col].astype('category')

    # Feauter engineering
    # Convertion en format data heure
    df_filtered['tpep_pickup_datetime'] = pd.to_datetime(df_filtered['tpep_pickup_datetime'])
    df_filtered['tpep_dropoff_datetime'] = pd.to_datetime(df_filtered['tpep_dropoff_datetime'])

    df_fe = pd.DataFrame()
    # Calculer le temps de trajet + nouvelle variable trajet moyen
    df_fe['trip_duration'] = (df_filtered['tpep_dropoff_datetime'] - df_filtered['tpep_pickup_datetime']).dt.total_seconds() / 60

    # df_fe['average_speed'] = df_filtered['trip_distance']/df_fe['trip_duration']
    df_fe['average_speed'] = np.where(df_fe['trip_duration'] == 0, 0, df_filtered['trip_distance'] / df_fe['trip_duration'])
    
    df_fe = df_fe.drop(['trip_duration'], axis=1)

    # Time of day
    df_fe['time_of_day'] = df_filtered['tpep_pickup_datetime'].dt.hour.apply(time_of_day)

    # Creer une variable binaire indiquant si le trajet a eu lieu pendant le weekend (vendredi compris)
    df_fe['is_weekend'] = df_filtered['tpep_pickup_datetime'].dt.dayofweek.apply(lambda x: True if x >= 5 else False)

    df_fe['pickup_cluster'] = km_pickup.predict(df_filtered[['PU_location_lat', 'PU_location_lon']])
    df_fe['dropoff_cluster'] = km_dropoff.predict(df_filtered[['DO_location_lat', 'DO_location_lon']])

    # Frais supplementaire
    df_fe[['extra', 'mta_tax', 'improvement_surcharge', 'congestion_surcharge', 'Airport_fee']] = df_filtered[['extra', 'mta_tax', 'improvement_surcharge', 'congestion_surcharge', 'Airport_fee']].apply(pd.to_numeric, errors='coerce')
    df_fe['total_extra_fees'] = df_fe[['extra', 'mta_tax', 'improvement_surcharge', 'congestion_surcharge', 'Airport_fee']].sum(axis=1)
    df_fe = df_fe.drop(['extra', 'mta_tax', 'improvement_surcharge', 'congestion_surcharge', 'Airport_fee'], axis=1)

    # to category
    for col in ['time_of_day', 'pickup_cluster', 'dropoff_cluster']:
        df_fe[col] = df_fe[col].astype('category')

    # Enlever les variables inutiles
    to_remove = ['PU_location_lat', 'PU_location_lon', 'DO_location_lat', 'DO_location_lon', 'tpep_dropoff_datetime', 'tpep_pickup_datetime', 'mta_tax', 'improvement_surcharge', 'congestion_surcharge','store_and_fwd_flag']
    df_filtered = df_filtered.drop(to_remove, axis=1)

    df_filtered = df_filtered.join(df_fe)

    return df_filtered

# Optimisation des hyperparamètre pour une sélection de modèles

In [12]:
# Different models

# Return the model with the best performance in cross-validation after grid search (hyperparameters that achieve best cv score)
def tune_hyperparameters(model, method, param_grid, X_train, y_train, cv_folds, scoring):
    gs_model = RandomizedSearchCV(model, param_grid, cv=cv_folds, scoring=scoring, n_jobs=-1, verbose=1)
    gs_model.fit(X_train, y_train)
    return gs_model.best_estimator_, gs_model.best_params_

models = {
    "XGBoost": XGBRegressor(enable_categorical=True),
    "CatBoost": CatBoostRegressor(verbose=0, cat_features=['time_of_day', 'pickup_cluster', 'dropoff_cluster', 'VendorID', 'RatecodeID','Airport_fee']),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "LightGBM": LGBMRegressor(verbose=0)
}

# Hyperparameters to compare during cv
hyperparameter_grids = {
    "XGBoost": {
        'n_estimators': [100, 300, 500, 1000],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 6, 10, 15],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.01, 0.1, 1],
        'reg_lambda': [0.5, 1, 1.5, 2]
    },
    
    "CatBoost": {
        'iterations': [200, 500, 1000],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'depth': [4, 6, 8, 10],
        'l2_leaf_reg': [1, 3, 5, 7],
        'random_strength': [1, 2, 5, 10],
        'bagging_temperature': [0, 1, 2, 5],
        'border_count': [32, 64, 128]
    },
    
    "Random Forest Regressor": {
        'n_estimators': [100, 300, 500, 1000],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10, 15],
        'min_samples_leaf': [1, 2, 4, 10],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True]
    },
    
    "Gradient Boosting": {
        'n_estimators': [100, 300, 500, 1000],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [2, 5, 10, 15],
        'min_samples_leaf': [1, 2, 4, 10]
    },
    
    "LightGBM": {
        'n_estimators': [100, 300, 500, 1000],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [-1, 10, 20, 30],
        'num_leaves': [31, 50, 100, 150],
        'min_child_samples': [5, 10, 20, 30],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 1, 10],
        'reg_lambda': [0.5, 1, 1.5, 2],
        'verbose': [-1]
    }
}

Entrainement uniquement sur les données avec méthode de paiement carte de crédit et avec un RatecodeID différent de 99

In [13]:
### Optimisation sur les donnees d'entrainement pour lequel payment_type=1 (carte de credit)
df_train_fe = df_filtered[df_filtered['payment_type']==1]
df_train_fe = df_train_fe[df_train_fe['RatecodeID']!=99]

X_fe = df_train_fe.drop(['tip_amount'],axis=1)
y_fe = df_train_fe['tip_amount']

X_fe = transform_df_fe(X_fe, kmeans_pickup, kmeans_dropoff)

X_fe = X_fe.drop('payment_type', axis=1)

### Entraînement: objectif MSE

In [14]:
cv_folds = 5

tuned_models = {}
best_params = {}

for method,model in models.items():
    print(method)
    tuned_model, params = tune_hyperparameters(model, method, hyperparameter_grids[method], X_fe, y_fe, cv_folds, scoring='neg_mean_squared_error')
    tuned_models[method] = tuned_model
    best_params[method] = params

XGBoost
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CatBoost
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest Regressor
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Gradient Boosting
Fitting 5 folds for each of 10 candidates, totalling 50 fits
LightGBM
Fitting 5 folds for each of 10 candidates, totalling 50 fits


### Entraînement: objectif R2

In [15]:
cv_folds = 5

tuned_models_r2 = {}
best_params_r2 = {}

for method,model in models.items():
    print(method)
    tuned_model, params = tune_hyperparameters(model, method, hyperparameter_grids[method], X_fe, y_fe, cv_folds, scoring='r2')
    tuned_models_r2[method] = tuned_model
    best_params_r2[method] = params

XGBoost
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CatBoost
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Random Forest Regressor
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Gradient Boosting
Fitting 5 folds for each of 10 candidates, totalling 50 fits
LightGBM
Fitting 5 folds for each of 10 candidates, totalling 50 fits


## Prédiction d'ensemble par moyenne

In [16]:
def predict_tip_amount_ensemble(data, models):
    # Initaliser un vecteur des predictions a 0
    tip_predictions = np.zeros(len(data))
    
    # Predire uniquement les observations avec methode de paiment carte de credit
    payment_type_1 = data[(data['payment_type'] == 1) & (data['RatecodeID'] != 99)]
    payment_type_1 = payment_type_1.drop('payment_type', axis=1)

    if not payment_type_1.empty:
        predictions_ensemble = []

        for method in models:
            print(method)
            pred = models[method].predict(payment_type_1)
            predictions_ensemble.append(pred)

        predictions_ensemble = np.array(predictions_ensemble).astype(float).T
        mean_predictions = np.mean(predictions_ensemble, axis=1)

        tip_predictions[(data['payment_type'] == 1) & (data['RatecodeID'] != 99)] = mean_predictions
    return tip_predictions

In [17]:
new_test = transform_df_fe(test, kmeans_pickup, kmeans_dropoff)

In [18]:
predictions = predict_tip_amount_ensemble(new_test, tuned_models)
predictions_r2 = predict_tip_amount_ensemble(new_test, tuned_models_r2)

XGBoost
CatBoost
Random Forest Regressor
Gradient Boosting
LightGBM
XGBoost
CatBoost
Random Forest Regressor
Gradient Boosting
LightGBM


Sauvegarder les résultats

In [19]:
# results = pd.DataFrame()
# # results['Unnamed: 0'] = np.arange(len(test))
# results['row_ID'] = np.arange(len(test))
# results['tip_amount'] = predictions_r2
# results
# results.to_csv('results_taxi2.csv', index=False)

## Performance sur les données d'entraînement

In [20]:
new_train = transform_df_fe(df, kmeans_pickup, kmeans_dropoff)
new_y = new_train['tip_amount']
new_train = new_train.drop('tip_amount',axis=1)

predictions_train = predict_tip_amount_ensemble(new_train, tuned_models)
print('Mean squared Error:', mean_squared_error(new_y, predictions_train))
print('R2 score:', r2_score(new_y, predictions_train))

XGBoost
CatBoost
Random Forest Regressor
Gradient Boosting
LightGBM
Mean squared Error: 3.7392715988629317
R2 score: 0.7248503274222702


# Sauvergarder les modèles pour utilisation ultérieure

In [23]:
def save_all_models(tuned_models, save_path="models/"):
    os.makedirs(save_path, exist_ok=True)
    
    for method, model in tuned_models.items():
        # nom du fichier

        filename = f"{save_path}{method.replace(' ', '_')}_tuned_model.joblib"
        
        # save modèle
        joblib.dump(model, filename)

        # Message
        print(f"Modèle '{method}' sauvegardé: '{filename}'")

In [24]:
save_all_models(tuned_models)

Modèle 'XGBoost' sauvegardé: 'models/XGBoost_tuned_model.joblib'
Modèle 'CatBoost' sauvegardé: 'models/CatBoost_tuned_model.joblib'
Modèle 'Random Forest Regressor' sauvegardé: 'models/Random_Forest_Regressor_tuned_model.joblib'
Modèle 'Gradient Boosting' sauvegardé: 'models/Gradient_Boosting_tuned_model.joblib'
Modèle 'LightGBM' sauvegardé: 'models/LightGBM_tuned_model.joblib'


## Importer modèles

In [25]:
# model = joblib.load("tuned_model.joblib")