In [1]:
# Plotting plus the scikit-learn building blocks used to assemble the
# preprocessing / model pipeline.
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import xgboost as xgb
import gc
import sys
import os
# Append the directory three levels above the current working directory to
# the import search path. Presumably that is the project root holding the
# `Preprocessing` and `utils` packages imported below -- TODO confirm. Note
# that the path is resolved relative to the kernel's working directory, so
# the notebook has to be started from its own folder for this to work.
sys.path.append(os.path.abspath("../../.."))


# Project-local helpers; these imports must stay below the sys.path.append
# call above, which is expected to make them resolvable.
# NOTE(review): in the cells visible here only ContextImputer, split_data and
# evaluate_model are referenced (likewise plt and gc are not used); the other
# names may be needed further down -- verify before pruning.
from Preprocessing.preprocessing_pipeline_impute import preprocessing_pipeline
from Preprocessing.imputation import get_imputation_maps, apply_imputation,ContextImputer
from Preprocessing.preprocessing_pipeline_segment import preprocessing_pipeline_segment
from Preprocessing.split_new import split_data
from utils.eval_call import evaluate_model

In [2]:

# Load the CSV and split it via the project helper. The call yields six
# values: train/test features, train/test targets, and two column lists that
# are later handed to the ColumnTransformer as the categorical and numeric
# column selections.
(
    X_train,
    X_test,
    y_train,
    y_test,
    cat_feats,
    num_feats,
) = split_data('../../../data.csv')


In [3]:
import numpy as np
from sklearn.model_selection import KFold, cross_validate

# ---------------------------------------------------------------------------
# Column-wise preprocessing
# ---------------------------------------------------------------------------
# Numeric columns: fill missing values with the column median, then
# standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories that were not seen during fit are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its transformer. Columns in neither list are
# dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_feats),
    ('cat', categorical_transformer, cat_feats),
])

# ---------------------------------------------------------------------------
# Full model pipeline
# ---------------------------------------------------------------------------
# One ContextImputer step per column, keyed by the pipeline step name. These
# run before the generic preprocessor; dict insertion order fixes the step
# order.
context_imputed_columns = {
    'imp_fc': 'fuel_consumption_l_100km',
    'imp_ps': 'power_ps',
    'imp_er': 'electric_range',
}

# Hyper-parameters of the gradient-boosted regressor.
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 7,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': 2,
}

xgb_pipeline = Pipeline(steps=[
    *(
        (step_name, ContextImputer(column))
        for step_name, column in context_imputed_columns.items()
    ),
    ('preprocessor', preprocessor),
    ('model', xgb.XGBRegressor(**xgb_params)),
])

# ---------------------------------------------------------------------------
# K-fold cross-validation on the training split
# ---------------------------------------------------------------------------
print("Performing k-fold cross-validation...")
k_folds = 5
cv = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Scorer names understood by scikit-learn; the error metrics come back
# negated ("higher is better" convention).
scoring = {
    'mae': 'neg_mean_absolute_error',
    'mse': 'neg_mean_squared_error',
    'r2': 'r2',
}

cv_results = cross_validate(
    xgb_pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring=scoring,
    verbose=1,
)

# Flip the sign of the negated error scores to get positive per-fold errors.
mae_scores = np.negative(cv_results['test_mae'])
mse_scores = np.negative(cv_results['test_mse'])
# RMSE is derived per fold, so its mean below is the mean of fold RMSEs
# (not the square root of the mean MSE).
rmse_scores = np.sqrt(mse_scores)
r2_scores = cv_results['test_r2']

print(f"Cross-validation MAE scores: {mae_scores}")
print(f"Mean MAE: {mae_scores.mean():.2f}, Std: {mae_scores.std():.2f}")

print("\nDetailed metrics:")
print(f"MAE: {mae_scores.mean():.2f}")
print(f"MSE: {mse_scores.mean():.2f}")
print(f"RMSE: {rmse_scores.mean():.2f}")
print(f"R²: {r2_scores.mean():.4f}")

# ---------------------------------------------------------------------------
# Final fit on the whole training split and hold-out evaluation
# ---------------------------------------------------------------------------
# NOTE(review): in the recorded run the hold-out RMSE (~25.6k) is far above
# the cross-validated RMSE (~14.1k) while MAE is almost the same; this looks
# like a few very large residuals in the test split -- worth investigating.
print("\nTraining final XGBoost model on all training data...")
y_pred = xgb_pipeline.fit(X_train, y_train).predict(X_test)

print("\nEvaluation on test set:")
evaluate_model(y_test, y_pred, "XGBoost")

Performing k-fold cross-validation...
Cross-validation MAE scores: [4104.68505859 4145.60986328 4169.390625   4043.01855469 4106.71582031]
Mean MAE: 4113.88, Std: 43.00

Detailed metrics:
MAE: 4113.88
MSE: 203924598.40
RMSE: 14139.01
R²: 0.8351

Training final XGBoost model on all training data...

Evaluation on test set:
XGBoost Performance Metrics:
MAE: 4209.02
MSE: 656042112.00
RMSE: 25613.32
R²: 0.69
------------------------------
