##### IMPORTS

In [None]:
#! pip install numpy
#! pip install pandas
#! pip install scikit-learn
#! pip install xgboost
#! pip install lightgbm

In [None]:
import time

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.experimental import enable_halving_search_cv  # noqa: F401  (must run before importing HalvingRandomSearchCV)
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, HalvingRandomSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample
from xgboost import XGBRegressor

##### Chargement des données

In [None]:
# Read the three input files in one pass: training set, test set and the
# submission template (in that order).
train_df, test_df, sample_submission = map(
    pd.read_csv,
    (
        "../data/regression/train.csv",
        "../data/regression/test.csv",
        "../data/regression/sample_submission.csv",
    ),
)

# Quick sanity check on what was loaded.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

##### Ajout d'un indicateur pour différencier train/test et traitement de 'co2'

In [None]:
# Tag every row with its origin so both sets can be preprocessed together
# and separated again afterwards.
train_df["is_train"] = 1
test_df["is_train"] = 0
# The test set has no target; add a NaN placeholder so both frames share
# the same columns.
test_df["co2"] = np.nan

# ignore_index=True gives the combined frame a unique 0..n-1 index. Both
# inputs come straight from read_csv, so without it the train and test row
# labels overlap and any later label-based alignment becomes ambiguous.
data = pd.concat([train_df, test_df], sort=False, ignore_index=True)

# Keep the identifiers aside; 'id' is not used as a feature.
data_index = data["id"]
data = data.drop("id", axis=1)

##### Traitement de la variable 'hc'

In [None]:
# Where 'hc' is missing, fall back to the difference hcnox - nox, then
# discard 'hcnox'.
hc_fallback = data['hcnox'].sub(data['nox'])
data = data.assign(hc=data['hc'].fillna(hc_fallback)).drop(columns=['hcnox'])

##### Imputation : pour les numériques et les catégoriques

In [None]:
# Columns stored as float64/int64 are treated as numeric; everything else is
# treated as categorical.
numeric_cols = list(data.select_dtypes(include=["float64", "int64"]).columns)
categorical_cols = list(data.select_dtypes(exclude=["float64", "int64"]).columns)

# Numeric gaps are filled with the column median ...
for col, col_median in [(name, data[name].median()) for name in numeric_cols]:
    data[col] = data[col].fillna(col_median)

# ... and categorical gaps with an explicit "missing" level.
for col in categorical_cols:
    data[col] = data[col].fillna("missing")

##### Encodage one-hot pour les variables catégoriques

In [None]:
# One-hot encode the categorical columns; drop_first=True removes the first
# level of each encoded column.
data = data.pipe(pd.get_dummies, drop_first=True)

##### Séparation train/test

In [None]:
# Undo the earlier concatenation using the 'is_train' flag.
train_data = data.loc[data["is_train"].eq(1)].drop(columns="is_train")
# The placeholder target is removed from the test features as well.
test_data = data.loc[data["is_train"].eq(0)].drop(columns=["is_train", "co2"])

# Target vector and feature matrix for training.
y_train = train_data["co2"]
X_train = train_data.drop(columns="co2")

print("Processed X_train shape:", X_train.shape)
print("Processed test_data shape:", test_data.shape)

##### Pour accélérer le tuning, on effectue d'abord une recherche sur un sous-échantillon

In [None]:
# Draw the tuning subsample WITHOUT replacement. resample() bootstraps by
# default (replace=True), which duplicates rows; duplicates that land in
# different CV folds make the validation scores optimistic.
# n_samples is capped at the number of available rows because sampling
# without replacement cannot return more rows than exist.
X_train_sample, y_train_sample = resample(
    X_train,
    y_train,
    replace=False,
    n_samples=min(15000, len(X_train)),
    random_state=42,
)

##### Définition d’un pipeline simple (ici les données sont déjà numériques après get_dummies)

In [None]:
# Two-step pipeline: standardise the features, then fit an extra-trees
# regressor. The step names ("scaler", "model") are the prefixes used when
# addressing hyperparameters from a search.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", ExtraTreesRegressor(n_jobs=-1, random_state=42)),
    ]
)

##### Définition de la grille de recherche pour ExtraTreesRegressor

In [None]:
# Search space for the "model" step of the pipeline (hence the "model__"
# prefix on every key).
param_grid = dict(
    model__n_estimators=list(range(300, 500, 20)),   # 300, 320, ..., 480
    model__max_depth=list(range(30, 71, 5)),         # 30, 35, ..., 70
    model__min_samples_split=list(range(2, 6)),      # 2..5
    model__min_samples_leaf=list(range(1, 6)),       # 1..5
    model__max_features=["sqrt", "log2", 0.5, 0.75, 0.9, 0.95, 0.99, None],
    model__bootstrap=[True, False],
    model__criterion=["squared_error"],              # single value: kept fixed
)

##### Utiliser HalvingRandomSearchCV pour une recherche rapide et efficace

In [None]:
# Successive-halving random search over the pipeline's hyperparameters.
tuner = HalvingRandomSearchCV(
    pipeline,
    param_grid,
    # The goal is to minimise MAE, so the (negated) MAE is the score.
    scoring="neg_mean_absolute_error",
    cv=3,
    # Quickly cuts down the number of less promising candidates.
    factor=3,
    max_resources="auto",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

##### Mesurer le temps d'exécution du tuning

In [None]:
# Measure the wall-clock duration of the hyperparameter search on the
# subsample. The two clock reads must stay immediately around fit() so that
# nothing else is included in the measurement.
start_time = time.time()
tuner.fit(X_train_sample, y_train_sample)
tuning_time = time.time() - start_time
# Report the elapsed time in seconds, with two decimals.
print(f"Temps de tuning sur l'échantillon: {tuning_time:.2f} secondes")

##### Afficher les meilleurs hyperparamètres trouvés

In [None]:
# Show the best hyperparameter combination found by the search; keys still
# carry the pipeline step prefix ("model__...").
print("Meilleurs paramètres trouvés :", tuner.best_params_)

##### Extraction des paramètres pour ExtraTreesRegressor (en retirant le préfixe "model__")

In [None]:
# Keep only the hyperparameters of the "model" pipeline step and strip the
# step prefix so they can be passed directly to ExtraTreesRegressor.
# removeprefix() drops the leading "model__" only; str.replace() would also
# delete any later occurrence of that text inside a parameter name.
best_model_params = {
    key.removeprefix("model__"): value
    for key, value in tuner.best_params_.items()
    if key.startswith("model__")
}

##### Entraîner le meilleur modèle sur l'ensemble complet d'entraînement

In [None]:
# Refit on the full training set with the hyperparameters selected by the
# search. Previously the tuned values were ignored and a hard-coded
# 9-tree forest was trained instead.
# The explicit defaults come first so that a tuned parameter with the same
# name takes precedence; random_state matches the one used during tuning.
final_model = ExtraTreesRegressor(
    **{"random_state": 42, "n_jobs": -1, **best_model_params}
)
final_model.fit(X_train, y_train)

##### Prédictions sur le test


In [None]:
# Predict the target for every row of the processed test features with the
# model fitted on the full training set.
test_preds = final_model.predict(test_data)

##### Convertir les prédictions en entiers (si nécessaire)

In [None]:
# Build the submission in the order of the test file the predictions were
# computed from.
# Round to the NEAREST integer before casting: a bare astype(int) truncates
# toward zero (e.g. 119.9 -> 119), which biases every prediction downwards.
submission = pd.DataFrame(
    {
        "id": test_df["id"],
        "co2": np.rint(test_preds).astype(int),
    }
)
submission.to_csv("../result/regression/Final_ensemble_submission.csv", index=False)
print("Submission saved as Final_ensemble_submission.csv")

##### MODEL

In [None]:
# Split the processed frame back into training and test sets, removing the
# helper flag (and, for the test set, the placeholder target) by column
# selection.
train_data = data.loc[data["is_train"] == 1, data.columns != "is_train"]
test_data = data.loc[data["is_train"] == 0, ~data.columns.isin(["is_train", "co2"])]

# Separate the training set into target and features.
y_train = train_data["co2"]
X_train = train_data.loc[:, train_data.columns != "co2"]

print("Processed X_train shape:", X_train.shape)
print("Processed test_data shape:", test_data.shape)

##### TRAINING AND PREDICTION MODEL

In [None]:
def _base_estimators():
    """Return fresh (name, estimator) pairs shared by the voting and stacking ensembles."""
    return [
        ("rf", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
        ("gb", GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ]


# Menu key -> (label, zero-argument factory). Factories are used so that only
# the model the user picks is actually instantiated.
model_menu = {
    "1": ("RandomForestRegressor",
          lambda: RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    "2": ("GradientBoostingRegressor",
          lambda: GradientBoostingRegressor(n_estimators=100, random_state=42)),
    "3": ("AdaBoostRegressor",
          lambda: AdaBoostRegressor(n_estimators=100, random_state=42)),
    "4": ("ExtraTreesRegressor",
          lambda: ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    "5": ("BaggingRegressor",
          lambda: BaggingRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    "6": ("VotingRegressor",
          lambda: VotingRegressor(estimators=_base_estimators(), n_jobs=-1)),
    "7": ("StackingRegressor",
          lambda: StackingRegressor(
              estimators=_base_estimators(),
              final_estimator=RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
              n_jobs=-1,
          )),
    "8": ("XGBRegressor",
          lambda: XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
    # LightGBM is imported by this notebook and reported in the results, so
    # it is offered here as well.
    "9": ("LGBMRegressor",
          lambda: LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
}

print("Which model would you like to use?")
for menu_key, menu_entry in model_menu.items():
    print(f"{menu_key}. {menu_entry[0]}")

# None (tested with `is`) is the "nothing chosen yet" sentinel; comparing an
# estimator object to 0 with == only worked by accident.
model = None

# Keep prompting until a valid menu entry is entered.
# NOTE: input() makes this cell interactive, so a non-interactive
# "Restart & Run All" will stop here.
while model is None:
    model_choice = input(": ").strip()
    if model_choice in model_menu:
        model = model_menu[model_choice][1]()
    else:
        print("Invalid choice.")

print("Model chosen:", model)

##### QUICK TEST

In [None]:
# Réduire la taille des données pour les tests rapides
# Reduce the data size for a quick test. Rows are drawn WITHOUT replacement
# (resample() bootstraps by default) and the size is capped at the number of
# available rows.
X_quick, y_quick = resample(
    X_train,
    y_train,
    replace=False,
    n_samples=min(5000, len(X_train)),
    random_state=42,
)

# Hold out part of the subsample for scoring. Scoring on the very rows the
# model was fitted on reports training error, which is close to zero for
# tree ensembles and says nothing about generalisation.
X_train_sample, X_valid_sample, y_train_sample, y_valid_sample = train_test_split(
    X_quick, y_quick, test_size=0.2, random_state=42
)

# Quick test: fit on one part, evaluate MAE on the held-out part.
model.fit(X_train_sample, y_train_sample)
y_pred_sample = model.predict(X_valid_sample)
mae_sample = mean_absolute_error(y_valid_sample, y_pred_sample)

print("Quick test MAE on sample data: {:.4f}".format(mae_sample))

Results :

Random Forest Regressor :
- Cross Validation MAE: 0.14
- Time required for training: 1s

Gradient Boosting Regressor :
- Cross Validation MAE: 0.6887
- Time required for training: 6s

AdaBoost Regressor :
- Cross Validation MAE: 5.11
- Time required for training: 1m 11s

Extra Trees Regressor :
- Cross Validation MAE: 0.008 (old test, long but accurate) 0.0024 (new test, fast but less accurate)
- Time required for training: 10min 11s (old test, long but accurate) 1s (new test, fast but less accurate)

Bagging Regressor :
- Cross Validation MAE: 0.1392
- Time required for training: 45.6s

Voting Regressor :
- Cross Validation MAE: 0.4
- Time required for training: 8s

Stacking Regressor :
- Cross Validation MAE: 0.13
- Time required for training: 25m

XGBoost Regressor :
- Cross Validation MAE: 0.19
- Time required for training: 2s

LightGBM Regressor :
- Cross Validation MAE: 0.19
- Time required for training: 1s

##### PREDICTION

In [None]:
# Fit the chosen model on the full training set.
model.fit(X_train, y_train)

# Store the predictions under `test_preds`, the name the submission cell
# reads. The previous name (`est_preds`) was a typo: the submission was
# silently built from the stale predictions of the earlier ExtraTrees model.
test_preds = model.predict(test_data)

##### SUBMISSION FILE

In [None]:
# construire le fichier de soumission, le format requis est: id,co2

# Sanity check: the template and the predictions should have the same
# number of rows.
print(f"Taille de sample_submission: {sample_submission.shape}")
print(f"Taille de test_preds: {test_preds.shape}")

submission = pd.DataFrame(
    {
        # Take the ids from the test frame the predictions were computed
        # from, so each prediction is paired with its own id regardless of
        # the row order of the template file.
        "id": test_df["id"],
        # Values must be integers: round to the nearest one. A bare
        # astype(int) truncates toward zero and biases predictions downwards.
        "co2": np.rint(test_preds).astype(int),
    }
)

# Name the output after the estimator class rather than parsing its repr.
model_name = type(model).__name__
submission.to_csv(f"../result/regression/{model_name}_submission.csv", index=False)
print(f"Submission saved as {model_name}_submission.csv")
print(f"Submission saved as {model_name}_submission.csv")