##### IMPORTS

In [None]:
! pip install numpy
! pip install pandas
! pip install scikit-learn

In [42]:
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor, VotingRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

##### DATA IMPORT

In [43]:
# Read the three regression CSVs (train, test, sample submission) in one pass.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/regression/train.csv",
        "../data/regression/test.csv",
        "../data/regression/sample_submission.csv",
    )
)

# Quick sanity check on the sizes of the two modelling frames.
for caption, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(caption, frame.shape)

Train shape: (41257, 20)
Test shape: (13753, 19)


##### DATA MERGE

In [44]:
# Stack train and test into one frame so feature engineering is applied once.
# `is_train` (1 = train row, 0 = test row) lets the two parts be separated later.
for frame, train_flag in ((train_df, 1), (test_df, 0)):
    frame["is_train"] = train_flag

# Test rows get a NaN `co2` column before the frames are stacked.
test_df["co2"] = np.nan

data = pd.concat((train_df, test_df), sort=False)

##### PREPROCESSING

In [45]:
# Simple preprocessing:
# 1. drop the `id` field (it is not used to train the model);
# 2. fill missing values: median for float64/int64 columns, the literal
#    "missing" for every other column;
# 3. one-hot encode the text variables.

# Keep the ids aside before removing them from the feature frame.
data_index = data["id"]
data = data.drop(columns=["id"])

# Note: this loop visits every remaining column, including `co2` and `is_train`.
for col_name, col_dtype in data.dtypes.items():
    column = data[col_name]
    if col_dtype in ("float64", "int64"):
        fill_value = column.median()
    else:
        fill_value = "missing"
    data[col_name] = column.fillna(fill_value)

# drop_first=True drops the first level of each encoded variable.
data = pd.get_dummies(data, drop_first=True)

##### MODEL

In [46]:
# Split the merged frame back into its train and test parts using `is_train`.
train_rows = data["is_train"] == 1
test_rows = data["is_train"] == 0

train_data = data.loc[train_rows].drop(columns="is_train")
test_data = data.loc[test_rows].drop(columns=["is_train", "co2"])

# Separate the training part into features (X) and target (y).
X_train = train_data.drop(columns="co2")
y_train = train_data["co2"]

print("Processed X_train shape:", X_train.shape)
print("Processed test_data shape:", test_data.shape)

Processed X_train shape: (41257, 3912)
Processed test_data shape: (13753, 3912)


##### MODEL SELECTION

In [54]:
# Interactive model picker: show the menu, then keep prompting until the user
# types one of the listed numbers. Leaves the chosen estimator in `model` and
# the raw answer in `model_choice`.
for menu_line in (
    "Which model would you like to use?",
    "1. RandomForestRegressor",
    "2. GradientBoostingRegressor",
    "3. AdaBoostRegressor",
    "4. ExtraTreesRegressor",
    "5. BaggingRegressor",
    "6. VotingRegressor",
    "7. StackingRegressor",
):
    print(menu_line)


def _new_forest():
    """Return a fresh RandomForestRegressor with the notebook's settings."""
    return RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)


def _new_boosting():
    """Return a fresh GradientBoostingRegressor with the notebook's settings."""
    return GradientBoostingRegressor(n_estimators=100, random_state=42)


def _new_base_learners():
    """Return the (name, estimator) pairs shared by the voting and stacking models."""
    return [("rf", _new_forest()), ("gb", _new_boosting())]


# Menu answer -> zero-argument factory; the estimator is only built once chosen.
model_builders = {
    "1": _new_forest,
    "2": _new_boosting,
    "3": lambda: AdaBoostRegressor(n_estimators=100, random_state=42),
    "4": lambda: ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "5": lambda: BaggingRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "6": lambda: VotingRegressor(estimators=_new_base_learners(), n_jobs=-1),
    "7": lambda: StackingRegressor(
        estimators=_new_base_learners(),
        final_estimator=_new_forest(),
        n_jobs=-1,
    ),
}

model = None
while model is None:
    model_choice = input(": ")
    build_model = model_builders.get(model_choice)
    if build_model is None:
        # Unknown answer: report it and prompt again.
        print("Invalid choice.")
        continue
    model = build_model()

print("Model chosen:", model)

Which model would you like to use?
1. RandomForestRegressor
2. GradientBoostingRegressor
3. AdaBoostRegressor
4. ExtraTreesRegressor
5. BaggingRegressor
6. VotingRegressor
7. StackingRegressor
Model chosen: RandomForestRegressor(n_jobs=-1, random_state=42)


##### APPLYING THE MODEL

In [55]:
# 5-fold cross-validation. The scorer is *negative* MAE, so the mean is negated
# for display.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=5,
)
mae_mean = -cv_scores.mean()
mae_spread = cv_scores.std()
print("Cross-validation MAE: {:.2f} ± {:.2f}".format(mae_mean, mae_spread))

# Fit the chosen model on the full training set.
model.fit(X_train, y_train)

Cross-validation MAE: 0.11 ± 0.01


Results:

Random Forest Regressor:
- Cross Validation MAE: 0.11
- Time required for training: 1m 52s

Gradient Boosting Regressor:
- Cross Validation MAE: 0.76
- Time required for training: 9m 18s

AdaBoost Regressor:
- Cross Validation MAE: 0.0000
- Time required for training: 0.0000

Extra Trees Regressor:
- Cross Validation MAE: 0.0000
- Time required for training: 0.0000

Bagging Regressor:
- Cross Validation MAE: 0.0000
- Time required for training: 0.0000

Voting Regressor :
- Cross Validation MAE: 0.0000
- Time required for training: 0.0000

Stacking Regressor :
- Cross Validation MAE: 0.0000
- Time required for training: 0.0000

##### PREDICTION

In [49]:
# Predict the target for every row of the test set with the fitted model.
test_preds = model.predict(X=test_data)

##### SUBMISSION FILE

In [53]:
# Build the submission file; the required format is: id,co2

# The values must be integers. Round to the nearest integer before casting:
# a bare `.astype(int)` truncates toward zero, which biases predictions downward.
co2_rounded = np.rint(test_preds).astype(int)

submission = pd.DataFrame(
    {
        "id": sample_submission["id"],
        "co2": co2_rounded,
    }
)

# Name the file after the estimator class (e.g. "RandomForestRegressor").
model_name = type(model).__name__

# Create the output directory if needed so `to_csv` does not fail on a fresh checkout.
output_dir = "../result/regression"
os.makedirs(output_dir, exist_ok=True)

submission.to_csv(f"{output_dir}/{model_name}_submission.csv", index=False)

# Report the file name that was actually written (model_name, not the menu number).
print(f"Submission saved as {model_name}_submission.csv")

Submission saved as 2_submission.csv
