##### IMPORTS

In [None]:
! pip install numpy
! pip install pandas
! pip install scikit-learn

In [61]:
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor, VotingRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

##### DATA IMPORT

In [43]:
train_df = pd.read_csv("../data/regression/train.csv")

test_df = pd.read_csv("../data/regression/test.csv")

sample_submission = pd.read_csv("../data/regression/sample_submission.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (41257, 20)
Test shape: (13753, 19)


##### DATA MERGE

In [62]:
# Merge train and test data for easier feature engineering

train_df["is_train"] = 1
test_df["is_train"] = 0
test_df["co2"] = np.nan

data = pd.concat([train_df, test_df], sort=False)

##### PREPROCESSING

In [63]:
data['hc'] = data['hc'].fillna(data['hcnox']-data['nox'])
data = data.drop(columns=['hcnox'])

data_index = data["id"]
data = data.drop("id", axis=1)

#inserer code sc ici

for col in data.columns:
    if data[col].dtype in ["float64", "int64"]:
        data[col] = data[col].fillna(data[col].median())
    else:
        data[col] = data[col].fillna("missing")

data = pd.get_dummies(data, drop_first=True)  

##### MODEL

In [64]:
# Séparer les données en ensembles d'entraînement et de test
train_data = data[data["is_train"] == 1].drop("is_train", axis=1)
test_data = data[data["is_train"] == 0].drop(["is_train", "co2"], axis=1)

# Séparer les ensembles d'entraînement en features et target
y_train = train_data["co2"]
X_train = train_data.drop("co2", axis=1)

print("Processed X_train shape:", X_train.shape)
print("Processed test_data shape:", test_data.shape)

Processed X_train shape: (41257, 3911)
Processed test_data shape: (13753, 3911)


##### TRAINING AND PREDICTION MODEL

In [77]:
print("Which model would you like to use?")
print("1. RandomForestRegressor")
print("2. GradientBoostingRegressor")
print("3. AdaBoostRegressor")
print("4. ExtraTreesRegressor")
print("5. BaggingRegressor")
print("6. VotingRegressor")
print("7. StackingRegressor")

model = 0

while model == 0:

    model_choice = input(": ")

    match model_choice:
        case "1":
            model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        case "2":
            model = GradientBoostingRegressor(n_estimators=100, random_state=42)
        case "3":
            model = AdaBoostRegressor(n_estimators=100, random_state=42)
        case "4":
            model = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        case "5":
            model = BaggingRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        case "6":
            model = VotingRegressor(estimators=[("rf", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
                                                ("gb", GradientBoostingRegressor(n_estimators=100, random_state=42))],
                                    n_jobs=-1)
        case "7":
            model = StackingRegressor(estimators=[("rf", RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
                                                  ("gb", GradientBoostingRegressor(n_estimators=100, random_state=42))],
                                      final_estimator=RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
                                      n_jobs=-1)
        case _:
            model = 0
            print("Invalid choice.")

print("Model chosen:", model)

Which model would you like to use?
1. RandomForestRegressor
2. GradientBoostingRegressor
3. AdaBoostRegressor
4. ExtraTreesRegressor
5. BaggingRegressor
6. VotingRegressor
7. StackingRegressor
Model chosen: BaggingRegressor(n_estimators=100, n_jobs=-1, random_state=42)


##### QUICK TEST

In [None]:
# Réduire la taille des données pour les tests rapides
X_train_sample, y_train_sample = resample(X_train, y_train, n_samples=5000, random_state=42)

# Test rapide sur un sous-échantillon
model.fit(X_train_sample, y_train_sample)
y_pred_sample = model.predict(X_train_sample)
mae_sample = mean_absolute_error(y_train_sample, y_pred_sample)

print("Quick test MAE on sample data: {:.4f}".format(mae_sample))

Quick test MAE on sample data: 0.0026


Results :

Random Forest Regressor :
- Cross Validation MAE: 0.14
- Time required for training: 1s

Gradient Boosting Regressor :
- Cross Validation MAE: 0.6887
- Time required for training: 6s

AdaBoost Regressor :
- Cross Validation MAE: 5.11
- Time required for training: 1m 11s

Extra Trees Regressor :
- Cross Validation MAE: 0.008
- Time required for training: 10min 11s

Bagging Regressor :
- Cross Validation MAE: 0.0000
- Time required for training: 0.0000

Voting Regressor :
- Cross Validation MAE: 0.0000
- Time required for training: 0.0000

Stacking Regressor :
- Cross Validation MAE: 0.13
- Time required for training: 25m

##### PREDICTION

In [74]:
# faire des prédictions sur l'ensemble de test

model.fit(X_train, y_train)

test_preds = model.predict(test_data)

##### SUBMISSION FILE

In [76]:
# construire le fichier de soumission, le format requis est: id,co2
submission = pd.DataFrame(
    {
        "id": sample_submission["id"],
        "co2": test_preds.astype(int),  # les valeurs doivent être des entiers
    }
)

model_name = str(model).split("(")[0]
submission.to_csv(f"../result/regression/{model_name}_submission.csv", index=False)
print(f"Submission saved as {model_name}_submission.csv")

Submission saved as ExtraTreesRegressor_submission.csv
