##### IMPORTS

In [None]:
! pip install numpy
! pip install pandas
! pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

##### DATA IMPORT

In [32]:
# Read the three CSV files in one pass: training set, test set and the
# sample submission.
train_df, test_df, sample_submission = map(
    pd.read_csv,
    (
        "../data/regression/train.csv",
        "../data/regression/test.csv",
        "../data/regression/sample_submission.csv",
    ),
)

# Quick sanity check on the sizes of the loaded frames
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (41257, 20)
Test shape: (13753, 19)


##### DATA MERGE

In [33]:
# Merge train and test data for easier feature engineering.
# `is_train` records where each row came from so the frame can be split again
# later; the test rows get a NaN placeholder for the target column `co2`.

train_df["is_train"] = 1
test_df["is_train"] = 0
test_df["co2"] = np.nan

# ignore_index=True gives the merged frame a unique 0..n-1 index. Without it
# both halves keep their own 0-based labels, so the merged index contains
# duplicates and any later label-based alignment becomes ambiguous.
data = pd.concat([train_df, test_df], sort=False, ignore_index=True)

##### PREPROCESSING

In [34]:
# Simple preprocessing:
# 1. set the id column aside (it is not used to train the model);
# 2. fill missing values: numeric columns with the median, every other column
#    with the literal "missing";
# 3. one-hot encode the text columns.


data_index = data["id"]
data = data.drop("id", axis=1)

# Imputation statistics are computed on the training rows only, so that no
# information from the test set leaks into the features.
train_rows = (data["is_train"] == 1).to_numpy()

for col in data.columns:
    if not data[col].isna().any():
        continue  # nothing to fill; leave the column (and its dtype) untouched
    # Dtype-agnostic numeric check: a comparison against ["float64", "int64"]
    # would send int32/float32/nullable numeric columns to the string branch.
    if pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].fillna(data.loc[train_rows, col].median())
    else:
        data[col] = data[col].fillna("missing")

# Encoding the merged frame gives train and test rows identical dummy columns.
data = pd.get_dummies(data, drop_first=True)

##### TRAIN / TEST SPLIT

In [35]:
# Split the merged frame back into its training and test rows, dropping the
# helper flag (and, for the test rows, the placeholder target column).
train_data = data.loc[data["is_train"] == 1].drop(columns="is_train")
test_data = data.loc[data["is_train"] == 0].drop(columns=["is_train", "co2"])

# Separate the training rows into features and target
X_train = train_data.drop(columns="co2")
y_train = train_data["co2"]

print("Processed X_train shape:", X_train.shape)
print("Processed test_data shape:", test_data.shape)

Processed X_train shape: (41257, 3912)
Processed test_data shape: (13753, 3912)


##### MODEL DEFINITION

In [36]:
# Random forest regressor with a fixed seed for reproducible results.
rf_params = {
    "n_estimators": 100,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestRegressor(**rf_params)

# TODO: we need to try more models!

##### CROSS-VALIDATION AND TRAINING

In [None]:
# 5-fold cross-validation. The scorer reports the *negated* MAE, so the sign
# is flipped back before printing.
cv_scores = cross_val_score(
    estimator=rf,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=5,
)
fold_mae = -cv_scores
print("Cross-validation MAE: {:.2f} ± {:.2f}".format(fold_mae.mean(), fold_mae.std()))

# Train the model on the full training set
rf.fit(X=X_train, y=y_train)

Cross-validation MAE: 0.11 ± 0.01


##### PREDICTION

In [None]:
# Predict on the test set with the fitted model; the result is kept in
# `test_preds` for the submission step.
test_preds = rf.predict(test_data)

##### SUBMISSION FILE

In [None]:
# Build the submission file; the required format is: id,co2
output_path = "../result/regression/submission.csv"

# Pair every prediction with the id of the test row it was computed from
# (test_data keeps the row order of test_df) instead of assuming that
# sample_submission lists the ids in that same order.
# Predictions are rounded to the nearest integer; a bare astype(int) would
# truncate toward zero.
submission = pd.DataFrame(
    {
        "id": test_df["id"].to_numpy(),
        "co2": np.rint(test_preds).astype(int),
    }
)

# Emit the rows in the same id order as sample_submission. .loc raises a
# KeyError if the sample file lists an id that has no prediction.
submission = (
    submission.set_index("id")
    .loc[sample_submission["id"].to_numpy()]
    .reset_index()
)

# Make sure the output directory exists before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission.to_csv(output_path, index=False)
print("Submission file saved as submission.csv")