In [114]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib


In [115]:
# Load the raw dataset.
# NOTE(review): the saved output of this cell contains warning residue that echoes
# this line — presumably a pandas DtypeWarning about mixed-type columns; confirm.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is the remedy that warning recommends.
df = pd.read_csv('../Data/data69rhone.csv', low_memory=False)

  df = pd.read_csv('../Data/data69rhone.csv')


In [116]:
# Name of the column to predict.
target = "Coût_total_5_usages"

# Keep only the rows for which the target is known.
df = df[df[target].notna()]

In [117]:
# Quartiles and inter-quartile range of the target.
q1, q3 = (df[target].quantile(level) for level in (0.25, 0.75))
iqr = q3 - q1

# Keep only rows whose target lies inside [q1 - 1.5*IQR, q3 + 1.5*IQR]
# (both bounds inclusive).
df = df[df[target].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]


In [118]:
# Names of the columns used as model inputs (the feature matrix is built from this list).
ls_variables_explicatives = [
    "Periode_construction", "Surface_habitable_logement", "Etiquette_DPE",
    "Deperditions_enveloppe", "Annee_reception_DPE", "Déperditions_renouvellement_air",
    "Type_énergie_n°1", "Deperditions_baies_vitrées", "Qualité_isolation_murs",
    "Déperditions_ponts_thermiques", "Déperditions_murs", "Deperditions_planchers_hauts",
]


In [123]:
# Print each explanatory column's name followed by its distinct values.
for feature_name in ls_variables_explicatives:
    distinct_values = df[feature_name].unique()
    print(feature_name, distinct_values, sep="\n")
    

Periode_construction
['Avant 1960' nan '2001 - 2010' '1991 - 2000' '1981 - 1990' '1961 - 1970'
 'Après 2010']
Surface_habitable_logement
[ 27.   41.5  40.1 ... 456.  231.5 178.4]
Etiquette_DPE
['G' 'F' 'D' 'C' 'E' 'B' 'A']
Deperditions_enveloppe
[1.131e+02 1.871e+02 1.856e+02 ... 3.000e-01 2.000e-01 6.525e+02]
Annee_reception_DPE
[2024 2023 2022 2021]
Déperditions_renouvellement_air
[ 14.8  52.6  53.4 ... 225.5 439.8 183.9]
Type_énergie_n°1
['Électricité' 'Gaz naturel' 'Charbon' 'Bois – Bûches'
 'Réseau de Chauffage urbain' 'Bois – Granulés (pellets) ou briquettes'
 'Fioul domestique'
 "Électricité d'origine renouvelable utilisée dans le bâtiment"
 'Bois – Plaquettes d’industrie' 'GPL' 'Bois – Plaquettes forestières'
 'Propane']
Deperditions_baies_vitrées
[ 16.1  35.8  21.7 ... 313.9 235.4 167.4]
Qualité_isolation_murs
['insuffisante' 'bonne' 'très bonne' 'moyenne' nan]
Déperditions_ponts_thermiques
[  8.   10.5   8.8 ... 329.  264.1 119.5]
Déperditions_murs
[ 57.3  85.1  82.3 ... 206.

In [103]:
# Regression target and feature matrix.
y = df[target]
X = df[ls_variables_explicatives]

# Partition the feature columns by dtype so that each group can be given
# its own preprocessing.
categorical_features = X.select_dtypes(include='object').columns
numeric_features = X.select_dtypes(include='number').columns


In [104]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [105]:
# Display the target column as it stands after the filtering steps above.
df.loc[:, target]

0         1360.0
1         2195.5
2         1684.8
3         1369.4
4          544.6
           ...  
287160     690.0
287161     606.0
287162     417.0
287163     528.8
287164     678.0
Name: Coût_total_5_usages, Length: 270348, dtype: float64

In [106]:
# Preprocessing for numeric columns: replace missing values with the column mean.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Preprocessing for categorical columns: replace missing values with the most
# frequent category, then map each category to an integer code.
# handle_unknown='use_encoded_value' makes the encoder emit -1 for a category
# that was not present when the pipeline was fitted, instead of raising at
# predict time (rare categories may be absent from the training split).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline: preprocessing, standardisation of every resulting column
# (including the integer-encoded categories), then a random-forest regressor.
# NOTE(review): the scaler is kept so results stay comparable with earlier runs;
# confirm whether it is actually wanted in front of a tree-based model.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=5, random_state=42))
])


In [107]:
# Fit the preprocessing steps and the model on the training split.
pipeline.fit(X=X_train, y=y_train)

In [108]:
# Predict the target for the held-out rows and display the result.
y_pred = pipeline.predict(X=X_test)
y_pred

array([ 540.74, 1186.94, 1924.34, ...,  145.9 ,  492.14,  610.52])

In [109]:
# Display the true target values of the held-out rows, for comparison with y_pred.
y_test

116584     779.4
119020    1233.0
57802     2340.3
181955    2037.8
50633      665.2
           ...  
260036     608.3
74478      312.6
19286      145.9
283419     598.3
274899     704.6
Name: Coût_total_5_usages, Length: 54070, dtype: float64

In [110]:
# Root mean squared error on the held-out rows.
# The `squared=False` argument of mean_squared_error is deprecated and removed in
# recent scikit-learn releases, so take the square root of the MSE explicitly;
# this form works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print(f'Root Mean Squared Error (RMSE): {rmse}')


Root Mean Squared Error (RMSE): 124.47660841217254




In [111]:
# Persist the fitted pipeline (preprocessing + model) to disk.
joblib.dump(value=pipeline, filename='random_forest_regressor.pkl')
print("Pipeline saved to 'random_forest_regressor.pkl'.")

Pipeline saved to 'random_forest_regressor.pkl'.


In [112]:
# Reload the persisted pipeline from disk.
# NOTE: joblib files are pickle-based — only load files from a trusted source.
loaded_pipeline = joblib.load(filename='random_forest_regressor.pkl')
print("Pipeline loaded from 'random_forest_regressor.pkl'.")

Pipeline loaded from 'random_forest_regressor.pkl'.


In [113]:
# Predict with the reloaded pipeline on the same held-out rows, as a round-trip check.
new_predictions = loaded_pipeline.predict(X=X_test)
print("Predictions from loaded pipeline:", new_predictions)

Predictions from loaded pipeline: [ 540.74 1186.94 1924.34 ...  145.9   492.14  610.52]
