In [None]:
ls_variables_explicatives = [
 'Année_construction',
 'Periode_construction',
 'Surface_habitable_logement',
 'Etiquette_DPE',
 'Deperditions_enveloppe',
 'Annee_reception_DPE',
 'Déperditions_renouvellement_air',
 'Type_énergie_n°1',
 'Deperditions_baies_vitrées',
 'Qualité_isolation_murs',
 'Déperditions_ponts_thermiques',
 'Déperditions_murs',
 'Deperditions_planchers_hauts']

In [35]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

# Step 1: Load the data
df = pd.read_csv('../Data/data69rhone.csv')  # Assurez-vous que l'encodage est correct

# Step 2: Define the target variable and features
target = "Coût_total_5_usages"
df = df.dropna(subset=[target])

ls_variables_explicatives = [
    'Année_construction',
    'Periode_construction',
    'Surface_habitable_logement',
    'Etiquette_DPE',
    'Deperditions_enveloppe',
    'Annee_reception_DPE',
    'Déperditions_renouvellement_air',
    'Type_énergie_n°1',
    'Deperditions_baies_vitrées',
    'Qualité_isolation_murs',
    'Déperditions_ponts_thermiques',
    'Déperditions_murs',
    'Deperditions_planchers_hauts'
]

# Définir les données d'entrée et la variable cible
X = df[ls_variables_explicatives]
y = df[target]

# Séparer les colonnes numériques et catégorielles
numeric_features = df[ls_variables_explicatives].select_dtypes(include='number').columns
categorical_features = df[ls_variables_explicatives].select_dtypes(include='object').columns

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Créer les pipelines de transformation pour les caractéristiques numériques et catégorielles
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),      # Impute les valeurs manquantes par la moyenne
    ('scaler', StandardScaler())                      # Appliquer StandardScaler pour la mise à l'échelle
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute les valeurs manquantes par la valeur la plus fréquente
    ('ordinal', OrdinalEncoder())                          # Utilise OrdinalEncoder pour encoder les variables catégorielles
])

# Step 5: Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Step 6: Create the full pipeline with Random Forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))  # Utilisation de Random Forest pour la régression
])

# Step 7: Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Step 8: Predict and evaluate the model
y_pred = pipeline.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# Step 9: Save the pipeline to a .pkl file
joblib.dump(pipeline, 'random_forest_pipeline_with_scaling.pkl')
print("Pipeline saved to 'random_forest_pipeline_with_scaling.pkl'.")

# Step 10: Load the pipeline from the .pkl file
loaded_pipeline = joblib.load('random_forest_pipeline_with_scaling.pkl')
print("Pipeline loaded from 'random_forest_pipeline_with_scaling.pkl'.")

# Step 11: Use the loaded pipeline to make predictions
new_predictions = loaded_pipeline.predict(X_test)
print("Predictions from loaded pipeline:", new_predictions)


  df = pd.read_csv('../Data/data69rhone.csv')  # Assurez-vous que l'encodage est correct


Root Mean Squared Error (RMSE): 8476.77452518827
Pipeline saved to 'random_forest_pipeline_with_scaling.pkl'.
Pipeline loaded from 'random_forest_pipeline_with_scaling.pkl'.
Predictions from loaded pipeline: [ 535.37021071 1489.874      1234.87312052 ...  651.917       499.8430346
  952.675     ]


In [32]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

# Step 1: Load the data
df = pd.read_csv('../Data/data69rhone.csv')  # Assurez-vous que l'encodage est correct

# Step 2: Define the target variable and features
target = "Coût_total_5_usages"
df = df.dropna(subset=[target])

ls_variables_explicatives = [
    'Année_construction',
    'Periode_construction',
    'Surface_habitable_logement',
    # ajoutez ou retirez des variables ici si nécessaire
    'Etiquette_DPE',
    
]

# Définir les données d'entrée et la variable cible
X = df[ls_variables_explicatives]
y = df[[target]]  # Utiliser [[]] pour garder y sous forme de DataFrame


numeric_features = df[ls_variables_explicatives].select_dtypes(include='number').columns
categorical_features  = df[ls_variables_explicatives].select_dtypes(include='object').columns



# categorical_features = 
# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Créer les pipelines de transformation pour les caractéristiques numériques et catégorielles
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))  # Impute les valeurs manquantes par la moyenne
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute les valeurs manquantes par la valeur la plus fréquente 
    ('ordinal', OrdinalEncoder())  # Utilise OrdinalEncoder pour encoder les variables catégorielles
])

# Appliquer une mise à l'échelle standard à y
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled = scaler_y.transform(y_test)

# Step 5: Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Step 6: Create the full pipeline with Random Forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))  # Utilisation de Random Forest pour la régression
])

# Step 7: Fit the pipeline on the training data
pipeline.fit(X_train, y_train_scaled.ravel())

# Step 8: Predict and evaluate the model
y_pred_scaled = pipeline.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()  # Inverser pour retrouver l'échelle d'origine
y_test_orig = scaler_y.inverse_transform(y_test_scaled).flatten()  # Inverser pour les valeurs réelles

mse = mean_squared_error(y_test_orig, y_pred,  squared=False)
print(f'Mean Squared Error: {mse}')

# Step 9: Save the pipeline to a .pkl file
joblib.dump({'pipeline': pipeline, 'scaler_y': scaler_y}, 'random_forest_pipeline.pkl')
print("Pipeline saved to 'random_forest_pipeline.pkl'.")

# Step 10: Load the pipeline from the .pkl file
loaded_data = joblib.load('random_forest_pipeline.pkl')
loaded_pipeline = loaded_data['pipeline']
loaded_scaler_y = loaded_data['scaler_y']
print("Pipeline loaded from 'random_forest_pipeline.pkl'.")

# Step 11: Use the loaded pipeline to make predictions
new_predictions_scaled = loaded_pipeline.predict(X_test)
new_predictions = loaded_scaler_y.inverse_transform(new_predictions_scaled.reshape(-1, 1)).flatten()
print("Predictions from loaded pipeline:", new_predictions)


  df = pd.read_csv('../Data/data69rhone.csv')  # Assurez-vous que l'encodage est correct


Mean Squared Error: 4204.436631182351
Pipeline saved to 'random_forest_pipeline.pkl'.
Pipeline loaded from 'random_forest_pipeline.pkl'.
Predictions from loaded pipeline: [ 566.91512711 1330.3605     1227.98059114 ...  657.22334545  672.20232616
  954.2107892 ]


In [20]:
y_test

Unnamed: 0,Coût_total_5_usages
255476,505.1
228422,1361.4
13619,1160.5
248793,1863.6
254847,367.0
...,...
208267,3059.0
41444,1002.8
63402,683.0
286871,458.5


In [29]:
from  sklearn.metrics  import  mean_absolute_error, r2_score
mse1 = mean_squared_error(y_test_orig, y_pred,squared=False)
mse2 = mean_squared_error(y_test, y_pred,squared=False)
print("MAE : " + str(mean_absolute_error(y_test, y_pred)))
print("R2 : " + str(r2_score(y_test, y_pred)))
print(mse1,mse2)

MAE : 279.3758306913197
R2 : -6.260637241462736
12249.998402690813 12249.998402690813




In [26]:
loaded_pipeline

In [6]:
y_test_orig

array([ 505.1, 1361.4, 1160.5, ...,  683. ,  458.5,  882.6])

In [4]:
y_pred

array([ 505.1, 1361.4, 1160.5, ...,  683. ,  458.5,  882.6])

In [5]:
y_test

Unnamed: 0,Coût_total_5_usages
255476,505.1
228422,1361.4
13619,1160.5
248793,1863.6
254847,367.0
...,...
208267,3059.0
41444,1002.8
63402,683.0
286871,458.5


In [12]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

# Step 1: Load the data
df = pd.read_csv('../Data/data69rhone.csv')  # assurez-vous que l'encodage est correct

# Step 2: Define the target variable and features
target = "Coût_total_5_usages"
df = df.dropna(subset=[target])

ls_variables_explicatives = [
    'Année_construction',
    'Periode_construction',
    'Surface_habitable_logement',
    # ajoutez ou retirez des variables ici si nécessaire
    'Etiquette_DPE',
    "Coût_total_5_usages"
]

# Définir les données d'entrée et la variable cible
X = df[ls_variables_explicatives]
y = df[target]

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Define preprocessing for numeric and categorical features
numeric_features = [
    'Année_construction',
    'Surface_habitable_logement',
    "Coût_total_5_usages"
]

categorical_features = [
    'Periode_construction',
    'Etiquette_DPE'
]

# Créer les pipelines de transformation pour les caractéristiques numériques et catégorielles
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))  # Impute les valeurs manquantes par la moyenne
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute les valeurs manquantes par la valeur la plus fréquente 
    ('ordinal', OrdinalEncoder())  # Utilise OrdinalEncoder pour encoder les variables catégorielles
    # ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Encodage one-hot pour les variables catégorielles
])

# Step 5: Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Step 6: Create the full pipeline with Random Forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=1, random_state=42))  # Utilisation de Random Forest pour la régression
])

# Step 7: Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Step 8: Evaluate the model
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Step 9: Save the pipeline to a .pkl file
joblib.dump(pipeline, 'random_forest_pipeline.pkl')
print("Pipeline saved to 'random_forest_pipeline.pkl'.")

# Step 10: Load the pipeline from the .pkl file
loaded_pipeline = joblib.load('random_forest_pipeline.pkl')
print("Pipeline loaded from 'random_forest_pipeline.pkl'.")

# Step 11: Use the loaded pipeline to make predictions
new_predictions = loaded_pipeline.predict(X_test)
print("Predictions from loaded pipeline:", new_predictions)


  df = pd.read_csv('../Data/data69rhone.csv')  # assurez-vous que l'encodage est correct


Mean Squared Error: 2849.987433964231
Pipeline saved to 'random_forest_pipeline.pkl'.
Pipeline loaded from 'random_forest_pipeline.pkl'.
Predictions from loaded pipeline: [ 505.1 1361.4 1160.5 ...  683.   458.5  882.6]
