In [1]:
# Imports, then load the preprocessed dataset
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle


# Load the preprocessed dataset used by the model-comparison cell below.
# The path is relative to the notebook's working directory.
data = pd.read_csv('../data/preprocessed_data.csv')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Explanatory variables (features) used to predict the final energy consumption
feature_names = [
    'Type_bâtiment',
    'Qualité_isolation_enveloppe',
    'Etiquette_GES',
    'Surface_habitable_logement',
    'Etiquette_DPE',
    'Type_installation_chauffage',
    'Ubat_W/m²_K',
    'Qualité_isolation_murs',
    'Type_énergie_n°1',
    'Qualité_isolation_plancher_bas',
    'Méthode_application_DPE',
    'Qualité_isolation_menuiseries',
]
# .copy() gives an independent frame, so the encoding below neither touches
# `data` nor triggers pandas' SettingWithCopyWarning.
selected_columns = data[feature_names].copy()
y = data["Conso_5_usages_é_finale"]

# The regressors below need numeric inputs: turn any text (object-dtype) column
# into integer codes (categories are sorted, matching what LabelEncoder yields
# for non-null values). This loop is a no-op when every feature is already numeric.
for column in selected_columns.select_dtypes(include=['object']).columns:
    selected_columns[column] = selected_columns[column].astype('category').cat.codes

# One seed for the split and for every stochastic estimator, so that
# re-running the cell reproduces the same scores.
RANDOM_STATE = 42

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    selected_columns, y, test_size=0.25, random_state=RANDOM_STATE
)

# Initialize the models
lin_reg = LinearRegression()
dec_tree = DecisionTreeRegressor(random_state=RANDOM_STATE)
rand_forest = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
xboost = XGBRegressor(random_state=RANDOM_STATE)

models = {
    "Linear Regression": lin_reg,
    "Decision Tree": dec_tree,
    "Random Forest": rand_forest,
    "XGBoost": xboost,
}

# Train each model, predict on the held-out set and report its R^2
predictions = {}
r2_scores = {}
for label, model in models.items():
    model.fit(X_train, y_train)
    predictions[label] = model.predict(X_test)
    r2_scores[label] = r2_score(y_test, predictions[label])
    print(f"{label} R^2: ", r2_scores[label])

# Keep the original per-model names available for any later cell that uses them
y_pred_lin_reg = predictions["Linear Regression"]
y_pred_dec_tree = predictions["Decision Tree"]
y_pred_rand_forest = predictions["Random Forest"]
xboost_pred = predictions["XGBoost"]
r2_lin_reg = r2_scores["Linear Regression"]
r2_dec_tree = r2_scores["Decision Tree"]
r2_rand_forest = r2_scores["Random Forest"]
r2_xboost = r2_scores["XGBoost"]

In [2]:
import joblib
import pandas as pd
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def _encode_object_columns(features):
    """Return a copy of ``features`` whose object-dtype columns are label-encoded.

    Working on a copy leaves the caller's DataFrame untouched and avoids
    pandas' SettingWithCopyWarning when ``features`` is a column selection.
    """
    encoded = features.copy()
    for column in encoded.select_dtypes(include=['object']).columns:
        # A fresh encoder per column: each column has its own set of classes.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded


def _fit_and_dump(models, X_train, y_train):
    """Fit each ``(filename, estimator)`` pair on the training data and save it with joblib."""
    for filename, model in models:
        model.fit(X_train, y_train)
        joblib.dump(model, filename)


def train_and_save_models(data_path, target_variable):
    """Train the models for one prediction target and save them as ``.pkl`` files.

    Args:
        data_path: Path of the CSV file to read with ``pd.read_csv``.
        target_variable: Either ``"Consommation Énergétique"`` (three
            regressors predicting ``Conso_5_usages_é_finale``) or
            ``"Étiquette DPE"`` (three classifiers predicting
            ``Etiquette_DPE``).

    Raises:
        ValueError: If ``target_variable`` is not one of the two supported
            values. Previously such a call trained nothing and still printed
            the success message.

    Side effects:
        Writes three joblib files into the current working directory and
        prints a confirmation message.
    """
    supported_targets = ("Consommation Énergétique", "Étiquette DPE")
    # Validate before doing any I/O so a typo fails fast instead of silently
    # reporting success.
    if target_variable not in supported_targets:
        raise ValueError(
            f"Variable cible inconnue : {target_variable!r}. "
            f"Valeurs attendues : {supported_targets}."
        )

    # Load the data
    data = pd.read_csv(data_path)

    if target_variable == "Consommation Énergétique":
        X = _encode_object_columns(data[[
            'Type_bâtiment',
            'Qualité_isolation_enveloppe',
            'Etiquette_GES',
            'Surface_habitable_logement',
            'Etiquette_DPE',
            'Type_installation_chauffage',
            'Ubat_W/m²_K',
            'Qualité_isolation_murs',
            'Type_énergie_n°1',
            'Qualité_isolation_plancher_bas',
            'Méthode_application_DPE',
            'Qualité_isolation_menuiseries'
        ]])
        y = data["Conso_5_usages_é_finale"]
        test_size = 0.25
        models = [
            ("consommation_xgboost_model.pkl", XGBRegressor(random_state=42)),
            ("consommation_arbre_de_decision_model.pkl", DecisionTreeRegressor(random_state=42)),
            ("consommation_random_forest_model.pkl",
             RandomForestRegressor(n_estimators=100, random_state=42)),
        ]
    else:  # "Étiquette DPE" (guaranteed by the validation above)
        X = _encode_object_columns(data[
            [
                "Conso_chauffage_é_primaire",
                "Conso_5_usages_é_finale",
                "Emission_GES_5_usages_par_m²",
                "Etiquette_GES",
                "Coût_éclairage"
            ]
        ])
        # The class labels are encoded as integers as well.
        # NOTE(review): the fitted encoders are not saved, so the integer
        # classes predicted by the stored models cannot be mapped back to the
        # original labels from the .pkl files alone.
        y = LabelEncoder().fit_transform(data["Etiquette_DPE"])
        test_size = 0.2
        models = [
            ("etiquette_knn_model.pkl", KNeighborsClassifier(n_neighbors=5)),
            ("etiquette_arbre_de_decision_model.pkl", DecisionTreeClassifier(random_state=42)),
            ("etiquette_random_forest_model.pkl",
             RandomForestClassifier(n_estimators=100, random_state=42)),
        ]

    # Only the training part of the split is used; the held-out part is
    # discarded here, so the saved models are fitted on the same rows as before.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
    _fit_and_dump(models, X_train, y_train)

    print(f"Modèles pour {target_variable} entraînés et sauvegardés avec succès.")


In [None]:

# Uncomment to also train and save the "Étiquette DPE" classifiers:
# train_and_save_models("../data/preprocessed_data.csv", "Étiquette DPE")

In [3]:
# Example run: train and save the energy-consumption regressors
train_and_save_models(
    data_path="../data/preprocessed_data.csv",
    target_variable="Consommation Énergétique",
)


Modèles pour Consommation Énergétique entraînés et sauvegardés avec succès.
