In [2]:
# Import data
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle


data = pd.read_csv('../data/preprocessed_data.csv')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

selected_columns = data[['Surface_habitable_logement', 'Ubat_W/m²_K', 'Etiquette_DPE', 'Type_énergie_principale_chauffage']]
y = data["Conso_5_usages_é_finale"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(selected_columns, y, test_size=0.25, random_state=42)

# Initialize the models
lin_reg = LinearRegression()
dec_tree = DecisionTreeRegressor()
rand_forest = RandomForestRegressor(n_estimators=100)
xboost = XGBRegressor()

# Train the models
lin_reg.fit(X_train, y_train)
dec_tree.fit(X_train, y_train)
rand_forest.fit(X_train, y_train)
xboost.fit(X_train, y_train)

# Make predictions
y_pred_lin_reg = lin_reg.predict(X_test)
y_pred_dec_tree = dec_tree.predict(X_test)
y_pred_rand_forest = rand_forest.predict(X_test)
xboost_pred = xboost.predict(X_test)

# Calculate le r squared
r2_lin_reg = r2_score(y_test, y_pred_lin_reg)
r2_dec_tree = r2_score(y_test, y_pred_dec_tree)
r2_rand_forest = r2_score(y_test, y_pred_rand_forest)
r2_xboost = r2_score(y_test, xboost_pred)
print("Linear Regression R^2: ", r2_lin_reg)
print("Decision Tree R^2: ", r2_dec_tree)
print("Random Forest R^2: ", r2_rand_forest)
print("XGBoost R^2: ", r2_xboost)

{'Linear Regression': {'MSE': np.float64(11868171.21289768), 'R2': 0.5338432926450076}, 'Decision Tree': {'MSE': np.float64(6710212.206684154), 'R2': 0.7364370321417497}, 'Random Forest': {'MSE': np.float64(4774280.635329491), 'R2': 0.8124763368314676}}


In [2]:
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd

# Supposez que data soit votre DataFrame initial avec toutes les colonnes de données et la cible
# On sépare X (les caractéristiques) et y (la variable cible)
y = data["Conso_5_usages_é_finale"]
# Filtrage des colonnes pour éviter le data leakage
X = data[[col for col in data.columns if not ("Coût" in col or "Conso" in col or "GES" in col or "coût" in col)]]

# Création du sélecteur de caractéristiques pour sélectionner les 5 meilleures
selector = SelectKBest(score_func=f_classif, k=5)
X_new = selector.fit_transform(X, y)

# Récupération des noms des caractéristiques sélectionnées
selected_features = selector.get_support(indices=True)
selected_features_names = [X.columns[i] for i in selected_features]
print("5 meilleures caractéristiques :", selected_features_names)



  f = msb / msw


5 meilleures caractéristiques : ['Ubat_W/m²_K', 'Méthode_application_DPE', 'Deperditions_planchers_bas', 'Surface_habitable_logement', 'Type_installation_chauffage']


In [None]:
import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

def train_and_save_models(data_path, target_variable):
    # Charger les données
    data = pd.read_csv(data_path)
    
    if target_variable == "Consommation Énergétique":
        X = data[['Surface_habitable_logement', 
        'Ubat_W/m²_K',
        'Etiquette_DPE', 
        'Type_énergie_principale_chauffage']]

        y = data["Conso_5_usages_é_finale"]
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42
        )
        
        lin_reg = LinearRegression()
        lin_reg.fit(X_train, y_train)
        joblib.dump(lin_reg, "consommation_linear_regression_model.pkl")
        
        dec_tree = DecisionTreeRegressor()
        dec_tree.fit(X_train, y_train)
        joblib.dump(dec_tree, "consommation_arbre_de_decision_model.pkl")
        
        rand_forest = RandomForestRegressor(n_estimators=100)
        rand_forest.fit(X_train, y_train)
        joblib.dump(rand_forest, "consommation_random_forest_model.pkl")

    elif target_variable == "Étiquette DPE":
        X = data[[
            "Conso_chauffage_é_primaire",
            "Conso_5_usages_é_finale",
            "Emission_GES_5_usages_par_m²",
            "Etiquette_GES",
            "Coût_éclairage"
        ]]
        y = data["Etiquette_DPE"]
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(X_train, y_train)
        joblib.dump(knn, "etiquette_knn_model.pkl")
        
        dec_tree = DecisionTreeClassifier()
        dec_tree.fit(X_train, y_train)
        joblib.dump(dec_tree, "etiquette_arbre_de_decision_model.pkl")
        
        rand_forest = RandomForestClassifier(n_estimators=100)
        rand_forest.fit(X_train, y_train)
        joblib.dump(rand_forest, "etiquette_random_forest_model.pkl")

    print(f"Modèles pour {target_variable} entraînés et sauvegardés avec succès.")

In [10]:

train_and_save_models("../data/preprocessed_data.csv", "Étiquette DPE")

Modèles pour Étiquette DPE entraînés et sauvegardés avec succès.


In [11]:
# Exemple d'appel de la fonction
train_and_save_models("../data/preprocessed_data.csv", "Consommation Énergétique")


Modèles pour Consommation Énergétique entraînés et sauvegardés avec succès.
