In [56]:
#Importation des librairies 
import pandas as pd 
import os
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import BaseEstimator, TransformerMixin
import category_encoders as ce
import joblib

In [57]:
# Change to the current working directory
current_workdir = os.getcwd()
os.chdir(current_workdir)
DATASET = "https://media.githubusercontent.com/media/jdalfons/M2-SISE-Enedis/refs/heads/main/datasets/DPE_Enedis_69.csv"

In [58]:
df = pd.read_csv(DATASET, sep=';', dtype={'Isolation_toiture_(0/1)': 'str'}, low_memory=False)

In [None]:
var_explicatives = ["Etiquette_DPE", 
                    "Type_bâtiment", 
                    "Année_construction", 
                    "Classe_inertie_bâtiment", 
                    "Hauteur_sous-plafond", 
                    "Surface_habitable_logement",  
                    "Type_énergie_principale_chauffage", 
                    "Isolation_toiture_(0/1)", 
                    "Code_postal_(BAN)"
                    ]

target = ["Conso_5_usages_é_finale"]

In [60]:
df_dpe = df[var_explicatives+target]

In [61]:
df_quanti = df_dpe.select_dtypes(include=['number'])

#Calcul du IQR pour chaque colonne 
Quanti_bas = df_quanti.quantile(0.025)
Quanti_haut = df_quanti.quantile(0.75)
IQR = Quanti_haut-Quanti_bas

#Filtrage des données sans outliers 
df_dpe_filtered = df_dpe[~((df_quanti < (Quanti_bas - 1.5 * IQR)) | (df_quanti > (Quanti_haut + 1.5 * IQR))).any(axis=1)]

df_dpe_filtered.describe()

Unnamed: 0,Année_construction,Hauteur_sous-plafond,Surface_habitable_logement,Code_postal_(BAN),Conso_5_usages_é_finale
count,146697.0,224547.0,224269.0,224547.0,224546.0
mean,1980.050989,2.50171,60.967855,69165.693075,8578.633043
std,24.810727,0.034569,23.632222,187.528718,5012.296495
min,1780.0,2.3,1.0,69001.0,306.3
25%,1964.0,2.5,45.1,69007.0,4765.0
50%,1976.0,2.5,63.0,69100.0,7457.3
75%,2001.0,2.5,76.7,69290.0,11461.0
max,2024.0,2.6,133.7,69720.0,26387.5


In [62]:
df['Isolation_toiture_(0/1)'] = df['Isolation_toiture_(0/1)'].replace({True: 'Oui', False: 'Non', None: 'Inconnue'})

In [63]:
df['Isolation_toiture_(0/1)']

0         Inconnue
1         Inconnue
2         Inconnue
3         Inconnue
4         Inconnue
            ...   
290819        True
290820       False
290821       False
290822       False
290823        True
Name: Isolation_toiture_(0/1), Length: 290824, dtype: object

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_enc = df.copy()

df_enc['Etiquette_DPE'] = label_encoder.fit_transform(df_enc['Etiquette_DPE'])
df_enc['Type_bâtiment'] = label_encoder.fit_transform(df_enc['Type_bâtiment'])
df_enc['Classe_inertie_bâtiment'] = label_encoder.fit_transform(df_enc['Classe_inertie_bâtiment'])
df_enc['Type_énergie_principale_chauffage'] = label_encoder.fit_transform(df_enc['Type_énergie_principale_chauffage'])

# Check if 'nom_commune' exists in the dataframe before encoding
if 'nom_commune' in df_enc.columns:
	df_enc['nom_commune'] = label_encoder.fit_transform(df_enc['nom_commune'])

# df_enc['Code_postal_(BAN)'] = label_encoder.fit_transform(df_enc['Code_postal_(BAN)'])
df_enc['Isolation_toiture_(0/1)'] = label_encoder.fit_transform(df_enc['Isolation_toiture_(0/1)'])


In [None]:
# Loop through the specified columns
for col in [
    'Etiquette_DPE', 
    'Type_bâtiment', 
    'Classe_inertie_bâtiment', 
    'Type_énergie_principale_chauffage', 
    'nom_commune', 
    'Isolation_toiture_(0/1)'
]:
    # Get the value counts for the encoded dataframe
    unique_values_enc = df_enc[col].value_counts()
    
    # Get the value counts for the original dataframe
    unique_values_df = df[col].value_counts()
    
    # Create a dictionary mapping original tags to encoded values
    tag_value_dict = {tag: val for tag, val in zip(unique_values_df.index, unique_values_enc.index)}
    
    # Print the dictionary for the current column
    print(f"\n{col} Dictionary with tags from df and values from df_enc for {col}:")
    
    # Sort the dictionary by keys
    sorted_tag_value_dict = dict(sorted(tag_value_dict.items()))
    
    # Print the sorted dictionary
    print(sorted_tag_value_dict)


Etiquette_DPE Dictionary with tags from df and values from df_enc for Etiquette_DPE:
{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6}

Type_bâtiment Dictionary with tags from df and values from df_enc for Type_bâtiment:
{'appartement': 0, 'immeuble': 1, 'maison': 2}

Classe_inertie_bâtiment Dictionary with tags from df and values from df_enc for Classe_inertie_bâtiment:
{'Lourde': 0, 'Légère': 1, 'Moyenne': 2, 'Très lourde': 3}

Type_énergie_principale_chauffage Dictionary with tags from df and values from df_enc for Type_énergie_principale_chauffage:
{'Bois – Bûches': 5, 'Bois – Granulés (pellets) ou briquettes': 0, 'Bois – Plaquettes d’industrie': 8, 'Bois – Plaquettes forestières': 2, 'Charbon': 3, 'Fioul domestique': 12, 'GPL': 1, 'Gaz naturel': 7, 'Propane': 6, 'Réseau de Chauffage urbain': 9, 'Électricité': 10, "Électricité d'origine renouvelable utilisée dans le bâtiment": 4}

nom_commune Dictionary with tags from df and values from df_enc for nom_commune:
{'Affoux': 14

In [66]:
df_enc['Isolation_toiture_(0/1)'].unique()

array([1, 0, 2])

In [67]:
# encoded_values

In [68]:
#On enlève les NA de la conso 
df_dpe_filtered = df_dpe_filtered.dropna(subset=["Conso_5_usages_é_finale"])
df_dpe_filtered.isnull().sum()

Etiquette_DPE                             0
Type_bâtiment                             0
Année_construction                    77850
Classe_inertie_bâtiment                1292
Hauteur_sous-plafond                      0
Surface_habitable_logement              278
Type_énergie_principale_chauffage      6759
Isolation_toiture_(0/1)              103160
Code_postal_(BAN)                         0
Conso_5_usages_é_finale                   0
dtype: int64

In [69]:
#On enlève les "immeubles"
df_dpe_filtered = df_dpe_filtered[df_enc['Type_bâtiment'] != 'immeuble']
df_dpe_filtered.shape

  df_dpe_filtered = df_dpe_filtered[df_enc['Type_bâtiment'] != 'immeuble']


(224546, 10)

In [70]:
df_dpe_filtered = df_dpe_filtered.dropna(subset=target)
df_dpe_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 224546 entries, 0 to 290823
Data columns (total 10 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Etiquette_DPE                      224546 non-null  object 
 1   Type_bâtiment                      224546 non-null  object 
 2   Année_construction                 146696 non-null  float64
 3   Classe_inertie_bâtiment            223254 non-null  object 
 4   Hauteur_sous-plafond               224546 non-null  float64
 5   Surface_habitable_logement         224268 non-null  float64
 6   Type_énergie_principale_chauffage  217787 non-null  object 
 7   Isolation_toiture_(0/1)            121386 non-null  object 
 8   Code_postal_(BAN)                  224546 non-null  int64  
 9   Conso_5_usages_é_finale            224546 non-null  float64
dtypes: float64(4), int64(1), object(5)
memory usage: 18.8+ MB


In [71]:
#Séparation variables explicatives et variable cible 
X = df_enc[var_explicatives]
X = X.fillna(0)
y = df_enc[target]
y = y.fillna(0)

#Train / Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

X_train.shape

(203576, 9)

In [72]:
#Colonnes numériques et catégorielles / traitement spécifique pour isolation toiture 
num_features = ["Année_construction", "Hauteur_sous-plafond", "Surface_habitable_logement"]
iso_feature = ["Isolation_toiture_(0/1)"]
cat_features = ["Etiquette_DPE", "Type_bâtiment", "Classe_inertie_bâtiment", "Type_énergie_principale_chauffage"]  

In [73]:
# Define the numerical transformer
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Define the categorical transformer
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ce.OneHotEncoder(handle_unknown='ignore'))
])

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10))
])



In [74]:
import pickle
from sklearn.metrics import r2_score


# Pipeline final avec les données encodées et imputées
pipeline.fit(X_train, y_train.values.ravel())

# Sauvegarde du pipeline complet
with open('pipeline_ml_regression.pkl', 'wb') as file:
    pickle.dump(pipeline, file)
    
# Predict on the test data
y_pred = pipeline.predict(X_test)

# Calculate the R-squared value
r2 = r2_score(y_test, y_pred)
print(f"R-squared value: {r2}")

R-squared value: 0.8830648669187138


In [75]:
import joblib

# Save the pipeline to a pickle file
joblib.dump(pipeline, 'pipeline_ml_regression.pkl')

['pipeline_ml_regression.pkl']

In [76]:
import json

# Get the first row of X_train
first_row = X_train.iloc[0]

# Convert the row to a dictionary
first_row_dict = first_row.to_dict()

# Convert the dictionary to a JSON string
first_row_json = json.dumps(first_row_dict, indent=4)

print(first_row_json)

{
    "Etiquette_DPE": 3.0,
    "Type_b\u00e2timent": 0.0,
    "Ann\u00e9e_construction": 1918.0,
    "Classe_inertie_b\u00e2timent": 2.0,
    "Hauteur_sous-plafond": 2.5,
    "Surface_habitable_logement": 42.0,
    "Type_\u00e9nergie_principale_chauffage": 10.0,
    "Isolation_toiture_(0/1)": 0.0,
    "Code_postal_(BAN)": 69002.0
}


In [77]:
first_row_df = pd.DataFrame([first_row])
first_row_df

Unnamed: 0,Etiquette_DPE,Type_bâtiment,Année_construction,Classe_inertie_bâtiment,Hauteur_sous-plafond,Surface_habitable_logement,Type_énergie_principale_chauffage,Isolation_toiture_(0/1),Code_postal_(BAN)
133341,3.0,0.0,1918.0,2.0,2.5,42.0,10.0,0.0,69002.0


In [78]:
import joblib

# Load the pipeline from the pickle file
pipeline = joblib.load('pipeline_ml_regression.pkl')

# Make predictions using the loaded pipeline
y_pred = pipeline.predict(first_row_df)
print(y_pred)

[4006.08246753]


In [79]:
import pandas as pd

data = {
    "etiquette_dpe": 3.0,
    "type_batiment": 0.0,
    "annee_construction": 1921.0,
    "classe_inertie_batiment": 1.0,
    "hauteur_sous_plafond": 3.1,
    "surface_habitable_logement": 50.2,
    "type_energie_principale_chauffage": 11.0,
    "isolation_toiture": 1.0,
    "code_postal_ban": 69002.0
}

df_new = pd.DataFrame([data])
df_new


Unnamed: 0,etiquette_dpe,type_batiment,annee_construction,classe_inertie_batiment,hauteur_sous_plafond,surface_habitable_logement,type_energie_principale_chauffage,isolation_toiture,code_postal_ban
0,3.0,0.0,1921.0,1.0,3.1,50.2,11.0,1.0,69002.0


In [80]:
model = joblib.load("../models/pipeline_ml_regression.pkl")

In [82]:

del df