# Projet 4 : Anticipez les besoins en consommation de bâtiments
Date début : 12/12/2024

# Importation des librairies utiles

In [1]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import colormaps
from matplotlib import colors
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
%matplotlib inline

import seaborn as sns

import math

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from scipy.stats import chi2_contingency
import statsmodels.formula.api as smf
import statsmodels.api as sm

import csv

from scipy import stats

import missingno as msno

In [2]:
from sklearn import linear_model

In [3]:
from sklearn.svm import SVR

In [4]:
from sklearn.ensemble import RandomForestRegressor

In [5]:
from sklearn.ensemble import GradientBoostingRegressor

In [6]:
from sklearn.metrics import r2_score

# Prétraitement des données

In [7]:
def normalize_y(y):
    """Log-transform a target and summarise the transformed values.

    Parameters
    ----------
    y : array-like
        Target values. ``np.log10`` is applied as-is; no check for
        non-positive entries is made here.

    Returns
    -------
    tuple
        ``(log10(y), std, mean)`` where the standard deviation and the mean
        are both computed on the log-transformed values.
    """
    log_target = np.log10(y)
    return log_target, np.std(log_target), np.mean(log_target)

In [8]:
def std_x(X_train, X_test):
    """Standardise both feature sets with a scaler fitted on the training set.

    The ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    both inputs, so the test set is transformed with training statistics.

    Returns
    -------
    tuple
        ``(X_train_std, X_test_std)`` as returned by ``StandardScaler.transform``.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

# Prédiction

In [9]:
def calcul_predict(X, predictor, mode=None, s_int=None):
    """Predict with ``predictor`` and map the output back according to ``mode``.

    Parameters
    ----------
    X : array-like
        Features handed to ``predictor.predict``.
    predictor : object
        Any fitted object exposing ``predict``.
    mode : {'norm', 'int', None}
        ``'norm'`` returns ``10 ** prediction`` (inverse of a log10 transform),
        ``'int'`` returns ``prediction * s_int``, anything else returns the raw
        prediction.
    s_int : array-like or scalar, optional
        Multiplier used only when ``mode == 'int'``.
    """
    raw = predictor.predict(X)
    if mode == 'norm':
        return np.power(10, raw)
    if mode == 'int':
        return raw * s_int
    return raw

In [10]:
def prediction_reglinear(X_train, y_train, X_test, mode=None, s_int_train=None, s_int_test=None):
    """Fit an ordinary least-squares baseline and predict on both sets.

    ``mode``, ``s_int_train`` and ``s_int_test`` are forwarded to
    ``calcul_predict`` to post-process the raw predictions.

    Returns
    -------
    tuple
        ``(y_pred_train, y_pred_test, fitted LinearRegression)``.
    """
    baseline = linear_model.LinearRegression().fit(X_train, y_train)
    train_pred = calcul_predict(X_train, baseline, mode, s_int_train)
    test_pred = calcul_predict(X_test, baseline, mode, s_int_test)
    return train_pred, test_pred, baseline

In [11]:
def prediction_clf(model, X_train, y_train, X_test, param_grid, score, mode=None, s_int_train=None, s_int_test=None):
    """Tune ``model`` with a 5-fold grid search and predict on both sets.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    X_train, y_train : array-like
        Data the grid search is fitted on.
    X_test : array-like
        Features to predict once the search is fitted.
    param_grid : dict or list of dict
        Grid passed to ``GridSearchCV``.
    score : str or callable
        Passed as ``scoring`` to ``GridSearchCV``.
    mode, s_int_train, s_int_test
        Forwarded to ``calcul_predict`` to post-process the predictions.

    Returns
    -------
    tuple
        ``(y_pred_train, y_pred_test, fitted GridSearchCV)``.
    """
    # Imported locally: none of the visible import cells bring
    # ``model_selection`` into scope, so the bare name raised a NameError
    # on a fresh kernel.
    from sklearn import model_selection

    clf = model_selection.GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring=score
    )
    clf.fit(X_train, y_train)
    y_pred_train = calcul_predict(X_train, clf, mode, s_int_train)
    y_pred_test = calcul_predict(X_test, clf, mode, s_int_test)
    return y_pred_train, y_pred_test, clf

In [12]:
def prediction_L(X_train, y_train, X_test, param_grid_dict, score, mode=None, s_int_train=None, s_int_test=None):
    """Run the linear family: OLS baseline plus grid-searched Ridge, Lasso and ElasticNet.

    ``param_grid_dict`` must provide the keys ``'ridge'``, ``'lasso'`` and
    ``'EN'``. Predictions are collected column by column under the names
    ``'baseline'``, ``'ridge'``, ``'lasso'`` and ``'EN'``.

    Returns
    -------
    tuple
        ``(Y_train, Y_test, lr, clf_dict)`` where the first two are DataFrames
        of predictions, ``lr`` is the fitted baseline and ``clf_dict`` maps each
        penalised model name to its fitted grid search.
    """
    Y_train = pd.DataFrame()
    Y_test = pd.DataFrame()
    clf_dict = {}

    base_train, base_test, lr = prediction_reglinear(
        X_train, y_train, X_test, mode, s_int_train, s_int_test)
    Y_train['baseline'] = base_train
    Y_test['baseline'] = base_test

    penalised = (
        ('ridge', linear_model.Ridge),
        ('lasso', linear_model.Lasso),
        ('EN', linear_model.ElasticNet),
    )
    for key, estimator_cls in penalised:
        pred_train, pred_test, search = prediction_clf(
            estimator_cls(), X_train, y_train, X_test, param_grid_dict[key],
            score, mode, s_int_train, s_int_test)
        Y_train[key] = pred_train
        Y_test[key] = pred_test
        clf_dict[key] = search

    return Y_train, Y_test, lr, clf_dict

In [13]:
def prediction_NL(X_train, y_train, X_test, param_grid_dict, score, Y_test, Y_train, clf_dict, mode=None, s_int_train=None, s_int_test=None):
    """Add grid-searched SVR, random-forest and gradient-boosting predictions.

    ``Y_train``, ``Y_test`` and ``clf_dict`` are updated in place with the
    keys ``'SVR'``, ``'RFR'`` and ``'GBR'`` (which must also exist in
    ``param_grid_dict``) and returned for convenience. Note that ``Y_test``
    comes before ``Y_train`` in the signature.
    """
    # Factories rather than instances so each estimator is built right before
    # its own grid search, in the same order as the results are stored.
    factories = (
        ('SVR', lambda: SVR(tol=10, max_iter=2000)),
        ('RFR', RandomForestRegressor),
        ('GBR', GradientBoostingRegressor),
    )
    for key, make_estimator in factories:
        pred_train, pred_test, search = prediction_clf(
            make_estimator(), X_train, y_train, X_test, param_grid_dict[key],
            score, mode, s_int_train, s_int_test)
        Y_train[key] = pred_train
        Y_test[key] = pred_test
        clf_dict[key] = search

    return Y_train, Y_test, clf_dict

In [14]:
def prediction_tot(X_train, y_train, X_test, param_grid_dict, score, mode=None, s_int_train=None, s_int_test=None):
    """Run the linear models first, then the non-linear ones, on the same data.

    Returns
    -------
    tuple
        ``(Y_train, Y_test, lr, clf_dict)``: prediction tables for both sets,
        the fitted linear baseline, and the fitted grid searches by model name.
    """
    linear_train, linear_test, lr, searches = prediction_L(
        X_train, y_train, X_test, param_grid_dict, score,
        mode, s_int_train, s_int_test)
    # prediction_NL expects the test table before the train table.
    all_train, all_test, searches = prediction_NL(
        X_train, y_train, X_test, param_grid_dict, score,
        linear_test, linear_train, searches,
        mode, s_int_train, s_int_test)
    return all_train, all_test, lr, searches

# Calcul des indicateurs

In [15]:
def calcul_sqrt_RMSE(name_models, category_names, Y_dict, y_test):
    """Build a table of RMSE values, one row per model and one column per category.

    Parameters
    ----------
    name_models : sequence
        Model names; becomes the ``'Model'`` column of the result.
    category_names : iterable
        Keys of ``Y_dict`` to evaluate; each becomes a result column.
    Y_dict : dict
        Maps a category name to a DataFrame of predictions (one column per
        model). Assumes those columns are the leading entries of
        ``name_models`` in the same order -- TODO confirm with callers.
    y_test : array-like
        True values, compared row by row (by position) with every prediction
        column.

    Returns
    -------
    pandas.DataFrame
        Column ``'Model'`` plus one RMSE column per category. Models missing
        from a category are padded with NaN at the end of the column.
    """
    # Compare by position. Prediction tables are built from ``predict()``
    # arrays, whereas ``y_test`` may be a Series that kept its original row
    # labels; label-aligned subtraction would then pair up the wrong rows (or
    # none at all) and silently return a wrong or NaN RMSE.
    y_true = np.asarray(y_test)

    model_std = pd.DataFrame(data=name_models, columns=['Model'])
    for category in category_names:
        predictions = Y_dict[category]
        rmse = [
            # Series.mean keeps the NaN-skipping reduction of the original code.
            np.sqrt(pd.Series((predictions[model].to_numpy() - y_true) ** 2).mean())
            for model in predictions.columns
        ]
        nb_nan = len(name_models) - len(rmse)
        model_std[category] = np.concatenate((rmse, np.full(nb_nan, np.nan)))
    return model_std

In [16]:
def calcul_r2(name_models, category_names, Y_dict, y_test, y_train):
    """Build a table of R2 scores, one row per model and one column per category.

    A category whose name ends with ``'train'`` is scored against ``y_train``;
    every other category is scored against ``y_test``. Categories with fewer
    prediction columns than ``name_models`` are padded with NaN at the end.

    Returns
    -------
    pandas.DataFrame
        Column ``'Model'`` plus one R2 column per category.
    """
    r2_table = pd.DataFrame(data=name_models, columns=['Model'])
    for category in category_names:
        predictions = Y_dict[category]
        reference = y_train if category.endswith('train') else y_test
        scores = [r2_score(reference, predictions[name]) for name in predictions.columns]
        padding = np.full(len(name_models) - len(predictions.columns), np.nan)
        r2_table[category] = np.concatenate((scores, padding))
    return r2_table

In [141]:
def create_Y_error(Y, y):
    """Return the signed prediction error (prediction minus truth) per model.

    Parameters
    ----------
    Y : pandas.DataFrame
        Predictions, one column per model.
    y : array-like
        True values, matched to the rows of ``Y`` by position.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``Y``, holding ``Y[col] - y``.
    """
    # Subtract by position. ``Y`` is built from ``predict()`` arrays, so a
    # label-aligned ``Series - Series`` against a ``y`` that kept its original
    # row labels would produce NaN rows instead of errors.
    y_true = np.asarray(y)
    errors = {name: Y[name].to_numpy() - y_true for name in Y.columns}
    return pd.DataFrame(errors, index=Y.index, columns=Y.columns)

In [139]:
def create_Y_error_tot(Y_dict, y):
    """Apply ``create_Y_error`` to every prediction table in ``Y_dict``.

    Returns a dict with the same keys as ``Y_dict`` whose values are the error
    tables computed against the single reference ``y``.
    """
    return {
        category: create_Y_error(predictions, y)
        for category, predictions in Y_dict.items()
    }

In [21]:
def calcul_quantile_df(quantile_prop, Y_dict):
    """Compute one quantile of every column of every table in ``Y_dict``.

    Parameters
    ----------
    quantile_prop : float
        Quantile level passed to ``DataFrame.quantile``.
    Y_dict : dict
        Maps a category name to a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of the first table in ``Y_dict``, with one
        column per category. Values are assigned by position, so every table
        needs as many columns as the first one.
    """
    categories = list(Y_dict)
    quantiles = pd.DataFrame(index=Y_dict[categories[0]].columns)
    for category in categories:
        quantiles[category] = Y_dict[category].quantile(quantile_prop).to_numpy()
    return quantiles

In [22]:
def calcul_quantile_tot(Y_dict):
    """Compute the quartile and decile tables used by the error plots.

    Returns
    -------
    tuple
        ``(q1, q3, median, d1, d9)``: the 0.25, 0.75, 0.5, 0.1 and 0.9
        quantile tables, each produced by ``calcul_quantile_df``.
    """
    levels = (0.25, 0.75, 0.5, 0.1, 0.9)
    return tuple(calcul_quantile_df(level, Y_dict) for level in levels)

# Importance des features

In [17]:
def _top_features_by_abs_coef(coef, features, n_top):
    """Return the ``n_top`` feature names with the largest absolute coefficient."""
    order = np.argsort(np.abs(coef))[::-1]
    return np.asarray(features)[order][:n_top]


def importance_features_lin(idx_max, feature_importances_df, importance_name, lr_reg, clf_lin_names, clf_dict, features):
    """Flag, for each linear model, the ``idx_max`` features with the largest |coefficient|.

    ``feature_importances_df`` is modified in place; nothing is returned.

    Parameters
    ----------
    idx_max : int
        Number of top-ranked features flagged per model.
    feature_importances_df : pandas.DataFrame
        Must contain a ``'feature'`` column. Receives one boolean column
        ``'baseline'``, one boolean column per name in ``clf_lin_names``, and
        the running total in ``importance_name``.
    importance_name : str
        Name of the column that counts in how many models a feature is flagged.
    lr_reg : fitted estimator
        Baseline model exposing ``coef_``.
    clf_lin_names : iterable of str
        Keys of ``clf_dict`` to process.
    clf_dict : dict
        Maps a model name to a fitted search exposing ``best_estimator_.coef_``.
    features : array-like
        Feature names, in the same order as the coefficients.
    """
    # The baseline used to be ranked with a hard-coded top-16 cut-off while all
    # other models honoured ``idx_max``; every model now uses the same cut-off.
    baseline_top = _top_features_by_abs_coef(lr_reg.coef_, features, idx_max)
    feature_importances_df['baseline'] = feature_importances_df['feature'].isin(baseline_top)
    feature_importances_df[importance_name] = feature_importances_df['baseline']

    for model in clf_lin_names:
        coef = clf_dict[model].best_estimator_.coef_
        model_top = _top_features_by_abs_coef(coef, features, idx_max)
        feature_importances_df[model] = feature_importances_df['feature'].isin(model_top)
        # Running count: previous total plus this model's boolean flag.
        feature_importances_df[importance_name] = feature_importances_df[[importance_name, model]].sum(axis=1)

In [18]:
def importance_features_nonlin(feature_importances_df, importance_name, clf_nonlin_names, clf_dict, features):
    """Flag, per model, the top features whose cumulated importance first exceeds 0.95.

    ``feature_importances_df`` (which must have a ``'feature'`` column) is
    modified in place: one boolean column is written per name in
    ``clf_nonlin_names``, and ``importance_name`` receives the row-wise sum of
    those columns. Each entry of ``clf_dict`` must expose
    ``best_estimator_.feature_importances_``. Nothing is returned.
    """
    for name in clf_nonlin_names:
        importances = clf_dict[name].best_estimator_.feature_importances_
        ranking = np.argsort(importances)[::-1]  # most important first

        running_total = np.cumsum(importances[ranking])
        # Position of the first feature that pushes the running total above 0.95.
        first_over = next(pos for pos, total in enumerate(running_total) if total > 0.95)
        selected = features[ranking][0:first_over + 1]

        feature_importances_df[name] = feature_importances_df['feature'].isin(selected)
    feature_importances_df[importance_name] = feature_importances_df[clf_nonlin_names].sum(axis=1)

# Représentations

In [23]:
def _plot_quantile_panel(ax, categories, low_df, high_df, median_df, tick_labels, title, bar_width=0.2):
    """Draw one panel of floating bars (from ``low_df`` to ``high_df``) with median markers."""
    # ``-len(categories) % 2`` parses as ``(-len(categories)) % 2``, i.e. the
    # parity of the number of categories (0 or 1); it is the first bar slot.
    slot = -len(categories) % 2
    for category in categories:
        x = np.arange(len(low_df.index)) + slot * bar_width
        ax.bar(x=x, height=high_df[category] - low_df[category], bottom=low_df[category],
               width=bar_width, label=category)
        ax.scatter(x=x, y=median_df[category], color='k', marker='d')
        slot += 1
    ax.axhline(0, color='r', linestyle=':')
    # NOTE(review): ticks sit under the bars of the LAST category, not at the
    # centre of each group -- kept as in the original layout; confirm intent.
    ax.set_xticks(x, tick_labels)
    ax.set_axisbelow(True)
    ax.grid()
    ax.set_title(title)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)


def plot_quantiles(q1_df, q3_df, d1_df, d9_df, median_df):
    """Plot inter-quartile and inter-decile ranges side by side, with medians.

    Each input is a table indexed by model with one column per category (as
    produced by ``calcul_quantile_df``). The left panel shows bars from Q1 to
    Q3, the right panel bars from D1 to D9; both overlay the median as a black
    diamond and a dotted red line at zero. Categories and tick labels are taken
    from ``q1_df`` for both panels.
    """
    categories = q1_df.columns
    _, (ax_quartiles, ax_deciles) = plt.subplots(1, 2, figsize=(15, 5))

    _plot_quantile_panel(ax_quartiles, categories, q1_df, q3_df, median_df,
                         q1_df.index, 'Quartiles')
    _plot_quantile_panel(ax_deciles, categories, d1_df, d9_df, median_df,
                         q1_df.index, 'Déciles')

In [1]:
def plot_boxplot(q1_df, q3_df, d1_df, d9_df, median_df, ax):
    """Draw a box-plot-like summary of the errors on ``ax``.

    For every category (column of ``q1_df``) and every model (row), a bar spans
    Q1 to Q3, a black diamond marks the median and two ``+`` markers show the
    first and ninth deciles. A dotted red line marks zero.

    Parameters
    ----------
    q1_df, q3_df, d1_df, d9_df, median_df : pandas.DataFrame
        Quantile tables indexed by model, one column per category.
    ax : matplotlib.axes.Axes
        Axes to draw on; the legend is attached to this axes as well.
    """
    categories = q1_df.columns
    bar_width = 0.2
    palette = list(colors.TABLEAU_COLORS.values())
    base_positions = np.arange(len(q1_df.index))
    parity = len(categories) % 2

    for i, category in enumerate(categories):
        # Cycle through the palette so more categories than colours does not
        # raise an IndexError.
        colour = palette[i % len(palette)]
        x = base_positions + (i - parity) * bar_width
        ax.bar(x=x, height=q3_df[category] - q1_df[category], bottom=q1_df[category],
               width=bar_width, color=colour, label=category + '_quartiles')
        ax.scatter(x=x, y=median_df[category], color='k', marker='d')
        ax.scatter(x=x, y=d1_df[category], marker='+', color=colour, label=category + '_deciles')
        ax.scatter(x=x, y=d9_df[category], marker='+', color=colour)

    ax.axhline(0, color='r', linestyle=':')
    ax.set_xticks(base_positions, q1_df.index)
    ax.set_axisbelow(True)
    ax.grid()
    ax.set_title('Quartiles et déciles des erreurs')
    # Attach the legend to the axes that was passed in; ``plt.legend`` would
    # target whichever axes happens to be current.
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

In [4]:
def plot_RMSE_boxplot(y, std_df, study_list, q1_df, q3_df, d1_df, d9_df, median_df):
    """Show RMSE bars (left) next to the error quantile summary (right).

    The left panel plots the ``study_list`` columns of ``std_df`` per model,
    with a dotted red line at the smallest non-NaN value and a dotted black
    line at ``np.std(y)``. The right panel is drawn by ``plot_boxplot``.
    """
    plt.figure(figsize=(15, 5))

    rmse_ax = plt.subplot(1, 2, 1)
    std_df.plot(ax=rmse_ax, kind='bar', x='Model', y=study_list, rot=0)
    best_rmse = np.nanmin(std_df[study_list].values)
    target_spread = np.std(y)
    rmse_ax.axhline(best_rmse, color='r', linestyle=':')
    rmse_ax.axhline(target_spread, color='k', linestyle=':')
    rmse_ax.set_title('RMSE')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

    error_ax = plt.subplot(1, 2, 2)
    plot_boxplot(q1_df, q3_df, d1_df, d9_df, median_df, error_ax)

In [3]:
def plot_r2(r2_df, study_list, train_list, ax):
    """Plot R2 bars for ``study_list`` and overlay ``train_list`` scores as diamonds.

    Parameters
    ----------
    r2_df : pandas.DataFrame
        Table with a ``'Model'`` column and one R2 column per category.
    study_list : list of str
        Columns drawn as bars.
    train_list : list of str
        Columns drawn as diamond markers on top of the bars.
    ax : matplotlib.axes.Axes
        Axes to draw on; the legend is attached to this axes as well.
    """
    bar_width = 0.2
    palette = list(colors.TABLEAU_COLORS.values())
    r2_df.plot(ax=ax, kind='bar', x='Model', y=study_list, width=len(train_list) * bar_width, rot=0)

    base_positions = np.arange(len(r2_df['Model']))
    parity = len(train_list) % 2
    for i, column in enumerate(train_list):
        ax.scatter(x=base_positions + (i - parity) * bar_width, y=r2_df[column],
                   marker='d', color=palette[i % len(palette)], edgecolor='k', label=column)

    # nanmax: the R2 table is NaN-padded for models missing from a category,
    # and a plain max would turn the reference line into NaN (nothing drawn).
    ax.axhline(np.nanmax(r2_df[study_list].values), color='r', linestyle=':')
    ax.set_title('r2 Modèle complet et réduit')
    # Attach the legend to the axes that was passed in rather than to the
    # current pyplot axes.
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

In [25]:
def plot_RMSE_R2(std_df, r2_df, study_list, train_list):
    """Show RMSE bars (left) next to R2 scores (right) for the same categories.

    The left panel plots the ``study_list`` columns of ``std_df`` per model
    with a dotted red line at the smallest non-NaN RMSE; the right panel is
    drawn by ``plot_r2``.
    """
    plt.figure(figsize=(15, 5))

    rmse_ax = plt.subplot(1, 2, 1)
    std_df.plot(ax=rmse_ax, kind='bar', x='Model', y=study_list, rot=0)
    # nanmin: the RMSE table is NaN-padded for models missing from a category,
    # and a plain min would turn the reference line into NaN (nothing drawn).
    rmse_ax.axhline(np.nanmin(std_df[study_list].values), color='r', linestyle=':')
    rmse_ax.set_title('RMSE')
    rmse_ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3)

    r2_ax = plt.subplot(1, 2, 2)
    plot_r2(r2_df, study_list, train_list, r2_ax)

In [26]:
def plot_compare(stats_df, compare_list1, compare_list2, title1, title2):
    """Draw two horizontal bar charts of ``stats_df`` side by side.

    The left panel plots the columns in ``compare_list1`` under ``title1``,
    the right panel those in ``compare_list2`` under ``title2``; both use the
    ``'Model'`` column as the category axis.
    """
    plt.figure(figsize=(15, 5))

    panels = ((compare_list1, title1), (compare_list2, title2))
    for position, (columns, title) in enumerate(panels, start=1):
        panel_ax = plt.subplot(1, 2, position)
        stats_df.plot(ax=panel_ax, kind='barh', x='Model', y=columns)
        panel_ax.set_title(title)