# Modelisation

1) Dans un premier temps, vous allez créer un jeu de données comportant seulement les observations qui
n’ont pas de valeur manquante. Il reste 2768 observations.

In [1]:

import pandas as pd
import numpy as np
# Lecture des données
## Charger les données ou les lire directement en précisant le chemin
path="/Users/hugoguilbot/VALDOM/INSA/Machine learning/Projet/global-data-on-sustainable-energy.csv"
energy=pd.read_csv(path,sep=",",header=0)
# Vérification du contenu

all_columns = energy.columns.tolist()

title_Density = all_columns[17]
energy[title_Density] = pd.to_numeric(energy[title_Density], errors='coerce')

energy['Year'] = energy['Year'].astype(str)

In [2]:
missing_values = energy.isnull().sum()

# Dropping specified variables with significant missing values
columns_to_drop = [
    'Renewable-electricity-generating-capacity-per-capita', 
    'Financial flows to developing countries (US $)', 
    'Renewables (% equivalent primary energy)'
]
df_cleaned = energy.drop(columns=columns_to_drop)

# Creating a dataset with only the observations that have no missing values
df_no_missing = df_cleaned.dropna()

# Checking the number of remaining observations to match the requirement
remaining_observations = len(df_no_missing)

remaining_observations

2768

2) Divisez ce jeu de données en un échantillon d’apprentissage et un échantillon test. Vous prendrez un
pourcentage de 20% pour l’échantillon test. Pourquoi cette étape est-elle nécessaire lorsque nous nous
concentrons sur les performances des algorithmes ?

In [3]:
df_cleaned.dropna(inplace=True)

In [4]:
df_cleaned.shape

(2768, 18)

In [5]:
from sklearn.preprocessing import OneHotEncoder
categorical_vars = ["Entity", "Year"]

# instantiate the one hot encoder
one_hot_encoder = OneHotEncoder(sparse=False, drop = "first")

In [6]:
# apply the one hot encoder logic 
encoder_vars_array = one_hot_encoder.fit_transform(df_cleaned[categorical_vars])

# create object for the feature names using the categorical variables
encoder_feature_names = one_hot_encoder.get_feature_names_out(categorical_vars)

# create a dataframe to hold the one hot encoded variables
encoder_vars_df = pd.DataFrame(encoder_vars_array, columns = encoder_feature_names)

# concatenate the new dataframe back to the original input variables dataframe
df_cleaned_new = pd.concat([df_cleaned.reset_index(drop=True), encoder_vars_df.reset_index(drop=True)], axis = 1)

# drop the original input 2 and input 3 as it is not needed anymore
df_cleaned_new.drop(categorical_vars, axis = 1, inplace = True)



In [7]:
df_cleaned_new.columns

Index(['Access to electricity (% of population)',
       'Access to clean fuels for cooking',
       'Renewable energy share in the total final energy consumption (%)',
       'Electricity from fossil fuels (TWh)', 'Electricity from nuclear (TWh)',
       'Electricity from renewables (TWh)',
       'Low-carbon electricity (% electricity)',
       'Primary energy consumption per capita (kWh/person)',
       'Energy intensity level of primary energy (MJ/$2017 PPP GDP)',
       'Value_co2_emissions_kt_by_country',
       ...
       'Year_2010', 'Year_2011', 'Year_2012', 'Year_2013', 'Year_2014',
       'Year_2015', 'Year_2016', 'Year_2017', 'Year_2018', 'Year_2019'],
      dtype='object', length=177)

In [8]:
df_cleaned_new.dtypes

Access to electricity (% of population)                             float64
Access to clean fuels for cooking                                   float64
Renewable energy share in the total final energy consumption (%)    float64
Electricity from fossil fuels (TWh)                                 float64
Electricity from nuclear (TWh)                                      float64
                                                                     ...   
Year_2015                                                           float64
Year_2016                                                           float64
Year_2017                                                           float64
Year_2018                                                           float64
Year_2019                                                           float64
Length: 177, dtype: object

In [9]:
from sklearn.model_selection import train_test_split

#definition de la cible
target = df_cleaned_new['Value_co2_emissions_kt_by_country']

# Supposons que 'Value_co2_emissions_kt_by_country' est la variable cible
df_cleaned_new.drop(columns=['Value_co2_emissions_kt_by_country'], inplace=True)


# Division du jeu de données
x_train, x_test, y_train, y_test = train_test_split(df_cleaned_new, target, test_size=0.2, random_state=2024, shuffle=True)

# X_train et y_train sont les échantillons d'apprentissage
# X_test et y_test sont les échantillons de test


In [10]:
y_train.shape

(2214,)

#### Standardization

In [11]:
from sklearn.preprocessing import StandardScaler  
# standardisation
scaler = StandardScaler()
scaler.fit(x_train)


df_train_scaled = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)

df_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

df_train_scaled.head()


Unnamed: 0,Access to electricity (% of population),Access to clean fuels for cooking,Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),Low-carbon electricity (% electricity),Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),gdp_growth,...,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019
0,0.710138,0.80836,0.345488,-0.053095,-0.018221,3.159472,1.382715,-0.254322,-0.444138,0.281251,...,-0.231914,-0.23083,-0.228651,-0.224242,-0.232993,-0.234068,-0.231914,-0.228651,-0.228651,-0.236208
1,0.756417,0.984464,-0.763567,0.750083,1.111264,1.192819,0.031436,0.623102,-0.645034,-0.330781,...,-0.231914,-0.23083,-0.228651,-0.224242,4.291975,-0.234068,-0.231914,-0.228651,-0.228651,-0.236208
2,0.487528,-0.998627,0.407625,-0.200375,-0.196016,-0.239654,0.404731,-0.533735,-0.372586,0.0823,...,-0.231914,-0.23083,-0.228651,-0.224242,-0.232993,-0.234068,-0.231914,-0.228651,-0.228651,-0.236208
3,0.725007,0.591419,-0.925794,0.417635,-0.045543,0.221968,-0.574065,-0.164452,-0.59825,-0.100873,...,-0.231914,-0.23083,-0.228651,-0.224242,-0.232993,4.272256,-0.231914,-0.228651,-0.228651,-0.236208
4,0.730834,0.858129,-0.599661,-0.196259,-0.196016,-0.236921,-0.652618,-0.312828,-0.697321,0.457616,...,-0.231914,-0.23083,-0.228651,-0.224242,-0.232993,-0.234068,-0.231914,-0.228651,-0.228651,-0.236208


3) Comparez les performances d’un modèle de régression linéaire avec/sans sélection de variables avec/sans
pénalisation, d’un SVM, d’un arbre optimal, d’une forêt aléatoire, du boosting, et de réseaux de neurones.
Justifiez vos choix (par exemple le noyau pour le SVM), et ajustez soigneusement les paramètres (par
validation croisée). Interprétez les résultats et quantifiez l’amélioration éventuelle apportée par les modèles
non linéaires.

In [12]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error


# Linear Regression without variable selection or penalization
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)

# Linear Regression with RFE (Recursive Feature Elimination) for variable selection
selector = RFE(lr, n_features_to_select=5, step=1)
selector = selector.fit(x_train, y_train)
y_pred_rfe = selector.predict(x_test)
mse_rfe = mean_squared_error(y_test, y_pred_rfe)

# Ridge Regression (L2 penalization)
ridge = Ridge(alpha=1.0)
ridge.fit(x_train, y_train)
y_pred_ridge = ridge.predict(x_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

# Lasso Regression (L1 penalization)
lasso = Lasso()
lasso.fit(x_train, y_train)
y_pred_lasso = lasso.predict(x_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

# Comparing Mean Squared Error (MSE) of each model
mse_scores = {
    'Linear Regression': mse_lr,
    'Linear Regression with RFE': mse_rfe,
    'Ridge Regression': mse_ridge,
    'Lasso Regression': mse_lasso
}

mse_scores


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  model = cd_fast.enet_coordinate_descent(


{'Linear Regression': 1944014314.740266,
 'Linear Regression with RFE': 39415658254.95263,
 'Ridge Regression': 2515066863.5183563,
 'Lasso Regression': 1945349036.8780408}

In [13]:
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

# Grille de valeurs du paramètre alpha à optimiser pour la régression Lasso
param_grid = {"alpha": [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1]}

# Initialisation de GridSearchCV avec le modèle Lasso
lasso_search = GridSearchCV(linear_model.Lasso(), param_grid, cv=5, n_jobs=-1)

# Entraînement avec GridSearchCV pour trouver les meilleurs paramètres
lasso_opt = lasso_search.fit(x_train, y_train)  # Assurez-vous que X_train, y_train sont définis

# Affichage des meilleurs paramètres et du meilleur score R2
best_alpha = lasso_opt.best_params_["alpha"]
best_score = lasso_opt.best_score_

print(f"Meilleur R2 = {best_score}, Meilleur paramètre = {best_alpha}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Meilleur R2 = 0.9968214198792257, Meilleur paramètre = 0.05


  model = cd_fast.enet_coordinate_descent(


In [14]:
from sklearn.metrics import r2_score

prev=lasso_opt.predict(x_test)
print("MSE=",mean_squared_error(prev,y_test))
print("R2=",r2_score(y_test,prev))


MSE= 1944068501.332707
R2= 0.997268904310444


In [15]:
import plotly.express as px

fig=px.scatter(x=prev,y=y_test,labels={'x': 'CO2 prédit' , 'y': 'CO2 observée'})
fig.show()

#plt.plot(prev,Yr_test,"o")
#plt.xlabel(u"O3 Prédite")
#plt.ylabel("O3 observee")
#plt.show()