# Simple models benchmark 

## Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import os

## Loading data

We are here taking the datasets preprocessed following the methods implemented in 'preprocessing.ipynb' notebook.


In [2]:
repository_path = r'C:\\Users\\hugot\\Documents\\Code\\EL_project' #change for your own path

In [3]:
# After downloading the X_train/X_test/Y_train .csv files in your working directory:
X_train_path = os.path.join(repository_path, 'datasets\\X_train_preprocessed.csv')
X_train = pd.read_csv(X_train_path)
X_train_ids = X_train["ID"] #We keep the IDs in a separate variable
X_train = X_train.drop("ID", axis = 1) #We remove the IDs from the training set

Y_train_path = os.path.join(repository_path, 'datasets\\y_train.csv')
Y_train = pd.read_csv(Y_train_path)
Y_train_ids = Y_train["ID"]
Y_train = Y_train.drop("ID", axis = 1)

X_test_path = os.path.join(repository_path, 'datasets\\X_test_preprocessed.csv')
X_test = pd.read_csv(X_test_path)
X_test_ids = X_test["ID"]
X_test = X_test.drop("ID", axis = 1)



In [4]:
X_train.head()

Unnamed: 0,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,FR_NET_IMPORT,DE_GAS,FR_GAS,...,COUNTRY_FR,energy_wind,energy_solar,weather_impact,net_energy_ratio,cross_feature,FR_temp_impact,DE_temp_impact,REN_NON,carbon_intensity
0,0.210099,-0.427458,-0.606523,0.606523,-0.306899,0.69286,0.306899,-0.69286,0.441238,-0.213766,...,True,-3.764056,-2.921715,-0.115632,-2.183893,-2.259287,-0.490862,-0.858971,13.05661,-2.132838
1,-0.022399,-1.003452,-0.022063,0.022063,-0.57352,-1.130838,0.57352,1.130838,0.174773,0.42694,...,True,-67.963185,-3.226242,0.621845,0.000556,-4.051804,0.827788,0.41502,-9.337436,0.596555
2,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,1.682587,2.351913,2.122241,...,True,0.309049,-0.18752,-0.244135,-1.324031,0.058913,2.093502,2.079919,-0.176691,0.379201
3,-0.983324,-0.849198,-0.839586,0.839586,-0.27087,0.56323,0.27087,-0.56323,0.487818,0.194659,...,False,0.507879,-0.662913,-0.670238,0.381427,-0.384711,-1.266712,-0.632386,-1.572342,-0.335867
4,0.143807,-0.617038,-0.92499,0.92499,-0.306899,0.990324,0.306899,-0.990324,0.238693,-0.240862,...,True,-7.566776,-1.17268,0.337934,-1.02964,0.466701,0.112457,0.758146,-0.308411,-3.743874


In [5]:
Y_train.head()

Unnamed: 0,TARGET
0,0.028313
1,-0.112516
2,-0.18084
3,-0.260356
4,-0.071733


## Some utils functions

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X_test, y_test):
    """
    Evaluate a regression model and return MSE, MAE, and R-squared.
    
    Parameters:
    - model: A trained regression model from Scikit-learn.
    - X_test: Test features.
    - y_test: True target values.

    Returns:
    - mse: Mean Squared Error.
    - mae: Mean Absolute Error.
    - r2: R-squared.
    """
    # Predict on test set
    y_pred = model.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mse, mae, r2

def plot_predictions(model, X_test, Y_test):
    """
    Plot the predictions of a regression model.
    
    Parameters:
    - y_test: True target values.
    - y_pred: Predicted target values.
    """
    # Predict on test set
    y_pred = model.predict(X_test)

    plt.figure(figsize=(10, 6))

    # Plot true values
    plt.scatter(Y_test, Y_test, color = "b", label = "True values")

    # Plot predicted values
    plt.scatter(Y_test, y_pred, color = "r", label = "Predicted values")

    plt.title('True vs Predicted Values')
    plt.xlabel('True values')
    plt.ylabel('Predicted values')
    plt.legend()
    plt.show()

## Defining the models

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

#I'm not able to run XGBoost so I commented it
#from xgboost import XGBRegressor

def get_model(model_wanted: str, X_train, Y_train, parameters = None) -> object:

    if model_wanted == "linear_regression": #provided in by ENS
        lr = LinearRegression()
        lr.fit(X_train, Y_train)
        return lr
    
    elif model_wanted == "random_forest": #2nd lecture
        rf = RandomForestRegressor(**parameters)
        rf.fit(X_train, Y_train)
        return rf
    
    elif model_wanted == "gradient_boosting": #parameters to tune
        gb = GradientBoostingRegressor(**parameters)
        gb.fit(X_train, Y_train)
        return gb

    
    else:
        raise ValueError("Invalid model name")


## Benchmarking

In [8]:
#Defining the model to test. Template : {"model_name" : {"parameter1" : value1, "parameter2" : value2, ...}}
#If the model has no parameters to tune, the value is None
models_to_test = {"linear_regression" : None, 
                  "random_forest" : {"n_estimators" : 100, "random_state" :777}, 
                  "gradient_boosting" : {"n_estimators" :100, "random_state" :777}, 
                  }


In [9]:
import time

results = []
for model_wanted, params in models_to_test.items():
    start_time = time.time()
    model = get_model(model_wanted, X_train, Y_train, params)
    training_time = time.time() - start_time
    
    mse, mae, r2 = evaluate_model(model, X_train, Y_train)
    results.append({"Model": model_wanted, "MSE": mse, "MAE": mae, "R-squared": r2, "Training Time (s)": training_time})

    #Then use the model for predictions
    Y_test = model.predict(X_test)
    #We create a dataframe with the predictions
    Y_test = pd.concat([X_test_ids, pd.DataFrame(Y_test)], axis = 1)
    #Save the results in a csv file
    path = os.path.join(repository_path, f"results\\y_test_simple_{model_wanted}.csv")
    Y_test.to_csv(path, index = False)


results_df = pd.DataFrame(results)


  rf.fit(X_train, Y_train)
  y = column_or_1d(y, warn=True)


In [10]:
results_df

Unnamed: 0,Model,MSE,MAE,R-squared,Training Time (s)
0,linear_regression,1.005881,0.57114,0.059612,0.02899
1,random_forest,0.172495,0.24422,0.838737,5.472537
2,gradient_boosting,0.563034,0.458647,0.473624,1.32908
