# ML model Training (Question 3 of the Homework)

In [14]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Read datasets: the cleaned PE data, its normalized version, and the
# normalized version without outliers (loaded in that order).

df_pe, normalized_df_pe, normalized_df_pe_without_outliers = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/pe/cleaned/pe_dataset.csv",
        "../data/pe/cleaned/normalized_pe_dataset.csv",
        "../data/pe/cleaned/normalized_pe_dataset_without_outliers.csv",
    )
)

In [18]:
def splitDataset(df, test_size=0.15, should_have_validation_set=True, random_state=None):
    """Split a dataset into train/test (and optionally validation) features and target.

    The target is the 'Primary energy consumption per capita (kWh/person)' column;
    the features are all remaining columns except 'Entity' and 'Year'. Rows are
    shuffled before each split and the shapes of the resulting subsets are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the 'Entity', 'Year' and target columns.
    test_size : float, default=0.15
        Forwarded to ``train_test_split`` for the test split. When a validation
        set is requested it must be a fraction strictly between 0 and 0.5,
        because the validation set is sized to the same share of ``df`` as the
        test set.
    should_have_validation_set : bool, default=True
        If True, the non-test rows are split a second time into train and
        validation subsets.
    random_state : int, RandomState instance or None, default=None
        Forwarded to both ``train_test_split`` calls. ``None`` (the default)
        keeps the previous behaviour: a different shuffle on every call.
        Pass an int to obtain reproducible splits.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val)`` when
        ``should_have_validation_set`` is True, otherwise
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If a validation set is requested and ``test_size`` is not strictly
        between 0 and 0.5.
    """
    target_column = 'Primary energy consumption per capita (kWh/person)'

    # Fail early with a clear message: the rescaled validation fraction below
    # is only a valid proportion when 0 < test_size < 0.5.
    if should_have_validation_set and not 0 < test_size < 0.5:
        raise ValueError(
            "test_size must be a fraction strictly between 0 and 0.5 when a "
            f"validation set is requested (got {test_size!r})"
        )

    from sklearn.model_selection import train_test_split

    X = df.drop(columns=['Entity', 'Year', target_column])
    y = df[target_column]
    X_pretrain, X_test, y_pretrain, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=random_state
    )

    if should_have_validation_set:
        # The second split only sees the (1 - test_size) remainder, so the
        # fraction is rescaled to make the validation set the same share of
        # the full dataset as the test set.
        validation_fraction = test_size / (1 - test_size)
        X_train, X_val, y_train, y_val = train_test_split(
            X_pretrain,
            y_pretrain,
            test_size=validation_fraction,
            shuffle=True,
            random_state=random_state,
        )
        print(X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape)
        return X_train, X_test, X_val, y_train, y_test, y_val

    print(X_pretrain.shape, X_test.shape, y_pretrain.shape, y_test.shape)
    return X_pretrain, X_test, y_pretrain, y_test

## Linear Regression

As seen in the [ML algorithm selection notebook](ML_algorithm_selection.ipynb), the linear model does not have any hyperparameters to tune. Therefore, neither cross-validation nor a separate validation set is required. We can split each of our datasets into training and testing sets like so:

- Training set: 80%
- Testing set: 20%

To do that, I'll use the `train_test_split` function from the `sklearn.model_selection` module. This function has a `shuffle` parameter that controls whether the dataset is shuffled before being split; I'll set it to `True` to ensure that the data is shuffled first. According to the [scikit-learn documentation](https://scikit-learn.org/stable/glossary.html#term-random_state), when `random_state` is left at its default of `None`, the function uses the global random state from the `np.random` module to shuffle the data.

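NOTE: I don't see the need for reproducible results here, so I'll leave the `random_state` parameter set to `None`. As a consequence, the splits (and therefore the scores and coefficients reported below) will change slightly every time the notebook is re-run.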

In [21]:
# Split each dataset into train and test sets only (20% held out for testing,
# no validation set).
lr_split_options = {"test_size": 0.2, "should_have_validation_set": False}

X_pe_train, X_pe_test, y_pe_train, y_pe_test = splitDataset(df_pe, **lr_split_options)

# nor -> normalized
X_nor_pe_train, X_nor_pe_test, y_nor_pe_train, y_nor_pe_test = splitDataset(
    normalized_df_pe, **lr_split_options
)

# nor_wo -> normalized without outliers
X_nor_pe_wo_train, X_nor_pe_wo_test, y_nor_pe_wo_train, y_nor_pe_wo_test = splitDataset(
    normalized_df_pe_without_outliers, **lr_split_options
)


(2554, 4) (639, 4) (2554,) (639,)
(2554, 4) (639, 4) (2554,) (639,)
(2407, 4) (602, 4) (2407,) (602,)


Once we have our subsets, we can train our model using the training set for each dataset.

In [37]:
# Model training: one independent, unfitted estimator per dataset variant
# (global, normalized, normalized without outliers).

from sklearn.linear_model import LinearRegression

LR_model, LR_model_nor, LR_model_nor_wo = (LinearRegression() for _ in range(3))

In [38]:
# Train on global dataset

LR_model.fit(X_pe_train, y_pe_train)

# NOTE: this R^2 is computed on the training data itself, so it measures
# goodness of fit rather than performance on unseen data.
# Built-in round() is used instead of the `.round()` method because, depending
# on the scikit-learn version, `score` may return a plain Python float, which
# has no `.round()` method.
print(f'R^2 score: {round(LR_model.score(X_pe_train, y_pe_train), 3)}')

# One row per feature coefficient, plus a final row for the intercept.
LR_coef = pd.DataFrame(LR_model.coef_, index=X_pe_train.columns, columns=['Coefficient'])
LR_coef.loc['Intercept'] = LR_model.intercept_
LR_coef.round(3)

R^2 score: 0.55


Unnamed: 0,Coefficient
Access to electricity (% of population),-86.645
Access to clean fuels for cooking,183.81
Renewable energy share in the total final energy consumption (%),-164.559
gdp_per_capita,1.159
Intercept,12381.889


In [39]:
# Train on normalized dataset

LR_model_nor.fit(X_nor_pe_train, y_nor_pe_train)

# NOTE: this R^2 is computed on the training data itself, so it measures
# goodness of fit rather than performance on unseen data.
# Built-in round() is used instead of the `.round()` method because, depending
# on the scikit-learn version, `score` may return a plain Python float, which
# has no `.round()` method.
print(f'R^2 score: {round(LR_model_nor.score(X_nor_pe_train, y_nor_pe_train), 3)}')

# One row per feature coefficient, plus a final row for the intercept.
LR_coef_nor = pd.DataFrame(LR_model_nor.coef_, index=X_nor_pe_train.columns, columns=['Coefficient'])
LR_coef_nor.loc['Intercept'] = LR_model_nor.intercept_
LR_coef_nor.round(3)

R^2 score: 0.545


Unnamed: 0,Coefficient
Access to electricity (% of population),-0.03
Access to clean fuels for cooking,0.074
Renewable energy share in the total final energy consumption (%),-0.049
gdp_per_capita,0.569
Intercept,0.038


In [40]:
# Train on normalized dataset without outliers

LR_model_nor_wo.fit(X_nor_pe_wo_train, y_nor_pe_wo_train)

# NOTE: this R^2 is computed on the training data itself, so it measures
# goodness of fit rather than performance on unseen data.
# Built-in round() is used instead of the `.round()` method because, depending
# on the scikit-learn version, `score` may return a plain Python float, which
# has no `.round()` method.
print(f'R^2 score: {round(LR_model_nor_wo.score(X_nor_pe_wo_train, y_nor_pe_wo_train), 3)}')

# One row per feature coefficient, plus a final row for the intercept.
LR_coef_nor_wo = pd.DataFrame(LR_model_nor_wo.coef_, index=X_nor_pe_wo_train.columns, columns=['Coefficient'])
LR_coef_nor_wo.loc['Intercept'] = LR_model_nor_wo.intercept_
LR_coef_nor_wo.round(3)

R^2 score: 0.707


Unnamed: 0,Coefficient
Access to electricity (% of population),-0.031
Access to clean fuels for cooking,0.175
Renewable energy share in the total final energy consumption (%),-0.077
gdp_per_capita,0.816
Intercept,0.056


Next step: evaluate our models with appropriate metrics, following the steps described in [this freeCodeCamp guide](https://www.freecodecamp.org/news/how-to-build-and-train-linear-and-logistic-regression-ml-models-in-python/).