In [9]:
import pandas as pd
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
# Read energy_cleaned.csv into the `energy` DataFrame that the cells below build on.
energy = pd.read_csv(filepath_or_buffer='energy_cleaned.csv')

In [10]:
# Preview the loaded frame; the rendered output below shows the first and last rows.
energy

Unnamed: 0,country,year,electricity_access_%,clean_fuels_access_%,renewable_energy_share_final,electricity_fossil_fuels_output,electricity_nuclear_output,electricity_renewables_output,electricity_low_carbon,energy_consumption_per_capita,...,renewable_energy_share_primary,gdp_growth,gdp_per_capita,density,land_area,lat,lon,land_area_category,density_category,quadrant
0,Algeria,2000,98.97310,97.10,0.43,23.84,0.0,0.05,0.209293,9746.524,...,0.053235,3.800000,1765.027146,18,2381741.0,28.033886,1.659626,large,sparse,NE
1,Algeria,2001,98.96687,97.30,0.43,24.96,0.0,0.07,0.279664,9961.640,...,0.065218,3.000000,1740.606654,18,2381741.0,28.033886,1.659626,large,sparse,NE
2,Algeria,2002,98.95306,97.80,0.51,25.94,0.0,0.06,0.230769,10180.350,...,0.051677,5.600000,1781.828908,18,2381741.0,28.033886,1.659626,large,sparse,NE
3,Algeria,2003,98.93401,98.00,0.47,27.54,0.0,0.26,0.935252,10510.461,...,0.228104,7.200000,2103.381291,18,2381741.0,28.033886,1.659626,large,sparse,NE
4,Algeria,2004,98.91208,98.20,0.44,29.14,0.0,0.25,0.850630,10759.022,...,0.206787,4.300000,2610.185422,18,2381741.0,28.033886,1.659626,large,sparse,NE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1486,Uzbekistan,2016,100.00000,85.20,1.61,48.75,0.0,7.25,12.946428,16374.342,...,3.567936,5.932151,2704.677188,79,447400.0,41.377491,64.585262,medium,populated,NE
1487,Uzbekistan,2017,100.00000,84.90,1.75,49.71,0.0,8.35,14.381675,16642.676,...,3.972285,4.395275,1916.764642,79,447400.0,41.377491,64.585262,medium,populated,NE
1488,Uzbekistan,2018,100.00000,84.30,1.49,53.58,0.0,5.85,9.843514,16445.740,...,2.927033,5.354997,1597.068337,79,447400.0,41.377491,64.585262,medium,populated,NE
1489,Uzbekistan,2019,100.00000,84.60,1.57,53.64,0.0,6.47,10.763600,16212.221,...,3.197033,5.709632,1784.009816,79,447400.0,41.377491,64.585262,medium,populated,NE


From our EDA:

Top 5 distinct features by loading: <br>
1. co2_emissions_per_capita (0.48)
2. electricity_fossil_fuels_output (0.46)
3. electricity_renewables_output (0.46)
4. land_area (0.40)
5. electricity_nuclear_output (0.36)  

Strongest correlators with gdp_per_capita: <br>

* clean_fuels_access_% (0.81)
* energy_consumption_per_capita (0.81)
* electricity_access_% (0.71)




Engineer New Features: <br>
electricity_fossil_fuels_consumption_ratio <br>
electricity_renewables_consumption_ratio <br>
co2_emissions_consumption_ratio <br>
land_area_with_electricity_access <br>
land_area_with_clean_fuels_access <br>




In [11]:
# Ratio features: each source column divided by per-capita energy consumption.
consumption_per_capita = energy['energy_consumption_per_capita']
ratio_sources = {
    'electricity_fossil_fuels_consumption_ratio': 'electricity_fossil_fuels_output',
    'electricity_renewables_consumption_ratio': 'electricity_renewables_output',
    'co2_emissions_consumption_ratio': 'co2_emissions_per_capita',
}
for ratio_column, source_column in ratio_sources.items():
    energy[ratio_column] = energy[source_column] / consumption_per_capita

# Interaction features: land area multiplied by each access column.
# NOTE(review): the access columns appear to hold percentages on a 0-100 scale
# (see the preview above), so these products are not in land-area units — confirm
# that is acceptable for downstream use.
land_area = energy['land_area']
access_sources = {
    'land_area_with_electricity_access': 'electricity_access_%',
    'land_area_with_clean_fuels_access': 'clean_fuels_access_%',
}
for interaction_column, access_column in access_sources.items():
    energy[interaction_column] = land_area * energy[access_column]

Ordinal Encoding

In [12]:
# Integer codes for the three categorical columns.
# NOTE(review): 'quadrant' has no visible natural order; the 0-3 codes are kept so the
# output schema is unchanged, but confirm an ordinal code is really intended here.
land_area_mapping = {'small': 0, 'medium': 1, 'large': 2}
density_mapping = {'sparse': 0, 'populated': 1, 'packed': 2}
quadrant_mapping = {'NW': 0, 'NE': 1, 'SW': 2, 'SE': 3}

category_mappings = {
    'land_area_category': land_area_mapping,
    'density_category': density_mapping,
    'quadrant': quadrant_mapping,
}

for column, mapping in category_mappings.items():
    # Map every code to itself as well, so re-running this cell on already-encoded
    # values is a no-op instead of turning the whole column into NaN.
    idempotent_mapping = {**mapping, **{code: code for code in mapping.values()}}
    encoded = energy[column].map(idempotent_mapping)

    # Series.map yields NaN for values absent from the mapping. Fail loudly on
    # unexpected labels rather than letting them become missing values silently;
    # values that were already missing are left as they are.
    unmapped = energy.loc[encoded.isna() & energy[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped values in {column!r}: {list(unmapped)}")

    energy[column] = encoded

Standardization

In [13]:
# Columns passed through unscaled: the encoded categoricals, the country/year
# identifiers, and gdp_per_capita (presumably the modelling target — confirm).
exclude_columns = [
    'land_area_category', 'density_category', 'quadrant',
    'country', 'year', 'gdp_per_capita',
]

# Every remaining column, in its original order, gets standardized.
columns_to_scale = energy.columns[~energy.columns.isin(exclude_columns)].tolist()

# NOTE(review): the scaler is fit on all rows of `energy`; if a train/test split
# happens later, the test rows have influenced these statistics — confirm intended.
scaler = StandardScaler()
unscaled_features = energy[columns_to_scale]
scaled_data = scaler.fit_transform(unscaled_features)
scaled_df = pd.DataFrame(data=scaled_data, columns=columns_to_scale)

# Re-attach the unscaled columns; the index is reset so rows align positionally
# with the freshly built scaled frame.
passthrough_df = energy[exclude_columns].reset_index(drop=True)
final_df = pd.concat([scaled_df, passthrough_df], axis=1)

In [14]:
# Preview the combined frame: scaled columns first, then the unscaled pass-through columns.
final_df

Unnamed: 0,electricity_access_%,clean_fuels_access_%,renewable_energy_share_final,electricity_fossil_fuels_output,electricity_nuclear_output,electricity_renewables_output,electricity_low_carbon,energy_consumption_per_capita,energy_usage_per_gdp,co2_emissions_per_capita,...,electricity_renewables_consumption_ratio,co2_emissions_consumption_ratio,land_area_with_electricity_access,land_area_with_clean_fuels_access,land_area_category,density_category,quadrant,country,year,gdp_per_capita
0,0.211158,0.339137,-1.023181,-0.271219,-0.287447,-0.345974,-1.127643,-0.837257,-0.288450,-0.246136,...,-0.343428,-0.163452,0.544158,0.627004,2,0,1,Algeria,2000,1765.027146
1,0.210378,0.348912,-1.023181,-0.269097,-0.287447,-0.345848,-1.125334,-0.832142,-0.317675,-0.247356,...,-0.343162,-0.168877,0.544094,0.629245,2,0,1,Algeria,2001,1740.606654
2,0.208649,0.373351,-1.018321,-0.267241,-0.287447,-0.345911,-1.126938,-0.826941,-0.304391,-0.244089,...,-0.343321,-0.165485,0.543951,0.634847,2,0,1,Algeria,2002,1781.828908
3,0.206264,0.383126,-1.020751,-0.264211,-0.287447,-0.344643,-1.103820,-0.819091,-0.315018,-0.239045,...,-0.340680,-0.160422,0.543755,0.637088,2,0,1,Algeria,2003,2103.381291
4,0.203519,0.392901,-1.022573,-0.261181,-0.287447,-0.344706,-1.106597,-0.813181,-0.346900,-0.237912,...,-0.340890,-0.161668,0.543529,0.639329,2,0,1,Algeria,2004,2610.185422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1486,0.339709,-0.242492,-0.951495,-0.224039,-0.287447,-0.300327,-0.709669,-0.679652,0.678634,-0.224199,...,-0.282090,-0.193946,-0.282260,-0.281607,1,1,1,Uzbekistan,2016,2704.677188
1487,0.339709,-0.257155,-0.942990,-0.222221,-0.287447,-0.293353,-0.662571,-0.673272,0.694575,-0.220452,...,-0.273827,-0.191305,-0.282260,-0.282239,1,1,1,Uzbekistan,2017,1916.764642
1488,0.339709,-0.286481,-0.958785,-0.214891,-0.287447,-0.309203,-0.811492,-0.677955,1.005424,-0.217891,...,-0.294291,-0.186908,-0.282260,-0.283502,1,1,1,Uzbekistan,2018,1597.068337
1489,0.339709,-0.271818,-0.953925,-0.214778,-0.287447,-0.305272,-0.781299,-0.683508,0.824760,-0.214197,...,-0.288213,-0.180763,-0.282260,-0.282870,1,1,1,Uzbekistan,2019,1784.009816


In [15]:
# Write the processed frame to energy_processed.csv without the row index.
final_df.to_csv(path_or_buf='energy_processed.csv', index=False)