In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
df = pd.read_parquet('../../data_parquet/train_data.parquet')
df_copy = df.copy()

In [3]:
df_test = pd.read_parquet('../../data_parquet/test_data.parquet')
df_test_copy = df_test.copy()

In [4]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168000 entries, 0 to 167999
Data columns (total 38 columns):
 #   Column                                       Non-Null Count   Dtype         
---  ------                                       --------------   -----         
 0   time_hourly                                  168000 non-null  datetime64[ns]
 1   generation_biomass                           168000 non-null  int64         
 2   generation_fossil_brown_coal_lignite         168000 non-null  int64         
 3   generation_fossil_coal_derived_gas           168000 non-null  int64         
 4   generation_fossil_gas                        168000 non-null  int64         
 5   generation_fossil_hard_coal                  168000 non-null  int64         
 6   generation_fossil_oil                        168000 non-null  int64         
 7   generation_fossil_oil_shale                  168000 non-null  int64         
 8   generation_fossil_peat                       168000 non-null  in

**Predicción base para la demanda energética**

In [5]:
# Guardamos variable externa 'time_hourly' y variable objetivos 'total_load_actual'
y = df_copy['total_load_actual'].values
time_hourly = df_copy['time_hourly'].values
df_copy.drop(['time_hourly','total_load_actual'], axis = 1, inplace=True)

In [4]:
# Normalizacion de variables numéricas
def normalize_numerics_values(df_copy):
    numeric_columns = df_copy.select_dtypes(include='number')
    scaler = StandardScaler()
    df_copy[numeric_columns.columns] = scaler.fit_transform(numeric_columns)
    return df_copy

In [5]:
# Codificacion variables categóricas
def encoder_category_values(df_copy):
    categorical_columns = df_copy.select_dtypes(include='object')
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_data = encoder.fit_transform(categorical_columns)

    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns.columns))
    encoded_df.index = df_copy.index
    df_copy = pd.concat([df_copy.drop(categorical_columns.columns, axis=1), encoded_df], axis=1)
    return df_copy

In [8]:
df_copy = normalize_numerics_values(df_copy)
df_copy = encoder_category_values(df_copy)

In [9]:
# Separamos en validacion y entrenamiento
X_train, X_val, y_train, y_val = train_test_split(df_copy, y, test_size=0.2, random_state=123)

In [10]:
# Modelado base para predicción de demanda energética
load_model_baseline = LinearRegression(n_jobs=-1)
load_model_baseline.fit(X_train, y_train)

In [11]:
# predicción demanda
pred_load = load_model_baseline.predict(X_val)

In [12]:
score_load = r2_score(y_val, pred_load)
print('La evaluación de la baseline es de:',round(score_load, 2))

La evaluación de la baseline es de: 0.88


In [13]:
# Prediccion con datos no vistos (conjunto de prueba/test)
y_test = df_test_copy['total_load_actual'].values
df_test_copy.drop(['time_hourly', 'total_load_actual'], axis = 1, inplace=True)
df_test_copy = normalize_numerics_values(df_test_copy)
df_test_copy = encoder_category_values(df_test_copy)

In [21]:
# Añadimos variables entrenadas faltantes en el conjunto de prueba (añade todo como 0)
df_test_copy[['city_name_Bilbao', 'city_name_Madrid', 'city_name_Valencia', 'weather_main_dust', 
              'weather_main_haze', 'weather_main_snow', 'weather_icon_13', 
              'weather_icon_13d', 'weather_icon_13n']]=0
column_order = df_copy.columns
df_test_copy = df_test_copy[column_order]

In [22]:
pred_load_test = load_model_baseline.predict(df_test_copy)# prediccion con datos de prueba

In [23]:
# Evaluacion de baseline con datos de prueba demanda energética
score_load_test = r2_score(y_test, pred_load_test)
print('La Baseline para el modelo de predicción de la demanda energética es de:',round(score_load_test, 2))

La Baseline para el modelo de predicción de la demanda energética es de: -2.197709314782246e+17


In [24]:
# características mas importantes para demanda energética a través de random forest
rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(X_train, y_train)

In [25]:
# Características mas representativas en el conjunto de datos
importances = zip(rfr.feature_importances_, X_train.columns)

sorted_importances = sorted(importances, key=lambda x: x[0], reverse=True)

for score, name in sorted_importances:
    print(f'{name}: {round(score, 2)}')

generation_hydro_pumped_storage_consumption: 0.39
generation_solar: 0.11
generation_fossil_gas: 0.1
generation_wind_onshore: 0.09
generation_fossil_oil: 0.07
generation_hydro_water_reservoir: 0.07
generation_hydro_run_of_river_and_poundage: 0.03
generation_fossil_hard_coal: 0.03
generation_nuclear: 0.03
price_actual: 0.02
generation_waste: 0.01
generation_fossil_brown_coal_lignite: 0.01
generation_other: 0.01
generation_biomass: 0.01
generation_other_renewable: 0.01
pressure: 0.0
temp: 0.0
humidity: 0.0
temp_min: 0.0
temp_max: 0.0
wind_deg: 0.0
weather_icon_03d: 0.0
wind_speed: 0.0
weather_icon_01n: 0.0
weather_id: 0.0
clouds_all: 0.0
weather_icon_03n: 0.0
weather_icon_01d: 0.0
weather_main_clear: 0.0
weather_icon_10d: 0.0
rain_1h: 0.0
weather_icon_10n: 0.0
weather_icon_02: 0.0
weather_icon_01: 0.0
weather_icon_02d: 0.0
weather_icon_02n: 0.0
weather_icon_03: 0.0
city_name_Madrid: 0.0
weather_icon_10: 0.0
weather_icon_04d: 0.0
weather_main_rain: 0.0
weather_main_clouds: 0.0
weather_icon

**Predicción base para el precio de la energía**

In [6]:
# Guardamos variable externa 'time_hourly' y variable objetivos 'price_actual'
y = df_copy['price_actual'].values
time_hourly = df_copy['time_hourly'].values
df_copy.drop(['time_hourly','price_actual'], axis = 1, inplace=True)

In [7]:
df_copy = normalize_numerics_values(df_copy)
df_copy = encoder_category_values(df_copy)

In [8]:
# Separamos en validacion y entrenamiento
X_train, X_val, y_train, y_val = train_test_split(df_copy, y, test_size=0.2, random_state=123)

In [9]:
# Modelado base para predicción del precio de la energía
price_model_baseline = LinearRegression(n_jobs=-1)
price_model_baseline.fit(X_train, y_train)

In [10]:
# predicción precio
pred_price = price_model_baseline.predict(X_val)

In [11]:
score_load = r2_score(y_val, pred_price)
print('La evaluación de la baseline es de:',round(score_load, 2))

La evaluación de la baseline es de: 0.39


In [12]:
# Prediccion con datos no vistos (conjunto de prueba/test)
y_test = df_test_copy['price_actual'].values
df_test_copy.drop(['time_hourly', 'price_actual'], axis = 1, inplace=True)
df_test_copy = normalize_numerics_values(df_test_copy)
df_test_copy = encoder_category_values(df_test_copy)

In [14]:
# Añadimos variables entrenadas faltantes en el conjunto de prueba (añade todo como 0)
df_test_copy[['city_name_ Barcelona','city_name_Bilbao', 'city_name_Madrid', 'city_name_Valencia', 'weather_main_dust', 
              'weather_main_haze', 'weather_main_snow', 'weather_icon_13', 
              'weather_icon_13d', 'weather_icon_13n']]=0
column_order = df_copy.columns
df_test_copy = df_test_copy[column_order]

In [15]:
pred_price_test = price_model_baseline.predict(df_test_copy)# prediccion con datos de prueba

In [16]:
# Evaluacion de baseline con datos de prueba precio energía
score_price_test = r2_score(y_test, pred_price_test)
print('La Baseline para el modelo de predicción del precio es de:',round(score_price_test, 2))

La Baseline para el modelo de predicción del precio es de: -1.406672795486969e+17


In [17]:
# Cracterística mas importantes en base al precio de la energía
rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(X_train, y_train)

In [18]:
# Características mas representativas en el conjunto de datos
importances = zip(rfr.feature_importances_, X_train.columns)

sorted_importances = sorted(importances, key=lambda x: x[0], reverse=True)

for score, name in sorted_importances:
    print(f'{name}: {round(score, 2)}')

generation_fossil_gas: 0.23
generation_fossil_hard_coal: 0.13
generation_nuclear: 0.07
generation_other_renewable: 0.06
generation_hydro_run_of_river_and_poundage: 0.06
total_load_actual: 0.05
generation_waste: 0.05
generation_other: 0.04
generation_fossil_oil: 0.04
generation_hydro_water_reservoir: 0.04
generation_biomass: 0.03
pressure: 0.03
generation_hydro_pumped_storage_consumption: 0.03
generation_wind_onshore: 0.03
generation_solar: 0.03
generation_fossil_brown_coal_lignite: 0.02
temp_min: 0.01
wind_speed: 0.01
temp: 0.01
temp_max: 0.01
humidity: 0.0
wind_deg: 0.0
clouds_all: 0.0
weather_id: 0.0
weather_icon_01: 0.0
weather_icon_01n: 0.0
city_name_Seville: 0.0
weather_icon_01d: 0.0
weather_icon_04: 0.0
city_name_Bilbao: 0.0
weather_main_clear: 0.0
weather_icon_03: 0.0
weather_icon_02: 0.0
weather_icon_02n: 0.0
weather_icon_10n: 0.0
weather_icon_10: 0.0
weather_icon_04n: 0.0
weather_icon_03n: 0.0
city_name_Madrid: 0.0
city_name_ Barcelona: 0.0
city_name_Valencia: 0.0
weather_icon

### Objetivos

- La baseline en ambos casos (demanda y precio) es negativa, necesitando por tanto algoritmos capaces de extraer patrones e informacion relevante ante nuestros conjuntos de series temporales.

Usaremos estas puntaciones para desarrollar los modelos mas avanzados, mejorandolos en función de las puntuaciones dadas.