In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from script_automation.automation_script_process_data import run_script_processing_data
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error

In [18]:
# Load the training data from the parquet store (relative path)
df = pd.read_parquet('../../data_parquet/train_data.parquet')

In [None]:
# NOTE(review): run_script_processing_data is a project-local helper; its cleaning /
# feature-engineering steps are not visible here — confirm it does not modify `df` in place.
df_processed = run_script_processing_data(df) # Process the data

In [20]:
# Summarize the processed frame: column names, non-null counts and dtypes
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144386 entries, 0 to 160563
Data columns (total 25 columns):
 #   Column                                       Non-Null Count   Dtype         
---  ------                                       --------------   -----         
 0   time_hourly                                  144386 non-null  datetime64[ns]
 1   generation_biomass                           144386 non-null  int64         
 2   generation_fossil_brown_coal_lignite         144386 non-null  int64         
 3   generation_fossil_gas                        144386 non-null  int64         
 4   generation_fossil_hard_coal                  144386 non-null  float64       
 5   generation_fossil_oil                        144386 non-null  int64         
 6   generation_hydro_pumped_storage_consumption  144386 non-null  int64         
 7   generation_hydro_run_of_river_and_poundage   144386 non-null  int64         
 8   generation_hydro_water_reservoir             144386 non-null  float64

In [21]:
# Split into the dependent variable (target) and the independent variables (features)
y = df_processed['total_load_actual'].values
# Timestamps are kept aside; they are not used as a model feature
time_hourly = df_processed['time_hourly'].values
# Non in-place drop: df_processed keeps all of its columns, so this cell can be
# re-run without raising a KeyError and earlier displays of df_processed stay valid.
X = df_processed.drop(columns=['total_load_actual', 'time_hourly'])

In [22]:
# Scale the numeric variables
def standarize(X, scaler=None, fit=True):
    """Standardize the numeric columns of ``X`` and return the scaled frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Only the columns selected by
        ``select_dtypes(include='number')`` are scaled; all other columns are
        passed through unchanged.
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform``. When omitted, a
        fresh ``StandardScaler`` is created, which matches the original
        single-argument behaviour.
    fit : bool, default True
        If True the scaler is fitted on ``X`` (``fit_transform``). If False
        the already-fitted ``scaler`` is only applied (``transform``), so
        validation/test data can reuse the statistics learned on the
        training data.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` whose numeric columns hold the scaled values. The
        input frame itself is not modified.

    Raises
    ------
    ValueError
        If ``fit=False`` is requested without supplying a ``scaler``.
    """
    numeric_columns = X.select_dtypes(include='number')

    # Work on a copy so the caller's frame is never mutated behind its back
    scaled = X.copy()

    # Nothing to scale: return early instead of fitting a scaler on zero columns
    if numeric_columns.shape[1] == 0:
        return scaled

    if scaler is None:
        if not fit:
            raise ValueError('fit=False requires an already fitted scaler')
        scaler = StandardScaler()

    if fit:
        values = scaler.fit_transform(numeric_columns)
    else:
        values = scaler.transform(numeric_columns)

    scaled[numeric_columns.columns] = values

    return scaled

In [23]:
# Standardize the numeric features.
# NOTE(review): this runs before the train/validation split, so the scaler is
# fitted on rows that later end up in the validation set — confirm this is intended.
X = standarize(X)

In [24]:
# Encode the categorical variables
def encoder(X, one_hot_encoder=None, fit=True):
    """One-hot encode the object-dtype columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns selected by ``select_dtypes(include='object')``
        are replaced by their one-hot indicator columns; the remaining columns
        are kept as they are.
    one_hot_encoder : object, optional
        Encoder exposing ``fit_transform``, ``transform`` and
        ``get_feature_names_out`` and returning a dense 2-D array. When
        omitted, ``OneHotEncoder(handle_unknown='ignore', sparse_output=False)``
        is created, which matches the original single-argument behaviour.
    fit : bool, default True
        If True the encoder is fitted on ``X``. If False the already-fitted
        ``one_hot_encoder`` is only applied, so validation/test data get
        exactly the indicator columns learned on the training data.

    Returns
    -------
    pandas.DataFrame
        New frame with the non-categorical columns followed by the indicator
        columns, indexed like ``X``. ``X`` itself is not modified.

    Raises
    ------
    ValueError
        If ``fit=False`` is requested without supplying ``one_hot_encoder``.
    """
    categorical_columns = X.select_dtypes(include='object')

    # Nothing to encode: return a copy instead of fitting on zero columns
    if categorical_columns.shape[1] == 0:
        return X.copy()

    if one_hot_encoder is None:
        if not fit:
            raise ValueError('fit=False requires an already fitted one_hot_encoder')
        one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    if fit:
        encoded_data = one_hot_encoder.fit_transform(categorical_columns)
    else:
        encoded_data = one_hot_encoder.transform(categorical_columns)

    # Reuse X's index so the concat below aligns row by row
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=one_hot_encoder.get_feature_names_out(categorical_columns.columns),
        index=X.index,
    )

    return pd.concat([X.drop(columns=categorical_columns.columns), encoded_df], axis=1)

In [25]:
# One-hot encode the categorical (object-dtype) columns of the feature matrix
X = encoder(X)

In [26]:
# Split into training and validation sets (20% held out for validation, fixed seed)
# NOTE(review): the split is random; if the rows are time-ordered hourly records,
# a chronological split may give a more honest validation estimate — confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

### Modelado

**Regresión Lineal simple**

In [27]:
# Basic linear regression, fitted on the training split as a baseline
linear_regression_model = LinearRegression(n_jobs=-1)
linear_regression_model.fit(X_train, y_train)

In [28]:
# Demand (load) prediction on the validation split
# NOTE(review): within the visible notebook this result is never evaluated and
# `pred_load` is overwritten by the random-forest prediction further down.
pred_load = linear_regression_model.predict(X_val)

In [29]:
# Mean score over 10-fold cross-validation on the training split
# (uses the estimator's default scorer — presumably R^2 for this regressor; confirm)
print('La evaluación de la demanda en la regresion lineal es de:',cross_val_score(linear_regression_model, X_train, y_train, cv=10).mean())

La evaluación de la demanda en la regresion lineal es de: 0.8211651915427736


**Random Forest**

In [30]:
# Random forest model: 100 trees, squared-error criterion, fixed seed, all cores
rnd_model_regressor = RandomForestRegressor(n_estimators=100, criterion='squared_error',random_state=123, n_jobs=-1)
rnd_model_regressor.fit(X_train, y_train)

In [31]:
# Demand (load) prediction of the random forest on the validation split
pred_load = rnd_model_regressor.predict(X_val)

In [45]:
# Root mean squared error of the random-forest predictions on the validation split
print('raiz cuadratica medio en validación:',round(root_mean_squared_error(y_val, pred_load),2))

raiz cuadratica medio en validación: 345.71


In [32]:
# Mean score over 10-fold cross-validation on the training split
# (each fold refits a clone of the forest, so this cell is comparatively slow)
print('La evaluación de la demanda en el RandomForest es de:',cross_val_score(rnd_model_regressor, X_train, y_train, cv=10).mean())

La evaluación de la demanda en el RandomForest es de: 0.9914307035341337


In [35]:
# Most influential features according to the fitted random forest, highest first
importances = zip(rnd_model_regressor.feature_importances_, X.columns)

# Materialize the (score, column) pairs and order them by score, descending
sorted_importances = list(importances)
sorted_importances.sort(key=lambda pair: pair[0], reverse=True)

for score, name in sorted_importances:
    print(f'{name}: {round(score, 2)}')

generation_hydro_pumped_storage_consumption: 0.38
generation_wind_onshore: 0.1
generation_hydro_water_reservoir: 0.1
generation_fossil_oil: 0.1
generation_solar: 0.09
generation_fossil_gas: 0.08
generation_fossil_hard_coal: 0.04
generation_hydro_run_of_river_and_poundage: 0.03
generation_waste: 0.02
price_actual: 0.02
weather_icon_01n: 0.01
temp_pca: 0.01
generation_other: 0.0
generation_fossil_brown_coal_lignite: 0.0
generation_biomass: 0.0
humidity: 0.0
generation_other_renewable: 0.0
weather_icon_02n: 0.0
weather_icon_01d: 0.0
wind_deg: 0.0
weather_icon_04n: 0.0
weather_icon_10: 0.0
wind_speed: 0.0
weather_icon_10n: 0.0
weather_icon_03n: 0.0
weather_icon_04d: 0.0
weather_icon_02d: 0.0
weather_icon_01: 0.0
weather_main_clouds: 0.0
weather_icon_03d: 0.0
weather_main_clear: 0.0
clouds_all: 0.0
weather_main_rain: 0.0
weather_icon_10d: 0.0
weather_icon_02: 0.0
weather_icon_03: 0.0
city_name_Bilbao: 0.0
city_name_Valencia: 0.0
city_name_Madrid: 0.0
city_name_Seville: 0.0
city_name_ Barcel

**Conjunto de prueba**

In [36]:
# Load the held-out test data from the parquet store (relative path)
df_prueba = pd.read_parquet('../../data_parquet/test_data.parquet')

In [None]:
# Same processing call as for the training data
# NOTE(review): if run_script_processing_data learns anything from its input
# (imputation values, PCA, ...), it is re-learned on the test data here — confirm.
df_processed_prueba = run_script_processing_data(df_prueba) # Process the data

In [38]:
# Split the test data into the dependent variable (target) and the independent variables (features)
y_test = df_processed_prueba['total_load_actual'].values
# Timestamps are kept aside; they are not used as a model feature
time_hourly = df_processed_prueba['time_hourly'].values
# Non in-place drop: df_processed_prueba keeps all of its columns, so this cell
# can be re-run without raising a KeyError.
X_test = df_processed_prueba.drop(columns=['total_load_actual', 'time_hourly'])

In [39]:
# NOTE(review): standarize() fits a new StandardScaler on whatever frame it receives,
# so the test set is scaled with its own mean/std rather than the statistics the model
# was trained with. This likely contributes to the large gap between validation and
# test error — confirm and reuse the training scaler.
X_test = standarize(X_test) # standardization

In [40]:
# One-hot encode the test features, then align them to the training feature matrix.
# encoder() fits its own OneHotEncoder, so the test frame may miss categories seen in
# training, contain new ones, or order columns differently. Reindexing on X.columns
# adds missing indicator columns as 0, drops unseen ones, and restores the column
# order the model was fitted with.
X_test = encoder(X_test).reindex(columns=X.columns, fill_value=0.0)

In [41]:
# NOTE(review): despite the name, this holds predicted load (the model's target is
# total_load_actual), not price.
pred_price_test = rnd_model_regressor.predict(X_test) # Prediction

In [46]:
# Root mean squared error of the random-forest predictions on the test set
print('raiz cuadratica medio en test es de:',round(root_mean_squared_error(y_test, pred_price_test),2))

raiz cuadratica medio en test es de: 2827.5


In [42]:
# Metric on the test set: coefficient of determination (R^2), rounded to two decimals
r2 = r2_score(y_test, pred_price_test)
print('Score en conjunto de prueba:',round(r2,2))

Score en conjunto de prueba: 0.6
