In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw generation and weather-sensor CSV exports for plant 1
plant_1_generation = pd.read_csv('Plant_1_Generation_Data.csv')
plant_1_weather = pd.read_csv('Plant_1_Weather_Sensor_Data.csv')

# Work on a copy so that plant_1_generation stays exactly as loaded
df_GD1 = plant_1_generation.copy()

# Swap each raw SOURCE_KEY for a readable label, numbered from 1 in order of first appearance
unique_source_keys_list = df_GD1['SOURCE_KEY'].unique()
source_key_mapping = {
    raw_key: f"Solar_Panel_{position}"
    for position, raw_key in enumerate(unique_source_keys_list, start=1)
}
df_GD1['SOURCE_KEY'] = df_GD1['SOURCE_KEY'].map(source_key_mapping)

# Parse the timestamp strings; the two files use different formats
df_GD1['DATE_TIME'] = pd.to_datetime(df_GD1['DATE_TIME'], format='%d-%m-%Y %H:%M')
plant_1_weather['DATE_TIME'] = pd.to_datetime(
    plant_1_weather['DATE_TIME'], format='%Y-%m-%d %H:%M:%S'
)

# Left-join the weather readings onto every generation row that shares its timestamp.
# Columns present in both frames (PLANT_ID, SOURCE_KEY) come back with _x / _y suffixes.
df_GD1_with_weather = df_GD1.merge(plant_1_weather, how='left', on='DATE_TIME')

# Preview the merged frame to check the join
df_GD1_with_weather.head()


Unnamed: 0,DATE_TIME,PLANT_ID_x,SOURCE_KEY_x,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,PLANT_ID_y,SOURCE_KEY_y,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15,4135001,Solar_Panel_1,0.0,0.0,0.0,6259559.0,4135001.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
1,2020-05-15,4135001,Solar_Panel_2,0.0,0.0,0.0,6183645.0,4135001.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
2,2020-05-15,4135001,Solar_Panel_3,0.0,0.0,0.0,6987759.0,4135001.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
3,2020-05-15,4135001,Solar_Panel_4,0.0,0.0,0.0,7602960.0,4135001.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0
4,2020-05-15,4135001,Solar_Panel_5,0.0,0.0,0.0,7158964.0,4135001.0,HmiyD2TTLFNqkNe,25.184316,22.857507,0.0


In [3]:
# Drop the weather-side identifiers (PLANT_ID_y, SOURCE_KEY_y) together with PLANT_ID_x,
# then strip the merge suffix from the one identifier that is kept (SOURCE_KEY_x)
df_GD1_with_weather_clean = (
    df_GD1_with_weather
    .drop(columns=['PLANT_ID_y', 'SOURCE_KEY_y', 'PLANT_ID_x'])
    .rename(columns={'SOURCE_KEY_x': 'SOURCE_KEY'})
)

df_GD1_with_weather_clean.head()


Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15,Solar_Panel_1,0.0,0.0,0.0,6259559.0,25.184316,22.857507,0.0
1,2020-05-15,Solar_Panel_2,0.0,0.0,0.0,6183645.0,25.184316,22.857507,0.0
2,2020-05-15,Solar_Panel_3,0.0,0.0,0.0,6987759.0,25.184316,22.857507,0.0
3,2020-05-15,Solar_Panel_4,0.0,0.0,0.0,7602960.0,25.184316,22.857507,0.0
4,2020-05-15,Solar_Panel_5,0.0,0.0,0.0,7158964.0,25.184316,22.857507,0.0


In [4]:
# Count the missing values in every column and print the tally
nulos = df_GD1_with_weather_clean.isnull().sum()
print(nulos)

# Fill the gaps in the float64/int64 columns with each column's own mean.
# The result is assigned back into df_GD1_with_weather_clean, so re-running this
# cell on the same kernel will report zero nulls above.
numerical_cols = df_GD1_with_weather_clean.select_dtypes(include=['float64', 'int64']).columns
df_GD1_with_weather_clean[numerical_cols] = df_GD1_with_weather_clean[numerical_cols].fillna(
    df_GD1_with_weather_clean[numerical_cols].mean()
)

# df_GD1_limpio is a second name for the same, now-imputed frame (not a copy)
df_GD1_limpio = df_GD1_with_weather_clean

# Keep the preview as the cell's LAST expression: a bare expression in the middle of
# a cell is evaluated but never rendered, so the preview was previously invisible.
df_GD1_limpio.head()


DATE_TIME              0
SOURCE_KEY             0
DC_POWER               0
AC_POWER               0
DAILY_YIELD            0
TOTAL_YIELD            0
AMBIENT_TEMPERATURE    4
MODULE_TEMPERATURE     4
IRRADIATION            4
dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# Predictor columns; the regression target is DC_POWER.
# NOTE(review): AC_POWER is used to predict DC_POWER. If AC power is derived from DC
# power this is target leakage and would explain the near-perfect scores - confirm intent.
features = [
    'AC_POWER',
    'DAILY_YIELD',
    'TOTAL_YIELD',
    'AMBIENT_TEMPERATURE',
    'MODULE_TEMPERATURE',
    'IRRADIATION',
]
X = df_GD1_limpio[features]
Y = df_GD1_limpio['DC_POWER']

# Hold out 30% of the rows for testing (70% train / 30% test), seeded for repeatability
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Learn the min-max scaling from the training rows only, then apply it to both partitions
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a linear regression on the scaled training features
model = LinearRegression().fit(X_train_scaled, Y_train)

# Predict the held-out rows and report the mean squared error
Y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(Y_test, Y_pred)
print(f"Error Cuadrático Medio (MSE): {mse}")

# Fitted coefficients, one per entry of `features`, in the same order
print("Coeficientes del modelo:", model.coef_)



Error Cuadrático Medio (MSE): 174.3629889280183
Coeficientes del modelo: [ 1.44092901e+04  4.53196859e+00  8.74380261e-02  8.01616096e-01
 -4.32499644e+01  7.88357169e+01]


___
# Random Forest

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Random forest of 350 trees with a fixed seed, trained on the scaled training features
RF_model = RandomForestRegressor(n_estimators=350, random_state=42).fit(X_train_scaled, Y_train)

# Score the forest on the held-out (scaled) test rows
RF_y_pred = RF_model.predict(X_test_scaled)
RF_mse = mean_squared_error(y_true=Y_test, y_pred=RF_y_pred)
RF_r2 = r2_score(y_true=Y_test, y_pred=RF_y_pred)
RF_mae = mean_absolute_error(y_true=Y_test, y_pred=RF_y_pred)

# Report the three metrics; R^2 is rounded to two decimals, the errors to three
print(f'Random Forest - Error Cuadrático Medio (MSE): {RF_mse:.3f}')
print(f'Random Forest - Coeficiente de determinación (R^2):{RF_r2:.2f}')
print(f'Random Forest - Error Absoluto Medio (MAE): {RF_mae:.3f}')

Random Forest - Error Cuadrático Medio (MSE): 96.703
Random Forest - Coeficiente de determinación (R^2):1.00
Random Forest - Error Absoluto Medio (MAE): 1.354


## Resultado del modelo Random Forest

Random Forest - Error Cuadrático Medio (MSE): 96.70265742067654

Random Forest - Coeficiente de determinación (R^2): 0.9999940139142317

Random Forest - Error Absoluto Medio (MAE): 1.3542036240045914

___
# Validación cruzada

In [7]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest on the scaled training set, scored by R^2.
# Despite its name, CV_model holds the array of per-fold scores, not a fitted model.
CV_model = cross_val_score(
    estimator=RF_model,
    X=X_train_scaled,
    y=Y_train,
    scoring='r2',
    cv=10,
)

# Show every fold's score, then their average
print(f"Cross-Validation R² scores: {CV_model}")
print(f"R² medio en validación cruzada: {CV_model.mean()}")

Cross-Validation R² scores: [0.99999922 0.99999955 0.99999896 0.9999992  0.99999904 0.99999958
 0.99999957 0.99999949 0.9999987  0.99999925]
R² medio en validación cruzada: 0.9999992555155119


## Resultados de la Validación Cruzada

Cross-Validation R² scores: [0.99999922 0.99999955 0.99999896 0.9999992  0.99999904 0.99999958 0.99999957 0.99999949 0.9999987  0.99999925]

R² medio en validación cruzada: 0.9999992555155119