# Predicción de producción de energía

In [1]:
# Cargando librerias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.precision', 3)

## Cargando los datos al Dataframe

In [3]:
# Cargamos la información de todas las plantas y sensores
df_plant1_gen = pd.read_csv('DATASETS/Plant_1_Generation_Data.csv', parse_dates=["DATE_TIME"])
df_plant1_weather = pd.read_csv('DATASETS/Plant_1_Weather_Sensor_Data.csv', parse_dates=["DATE_TIME"])
df_plant2_gen = pd.read_csv('DATASETS/Plant_2_Generation_Data.csv', parse_dates=["DATE_TIME"])
df_plant2_weather = pd.read_csv('DATASETS/Plant_2_Weather_Sensor_Data.csv', parse_dates=["DATE_TIME"])

## Combinando los Dataframes para generar 1 solo por tipo

In [4]:
# Concatenamos la informacion de la planta 1 y 2
df_gen = pd.concat([df_plant1_gen, df_plant2_gen], ignore_index=True)

In [5]:
# Concatenamos la informacion de la planta 1 y 2
df_sensors = pd.concat([df_plant1_weather, df_plant2_weather], ignore_index=True)

## Uniendo los Dataframes de generación de energía y sensores

In [6]:
df = df_gen.merge(df_sensors, on=["DATE_TIME", "PLANT_ID"], suffixes=("_GENERATION", "_WEATHER"))
df.head()

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY_GENERATION,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,SOURCE_KEY_WEATHER,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.0,6260000.0,HmiyD2TTLFNqkNe,25.184,22.858,0.0
1,2020-05-15,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.0,6184000.0,HmiyD2TTLFNqkNe,25.184,22.858,0.0
2,2020-05-15,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.0,6988000.0,HmiyD2TTLFNqkNe,25.184,22.858,0.0
3,2020-05-15,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.0,7603000.0,HmiyD2TTLFNqkNe,25.184,22.858,0.0
4,2020-05-15,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.0,7159000.0,HmiyD2TTLFNqkNe,25.184,22.858,0.0


### Revisando que no haya datos nulos

In [7]:
df.isnull().sum()

DATE_TIME                0
PLANT_ID                 0
SOURCE_KEY_GENERATION    0
DC_POWER                 0
AC_POWER                 0
DAILY_YIELD              0
TOTAL_YIELD              0
SOURCE_KEY_WEATHER       0
AMBIENT_TEMPERATURE      0
MODULE_TEMPERATURE       0
IRRADIATION              0
dtype: int64

In [8]:
# Copiamos el DF original
df2 = df.copy()

In [26]:
# Seleccionamos la variable dependiente e independientes
X = df2[['DAILY_YIELD','TOTAL_YIELD','AMBIENT_TEMPERATURE','MODULE_TEMPERATURE','IRRADIATION','DC_POWER']]
y = df2[['AC_POWER']]

In [31]:
# Generamos nuestras variables de Test y Entrenamiento
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=.2, random_state=21)

In [32]:
# Generamos el modelo
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
# Score R2
score_lr = 100*lr_clf.score(X_test,y_test)
print(f'LR Model score = {score_lr:4.4f}%')

LR Model score = 83.7140%


In [33]:
# Probamos con los datos de prueba
lr = LinearRegression().fit(X_train,y_train)
y_pred_lr = lr.predict(X_test)
R2_Score_lr = round(r2_score(y_pred_lr, y_test) * 100, 2)

print("R2 Score : ",R2_Score_lr,"%")

R2 Score :  80.57 %


In [34]:
# Intercepto
lr_clf.intercept_

array([-146.00232128])

In [35]:
# Coeficientes
lr_clf.coef_

array([[ 2.39597447e-03,  1.64099542e-08,  1.32447358e+01,
        -7.63735091e+00,  9.91311839e+02,  3.98750112e-02]])

$B_0 = -146.00232127766094$

$B_1 = 2.39597447e-03$

$B_2 = 1.64099542e-08$

$B_3 = 1.32447358e+01$

$B_4 = -7.63735091e+00$

$B_5 = 9.91311839e+02$

$B_6 = 3.98750112e-02$

In [39]:
from scipy.stats import pearsonr

sc1, pval1 = pearsonr(X['DAILY_YIELD'], y['AC_POWER'])
sc2, pval2 = pearsonr(X['TOTAL_YIELD'], y['AC_POWER'])
sc3, pval3 = pearsonr(X['AMBIENT_TEMPERATURE'], y['AC_POWER'])
sc4, pval4 = pearsonr(X['MODULE_TEMPERATURE'], y['AC_POWER'])
sc5, pval5 = pearsonr(X['IRRADIATION'], y['AC_POWER'])
sc6, pval6 = pearsonr(X['DC_POWER'], y['AC_POWER'])

pvalues = np.array([pval1, pval2, pval3, pval4, pval5, pval6])

print("p-valores: {}\n".format(pvalues))

print("Coef con significancia:", pvalues[pvalues <= 0.05])

p-valores: [1.42711363e-67 3.44787612e-60 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00]

coef con significancia [1.42711363e-67 3.44787612e-60 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00]


# Formula

$\hat y =  2.39597447e^{-3}x_1 + 1.64099542e^{-8}x_2$ + 1.32447358e+01 $ x_3 $ -7.63735091e+00 $ x_4 $ + 9.91311839e+02 $ x_5 $ + 3.98750112e-02 $ x_6 $ -146.00232127766094

# Con StatsModel

In [40]:
import statsmodels.api as sm

In [43]:
modelo = sm.OLS(y, X)
res = modelo.fit()

In [44]:
res.summary()

0,1,2,3
Dep. Variable:,AC_POWER,R-squared (uncentered):,0.89
Model:,OLS,Adj. R-squared (uncentered):,0.89
Method:,Least Squares,F-statistic:,183800.0
Date:,"Tue, 06 Jun 2023",Prob (F-statistic):,0.0
Time:,23:26:35,Log-Likelihood:,-882520.0
No. Observations:,136472,AIC:,1765000.0
Df Residuals:,136466,BIC:,1765000.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
DAILY_YIELD,0.0050,0.000,33.190,0.000,0.005,0.005
TOTAL_YIELD,2.038e-08,7.38e-10,27.611,0.000,1.89e-08,2.18e-08
AMBIENT_TEMPERATURE,4.9940,0.219,22.799,0.000,4.565,5.423
MODULE_TEMPERATURE,-5.4517,0.231,-23.641,0.000,-5.904,-5.000
IRRADIATION,996.8809,6.337,157.304,0.000,984.460,1009.302
DC_POWER,0.0371,0.000,181.608,0.000,0.037,0.037

0,1,2,3
Omnibus:,69189.839,Durbin-Watson:,1.57
Prob(Omnibus):,0.0,Jarque-Bera (JB):,885258.425
Skew:,-2.14,Prob(JB):,0.0
Kurtosis:,14.72,Cond. No.,10400000000.0


In [50]:
# Significancia
res.pvalues < 0.05

DAILY_YIELD            True
TOTAL_YIELD            True
AMBIENT_TEMPERATURE    True
MODULE_TEMPERATURE     True
IRRADIATION            True
DC_POWER               True
dtype: bool

In [53]:
# R^2
res.rsquared

0.8898712741285061

In [52]:
# Coefiencientes
res.params

DAILY_YIELD            4.966e-03
TOTAL_YIELD            2.038e-08
AMBIENT_TEMPERATURE    4.994e+00
MODULE_TEMPERATURE    -5.452e+00
IRRADIATION            9.969e+02
DC_POWER               3.707e-02
dtype: float64

$\hat y =  $ 4.966e-03 $ x_1 $ + $ 2.038e^{-8}x_2  + $ 4.994e+00 $ x_3 $ - 5.452e+00 $ x_4 $ + 9.969e+02 $ x_5 $ + 3.707e-02 $ x_6 $

> El segundo modelo tiene mejor porcentaje de predicción