In [3]:
import os
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Loading time series data
#
# The dataset directory defaults to the original local location, but it can be
# overridden with the SPOT_PRICE_DATA_DIR environment variable so the notebook
# runs on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    'SPOT_PRICE_DATA_DIR',
    '/Users/cececarino/Desktop/PE/Spot price forecasting/[Final] Datasets',
))

df = pd.read_csv(DATA_DIR / 'NSW Forecasting Dataset.csv')
df.head()

Unnamed: 0,I,ROOFTOP,ACTUAL,2,INTERVAL_DATETIME,REGIONID,POWER,QI,TYPE,LASTCHANGED,REGION,SETTLEMENTDATE,TOTALDEMAND,RRP,PERIODTYPE
0,D,ROOFTOP,ACTUAL,2.0,2022-10-13 07:00:00,NSW1,588.646,1.0,MEASUREMENT,2022/10/13 07:19:45,NSW1,2022-10-13 07:00:00,8011.43,108.35,TRADE
1,D,ROOFTOP,ACTUAL,2.0,2022-10-13 07:30:00,NSW1,938.043,1.0,MEASUREMENT,2022/10/13 07:49:44,NSW1,2022-10-13 07:30:00,7605.55,119.79,TRADE
2,D,ROOFTOP,ACTUAL,2.0,2022-10-13 08:00:00,NSW1,1299.804,1.0,MEASUREMENT,2022/10/13 08:19:44,NSW1,2022-10-13 08:00:00,7215.51,-0.02,TRADE
3,D,ROOFTOP,ACTUAL,2.0,2022-10-13 08:30:00,NSW1,1622.183,1.0,MEASUREMENT,2022/10/13 08:49:50,NSW1,2022-10-13 08:30:00,7169.87,128.25,TRADE
4,D,ROOFTOP,ACTUAL,2.0,2022-10-13 09:00:00,NSW1,1781.731,1.0,MEASUREMENT,2022/10/13 09:19:48,NSW1,2022-10-13 09:00:00,7110.1,150.75,TRADE


In [3]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9664 entries, 0 to 9663
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   I                  9664 non-null   object 
 1   ROOFTOP            9664 non-null   object 
 2   ACTUAL             9664 non-null   object 
 3   2                  9664 non-null   float64
 4   INTERVAL_DATETIME  9664 non-null   object 
 5   REGIONID           9664 non-null   object 
 6   POWER              9664 non-null   float64
 7   QI                 9664 non-null   float64
 8   TYPE               9664 non-null   object 
 9   LASTCHANGED        9664 non-null   object 
 10  REGION             9664 non-null   object 
 11  SETTLEMENTDATE     9664 non-null   object 
 12  TOTALDEMAND        9664 non-null   float64
 13  RRP                9664 non-null   float64
 14  PERIODTYPE         9664 non-null   object 
dtypes: float64(5), object(10)
memory usage: 1.1+ MB


## OLS ($PV$)

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): train_test_split shuffles rows by default, so the time ordering
# of the observations is not preserved in the split - confirm this is intended.
train_case, test_case = train_test_split(
    df,
    random_state=142,
    test_size=0.2,
)
print('Train shape: ', train_case.shape)
print('Test shape: ', test_case.shape)

Train shape:  (7731, 15)
Test shape:  (1933, 15)


In [15]:
# Single-regressor model: POWER is the only feature and RRP is the target.
# The double brackets keep the feature selection two-dimensional (a DataFrame).
X_train, X_test = train_case[['POWER']], test_case[['POWER']]
y_train, y_test = train_case['RRP'], test_case['RRP']

reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

In [23]:
# Evaluate the fitted model on the held-out rows.
predicted = reg.predict(X_test)

# Use the metric helper imported at the top of the notebook instead of a
# hand-written sum of squared errors.
mse = mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)
print("reg MSE:", mse)
print("reg Root MSE:", rmse)

reg MSE: 184647.8150589837
reg Root MSE: 429.7066616413849


In [25]:
# Refit the same regression with statsmodels to get the inference summary
# (coefficients, standard errors, t/F statistics). `sm` is imported in the
# import cell at the top of the notebook.
# statsmodels does not add an intercept automatically, so a constant column
# is added to the design matrix explicitly.
X_train = sm.add_constant(X_train)

model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    RRP   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     227.9
Date:                Tue, 28 Nov 2023   Prob (F-statistic):           8.90e-51
Time:                        16:10:17   Log-Likelihood:                -58382.
No. Observations:                7731   AIC:                         1.168e+05
Df Residuals:                    7729   BIC:                         1.168e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        219.4644      9.467     23.182      0.0

1. R-squared: 

- 0.029 indicates that the model explains a very small proportion of the variation in the dependent variable (RRP).

- This means that there are other factors that are more important in determining changes in RRP.

2. F-statistic: 

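- The F-statistic of 227.9 is highly significant (p-value = 8.90e-51), which means that PV generation has a statistically significant linear relationship with RRP (the slope is not zero). This does not by itself mean that the model is a good fit for the data: with an R-squared of 0.029, the overall fit is weak.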

3. t-statistic: 

- The t-statistic for the explanatory variable (POWER--Actual PV) is -15.097, which is also highly significant (p-value = 0.000). 

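- This means there is a statistically significant negative relationship between PV generation and RRP: higher PV generation is associated with lower RRP. The relationship is clearly different from zero, but, given the low R-squared, it explains little of the variation in RRP.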

4. Condition number: 

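- The condition number of 3.76e+03 is large. With several explanatory variables this can indicate strong multicollinearity, i.e. explanatory variables that are highly correlated with each other, which can make it difficult to interpret the individual coefficients.

- This model, however, has only one explanatory variable (POWER), so multicollinearity between regressors cannot be the cause. The large value instead reflects the difference in scale between the constant column and POWER; rescaling or standardizing POWER would reduce it.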

In [26]:
# Variance inflation factor for every column of the design matrix.
# `variance_inflation_factor` is imported in the import cell at the top of
# the notebook.
# add_constant leaves the matrix unchanged when a constant column is already
# present (its default behaviour), so this cell also works when it is run
# without the OLS cell above.
X_train = sm.add_constant(X_train)

vif = pd.DataFrame({
    "Variable": X_train.columns,
    "VIF": [
        variance_inflation_factor(X_train.values, col_idx)
        for col_idx, _ in enumerate(X_train.columns)
    ],
})

# Display the VIF values
print(vif)

  Variable       VIF
0    const  3.263969
1    POWER  1.000000


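- A VIF greater than 5 indicates that there is a strong correlation between the explanatory variable and the other explanatory variables.

- Here the VIF of POWER is 1.0 (it is the only explanatory variable) and the VIF of the constant is 3.26, both below 5. There is therefore no evidence of multicollinearity in this model, and no transformation of the variables is needed on that account.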

## OLS ($\text{PV}+\text{total demand}$)

In [4]:
# Load the dataset used by the multi-regressor OLS sections below.
# The dataset directory defaults to the original local location, but it can be
# overridden with the SPOT_PRICE_DATA_DIR environment variable.
df2 = pd.read_csv(
    Path(os.environ.get(
        'SPOT_PRICE_DATA_DIR',
        '/Users/cececarino/Desktop/PE/Spot price forecasting/[Final] Datasets',
    )) / 'NEM OLS.csv'
)

In [7]:
# Drop every row that contains a missing value. The result is rebound
# explicitly rather than mutating the frame with inplace=True, which makes the
# data flow visible and keeps the cell safe to re-run.
df2 = df2.dropna()

# Display the cleaned DataFrame
df2.head()

Unnamed: 0,I,ROOFTOP,ACTUAL,2,INTERVAL_DATETIME,REGIONID,POWER,QI,TYPE,LASTCHANGED,...,REGION_y,TOTALDEMAND_y,RRP_y,PERIODTYPE_y,REGION,TOTALDEMAND,RRP,PERIODTYPE,NEMDEMAND,Npercentdemand
0,D,ROOFTOP,ACTUAL,2,2023-04-18 07:30:00,NSW1,406.466,1.0,MEASUREMENT,4/18/23 7:49,...,VIC1,4947.7,99.48,TRADE,NSW1,7535.22,91.16,TRADE,18897.97,0.398732
1,D,ROOFTOP,ACTUAL,2,2023-04-18 07:30:00,NSW1,406.466,1.0,MEASUREMENT,4/18/23 7:49,...,VIC1,4947.7,99.48,TRADE,NSW1,7535.22,91.16,TRADE,18897.97,0.398732
2,D,ROOFTOP,ACTUAL,2,2023-04-18 07:30:00,NSW1,563.124,0.6,SATELLITE,4/18/23 7:50,...,VIC1,4947.7,99.48,TRADE,NSW1,7535.22,91.16,TRADE,18897.97,0.398732
3,D,ROOFTOP,ACTUAL,2,2023-04-18 07:30:00,NSW1,563.124,0.6,SATELLITE,4/18/23 7:50,...,VIC1,4947.7,99.48,TRADE,NSW1,7535.22,91.16,TRADE,18897.97,0.398732
4,D,ROOFTOP,ACTUAL,2,2023-04-22 08:00:00,NSW1,603.283,1.0,MEASUREMENT,4/22/23 8:19,...,VIC1,4561.59,59.01,TRADE,NSW1,6963.16,57.25,TRADE,16923.04,0.41146


In [8]:
# 80/20 split of the cleaned frame with the same fixed seed used for the
# single-regressor model, so the split is reproducible.
train_case, test_case = train_test_split(
    df2,
    random_state=142,
    test_size=0.2,
)
print('Train shape: ', train_case.shape)
print('Test shape: ', test_case.shape)

Train shape:  (28316, 25)
Test shape:  (7080, 25)


In [9]:
# Two-regressor model: POWER and TOTALDEMAND as features, RRP as the target.
X_train, X_test = (
    train_case[['POWER', 'TOTALDEMAND']],
    test_case[['POWER', 'TOTALDEMAND']],
)
y_train, y_test = train_case['RRP'], test_case['RRP']

reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

In [10]:
# Evaluate the two-regressor model on the held-out rows.
predicted = reg.predict(X_test)

# Use the metric helper imported at the top of the notebook instead of a
# hand-written sum of squared errors.
mse = mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)
print("reg MSE:", mse)
print("reg Root MSE:", rmse)

reg MSE: 111081.18693390192
reg Root MSE: 333.2884440449472


In [12]:
# statsmodels fit of RRP on POWER and TOTALDEMAND for the inference summary.
# `sm` is imported in the import cell at the top of the notebook.
# A constant column is added explicitly because statsmodels does not include
# an intercept on its own.
X_train = sm.add_constant(X_train)

model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    RRP   R-squared:                       0.040
Model:                            OLS   Adj. R-squared:                  0.040
Method:                 Least Squares   F-statistic:                     587.6
Date:                Wed, 29 Nov 2023   Prob (F-statistic):          9.14e-251
Time:                        16:24:27   Log-Likelihood:            -2.0520e+05
No. Observations:               28316   AIC:                         4.104e+05
Df Residuals:                   28313   BIC:                         4.104e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const        -158.3412     19.338     -8.188      

In [14]:
# Variance inflation factor for each column of the two-regressor design
# matrix. `variance_inflation_factor` is imported in the import cell at the
# top of the notebook.
# add_constant leaves the matrix unchanged when a constant column is already
# present (its default behaviour).
X_train = sm.add_constant(X_train)

vif = pd.DataFrame({
    "Variable": X_train.columns,
    "VIF": [
        variance_inflation_factor(X_train.values, col_idx)
        for col_idx, _ in enumerate(X_train.columns)
    ],
})

# Display the VIF values
print(vif)

      Variable        VIF
0        const  91.799007
1        POWER   2.178062
2  TOTALDEMAND   2.178062



**const (Constant):** 
    
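- A VIF of 91.799007 for the constant term does not indicate multicollinearity between the explanatory variables. It most likely reflects that the uncentered regressors (TOTALDEMAND in particular) have means far from zero relative to their spread, so they are nearly collinear with the intercept column. The VIF of the constant is usually ignored when assessing multicollinearity.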

**POWER:**
    
- A VIF of 2.178062 indicates that there is low to moderate multicollinearity associated with the 'POWER' variable.

- Generally, a VIF below 5 is considered acceptable, so 'POWER' seems to be reasonably independent.

**TOTALDEMAND:**
    
- Similar to 'POWER', 'TOTALDEMAND' also has a VIF of 2.178062, indicating low to moderate multicollinearity.

1. The constant term ('const') has a high VIF because the uncentered regressors are nearly collinear with the intercept column. Centering the regressors would remove this effect; on its own it does not point to a problem with the model specification or to redundant features.


2. 'POWER' and 'TOTALDEMAND' have VIFs indicating low to moderate multicollinearity, which is generally acceptable.


## OLS ($\text{PV}+\frac{\text{PV}}{\text{demand}}$)

#### Creating new column for $\frac{\text{PV}}{\text{demand}}$

In [15]:
# Ratio of POWER to TOTALDEMAND, computed row by row and stored as a new column.
df2['PVdemand'] = df2['POWER']/df2['TOTALDEMAND']

In [17]:
# Re-split so that the train and test frames include the new PVdemand column;
# the same seed and test fraction are used as in the earlier splits.
train_case, test_case = train_test_split(
    df2,
    random_state=142,
    test_size=0.2,
)
print('Train shape: ', train_case.shape)
print('Test shape: ', test_case.shape)

Train shape:  (28316, 26)
Test shape:  (7080, 26)


In [18]:
# Two-regressor model: POWER and the PVdemand ratio as features, RRP as target.
X_train, X_test = (
    train_case[['POWER', 'PVdemand']],
    test_case[['POWER', 'PVdemand']],
)
y_train, y_test = train_case['RRP'], test_case['RRP']

reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

In [19]:
# Evaluate the POWER + PVdemand model on the held-out rows.
predicted = reg.predict(X_test)

# Use the metric helper imported at the top of the notebook instead of a
# hand-written sum of squared errors.
mse = mean_squared_error(y_test, predicted)
rmse = np.sqrt(mse)
print("reg MSE:", mse)
print("reg Root MSE:", rmse)

reg MSE: 111809.41987144918
reg Root MSE: 334.3791558567148


In [20]:
# statsmodels fit of RRP on POWER and PVdemand for the inference summary.
# The intercept column has to be added explicitly to the design matrix.
X_train = sm.add_constant(X_train)

model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    RRP   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     430.2
Date:                Wed, 29 Nov 2023   Prob (F-statistic):          8.71e-185
Time:                        16:46:35   Log-Likelihood:            -2.0535e+05
No. Observations:               28316   AIC:                         4.107e+05
Df Residuals:                   28313   BIC:                         4.107e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        173.9843      4.452     39.079      0.0

In [21]:
# Variance inflation factor for each column of the POWER + PVdemand design
# matrix (constant included).
X_train = sm.add_constant(X_train)

vif = pd.DataFrame({
    "Variable": X_train.columns,
    "VIF": [
        variance_inflation_factor(X_train.values, col_idx)
        for col_idx, _ in enumerate(X_train.columns)
    ],
})

# Display the VIF values
print(vif)

   Variable        VIF
0     const   4.813719
1     POWER  12.314122
2  PVdemand  12.314122


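The VIF of the constant is low, but the VIFs of POWER and PVdemand are both greater than 10, which implies high multicollinearity. This is expected, because PVdemand is computed as POWER divided by TOTALDEMAND, so the two regressors are strongly correlated with each other.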

## OLS ($\frac{\text{PV}}{\text{Total demand}}$)

In [22]:
# Single-regressor model: the PVdemand ratio is the only feature, RRP the
# target. The train/test frames from the previous split are reused.
X_train, X_test = train_case[['PVdemand']], test_case[['PVdemand']]
y_train, y_test = train_case['RRP'], test_case['RRP']

reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

In [23]:
# statsmodels fit of RRP on PVdemand alone for the inference summary.
# The intercept column has to be added explicitly to the design matrix.
X_train = sm.add_constant(X_train)

model = sm.OLS(endog=y_train, exog=X_train)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    RRP   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     842.0
Date:                Tue, 05 Dec 2023   Prob (F-statistic):          1.83e-182
Time:                        09:25:55   Log-Likelihood:            -2.0536e+05
No. Observations:               28316   AIC:                         4.107e+05
Df Residuals:                   28314   BIC:                         4.107e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        162.2098      3.476     46.671      0.0

In [24]:
# Variance inflation factor for each column of the PVdemand-only design
# matrix (constant included).
X_train = sm.add_constant(X_train)

vif = pd.DataFrame({
    "Variable": X_train.columns,
    "VIF": [
        variance_inflation_factor(X_train.values, col_idx)
        for col_idx, _ in enumerate(X_train.columns)
    ],
})

# Display the VIF values
print(vif)

   Variable       VIF
0     const  2.931964
1  PVdemand  1.000000
