### Necessary packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import datetime, timedelta
import statsmodels.api as sm

### Dataset

In [2]:
# Read the NSW forecasting dataset from CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific — consider a
# configurable data directory so the notebook runs elsewhere.
NSW_DATASET_CSV = '/Users/cececarino/Desktop/PE/Spot price forecasting/[Final] Datasets/NSW Forecasting Dataset.csv'

df = pd.read_csv(NSW_DATASET_CSV)
df.head()

Unnamed: 0,I,ROOFTOP,ACTUAL,2,INTERVAL_DATETIME,REGIONID,POWER,QI,TYPE,LASTCHANGED,REGION,SETTLEMENTDATE,TOTALDEMAND,RRP,PERIODTYPE
0,D,ROOFTOP,ACTUAL,2.0,2022-10-13 07:00:00,NSW1,588.646,1.0,MEASUREMENT,2022/10/13 07:19:45,NSW1,2022-10-13 07:00:00,8011.43,108.35,TRADE
1,D,ROOFTOP,ACTUAL,2.0,2022-10-13 07:30:00,NSW1,938.043,1.0,MEASUREMENT,2022/10/13 07:49:44,NSW1,2022-10-13 07:30:00,7605.55,119.79,TRADE
2,D,ROOFTOP,ACTUAL,2.0,2022-10-13 08:00:00,NSW1,1299.804,1.0,MEASUREMENT,2022/10/13 08:19:44,NSW1,2022-10-13 08:00:00,7215.51,-0.02,TRADE
3,D,ROOFTOP,ACTUAL,2.0,2022-10-13 08:30:00,NSW1,1622.183,1.0,MEASUREMENT,2022/10/13 08:49:50,NSW1,2022-10-13 08:30:00,7169.87,128.25,TRADE
4,D,ROOFTOP,ACTUAL,2.0,2022-10-13 09:00:00,NSW1,1781.731,1.0,MEASUREMENT,2022/10/13 09:19:48,NSW1,2022-10-13 09:00:00,7110.1,150.75,TRADE


### Filtering to daylight hours (08:00–18:00)

In [3]:
# Parse INTERVAL_DATETIME into pandas datetimes so the .dt accessor is available
df['INTERVAL_DATETIME'] = pd.to_datetime(df['INTERVAL_DATETIME'])

# Time-of-day window to keep: 08:00 through 18:00, both bounds included
start_time = pd.to_datetime('08:00:00').time()
end_time = pd.to_datetime('18:00:00').time()

# Build the row mask from the time-of-day part of each timestamp, then apply it
interval_time = df['INTERVAL_DATETIME'].dt.time
in_window = (interval_time >= start_time) & (interval_time <= end_time)
df = df[in_window]

# Preview the first rows of the filtered frame
df.head()

Unnamed: 0,I,ROOFTOP,ACTUAL,2,INTERVAL_DATETIME,REGIONID,POWER,QI,TYPE,LASTCHANGED,REGION,SETTLEMENTDATE,TOTALDEMAND,RRP,PERIODTYPE
2,D,ROOFTOP,ACTUAL,2.0,2022-10-13 08:00:00,NSW1,1299.804,1.0,MEASUREMENT,2022/10/13 08:19:44,NSW1,2022-10-13 08:00:00,7215.51,-0.02,TRADE
3,D,ROOFTOP,ACTUAL,2.0,2022-10-13 08:30:00,NSW1,1622.183,1.0,MEASUREMENT,2022/10/13 08:49:50,NSW1,2022-10-13 08:30:00,7169.87,128.25,TRADE
4,D,ROOFTOP,ACTUAL,2.0,2022-10-13 09:00:00,NSW1,1781.731,1.0,MEASUREMENT,2022/10/13 09:19:48,NSW1,2022-10-13 09:00:00,7110.1,150.75,TRADE
5,D,ROOFTOP,ACTUAL,2.0,2022-10-13 09:30:00,NSW1,1889.334,1.0,MEASUREMENT,2022/10/13 09:49:47,NSW1,2022-10-13 09:30:00,6934.22,100.3,TRADE
6,D,ROOFTOP,ACTUAL,2.0,2022-10-13 10:00:00,NSW1,1952.567,1.0,MEASUREMENT,2022/10/13 10:20:34,NSW1,2022-10-13 10:00:00,7013.66,150.75,TRADE


In [4]:
# Build df2 as a new frame with PVdemand = POWER / TOTALDEMAND appended;
# df itself is left unchanged.
df2 = df.assign(PVdemand=df['POWER'] / df['TOTALDEMAND'])
df2.head()

Unnamed: 0,I,ROOFTOP,ACTUAL,2,INTERVAL_DATETIME,REGIONID,POWER,QI,TYPE,LASTCHANGED,REGION,SETTLEMENTDATE,TOTALDEMAND,RRP,PERIODTYPE,PVdemand
2,D,ROOFTOP,ACTUAL,2.0,2022-10-13 08:00:00,NSW1,1299.804,1.0,MEASUREMENT,2022/10/13 08:19:44,NSW1,2022-10-13 08:00:00,7215.51,-0.02,TRADE,0.18014
3,D,ROOFTOP,ACTUAL,2.0,2022-10-13 08:30:00,NSW1,1622.183,1.0,MEASUREMENT,2022/10/13 08:49:50,NSW1,2022-10-13 08:30:00,7169.87,128.25,TRADE,0.22625
4,D,ROOFTOP,ACTUAL,2.0,2022-10-13 09:00:00,NSW1,1781.731,1.0,MEASUREMENT,2022/10/13 09:19:48,NSW1,2022-10-13 09:00:00,7110.1,150.75,TRADE,0.250592
5,D,ROOFTOP,ACTUAL,2.0,2022-10-13 09:30:00,NSW1,1889.334,1.0,MEASUREMENT,2022/10/13 09:49:47,NSW1,2022-10-13 09:30:00,6934.22,100.3,TRADE,0.272465
6,D,ROOFTOP,ACTUAL,2.0,2022-10-13 10:00:00,NSW1,1952.567,1.0,MEASUREMENT,2022/10/13 10:20:34,NSW1,2022-10-13 10:00:00,7013.66,150.75,TRADE,0.278395


## OLS model ( $PV$ )

In [5]:
# Hold out 20% of the rows as a test set; random_state is pinned so the
# split is repeatable.
# NOTE(review): this is a random split over time-ordered rows — confirm that
# mixing time periods between train and test is intended for this analysis.
train_case, test_case = train_test_split(df2, random_state=142, test_size=0.2)

for label, subset in (('Train shape: ', train_case), ('Test shape: ', test_case)):
    print(label, subset.shape)

Train shape:  (6765, 16)
Test shape:  (1692, 16)


In [6]:
# Single-feature linear regression: RRP explained by POWER.
# Target and feature are taken from the train/test partitions.
y_train1 = train_case['RRP']
X_train1 = train_case[['POWER']]  # double brackets keep a 2-D DataFrame

y_test1 = test_case['RRP']
X_test1 = test_case[['POWER']]

reg_1 = linear_model.LinearRegression()
# results1 holds whatever fit() returns; reg_1 is the object used for prediction
results1 = reg_1.fit(X_train1, y_train1)

In [7]:
# Evaluate the POWER-only model on the held-out test rows.
predicted1 = reg_1.predict(X_test1)

# Use the mean_squared_error helper already imported at the top of the
# notebook rather than re-deriving the formula by hand.
mse1 = mean_squared_error(y_test1, predicted1)
rmse1 = np.sqrt(mse1)  # RMSE is in the same units as RRP
print("reg MSE:", mse1)
print("reg Root MSE:", rmse1)

reg MSE: 7665.313288912498
reg Root MSE: 87.55177490441012


In [8]:
# Refit the same RRP ~ POWER relationship with statsmodels to get the full
# regression summary table.
# add_constant supplies the intercept column; X_train1 is rebound to the
# augmented design matrix.
X_train1 = sm.add_constant(X_train1)

model1 = sm.OLS(endog=y_train1, exog=X_train1)
# results1 is rebound here to the statsmodels fit result
results1 = model1.fit()

print(results1.summary())

                            OLS Regression Results                            
Dep. Variable:                    RRP   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     198.8
Date:                Tue, 05 Dec 2023   Prob (F-statistic):           1.65e-44
Time:                        09:52:06   Log-Likelihood:                -51788.
No. Observations:                6765   AIC:                         1.036e+05
Df Residuals:                    6763   BIC:                         1.036e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        251.2706     12.491     20.116      0.0

## OLS model ( $\frac{\text{PV}}{\text{demand}}$ )

In [9]:
# Single-feature linear regression: RRP explained by PVdemand.
# Target and feature are taken from the train/test partitions.
y_train2 = train_case['RRP']
X_train2 = train_case[['PVdemand']]  # double brackets keep a 2-D DataFrame

y_test2 = test_case['RRP']
X_test2 = test_case[['PVdemand']]

reg_2 = linear_model.LinearRegression()
# results2 holds whatever fit() returns; reg_2 is the object used for prediction
results2 = reg_2.fit(X_train2, y_train2)

In [10]:
# Evaluate the PVdemand-only model on the held-out test rows.
predicted2 = reg_2.predict(X_test2)

# Use the mean_squared_error helper already imported at the top of the
# notebook rather than re-deriving the formula by hand.
mse2 = mean_squared_error(y_test2, predicted2)
rmse2 = np.sqrt(mse2)  # RMSE is in the same units as RRP
print("reg MSE:", mse2)
print("reg Root MSE:", rmse2)

reg MSE: 7388.671089731619
reg Root MSE: 85.95737949548962


In [11]:
# Refit the same RRP ~ PVdemand relationship with statsmodels to get the full
# regression summary table.
# add_constant supplies the intercept column; X_train2 is rebound to the
# augmented design matrix.
X_train2 = sm.add_constant(X_train2)

model2 = sm.OLS(endog=y_train2, exog=X_train2)
# results2 is rebound here to the statsmodels fit result
results2 = model2.fit()

print(results2.summary())

                            OLS Regression Results                            
Dep. Variable:                    RRP   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.025
Method:                 Least Squares   F-statistic:                     178.0
Date:                Tue, 05 Dec 2023   Prob (F-statistic):           4.29e-40
Time:                        09:52:06   Log-Likelihood:                -51798.
No. Observations:                6765   AIC:                         1.036e+05
Df Residuals:                    6763   BIC:                         1.036e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        215.7273     10.768     20.035      0.0