In [100]:
import pandas as pd
import numpy as np
import math
import time
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import grangercausalitytests
from sklearn import preprocessing
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [101]:
data = pd.read_excel('../../data/market_data.xlsx')
training = data.loc[(data['Time (UTC+10)'] >= '2018-01-01') &
                            (data['Time (UTC+10)'] < '2021-01-01')]
training_vic = training[['Time (UTC+10)', 
                         'Regions VIC Trading Price ($/MWh)',
                         'Regions VIC Trading Total Intermittent Generation (MW)',
                         'Regions VIC Operational Demand (MW)']]

training_tas = training[['Time (UTC+10)', 
                         'Regions TAS Trading Price ($/MWh)',
                         'Regions TAS Trading Total Intermittent Generation (MW)',
                         'Regions TAS Operational Demand (MW)']]

training_sa = training[['Time (UTC+10)', 
                         'Regions SA Trading Price ($/MWh)',
                         'Regions SA Trading Total Intermittent Generation (MW)',
                         'Regions SA Operational Demand (MW)']]

training_nsw = training[['Time (UTC+10)', 
                         'Regions NSW Trading Price ($/MWh)',
                         'Regions NSW Trading Total Intermittent Generation (MW)',
                         'Regions NSW Operational Demand (MW)']]

In [102]:
test = pd.read_csv('../../data/bonus_testing.csv')

In [103]:
test.head(10)

Unnamed: 0,Time (UTC+10),Regions NSW Trading Total Intermittent Generation (MW),Regions SA Trading Total Intermittent Generation (MW),Regions TAS Trading Total Intermittent Generation (MW),Regions VIC Trading Total Intermittent Generation (MW),Regions NSW Operational Demand (MW),Regions SA Operational Demand (MW),Regions TAS Operational Demand (MW),Regions VIC Operational Demand (MW)
0,2021-07-01 00:00:00,4.35,4.76,83.7,184.6,8521,1706,1197,5410
1,2021-07-01 00:30:00,5.26,4.92,77.71,175.53,8402,1679,1154,5192
2,2021-07-01 01:00:00,5.05,4.73,73.68,130.56,8256,1638,1161,5016
3,2021-07-01 01:30:00,4.62,7.19,76.4,120.55,8031,1566,1146,4837
4,2021-07-01 02:00:00,4.12,7.45,78.61,125.84,7765,1502,1117,4661
5,2021-07-01 02:30:00,3.3,4.63,80.06,108.33,7476,1457,1129,4523
6,2021-07-01 03:00:00,3.85,3.21,76.6,94.59,7207,1422,1126,4429
7,2021-07-01 03:30:00,4.39,1.7,77.45,96.93,7002,1375,1125,4363
8,2021-07-01 04:00:00,0.15,0.59,75.99,92.76,6905,1354,1110,4329
9,2021-07-01 04:30:00,-0.13,5.06,71.92,84.65,6890,1338,1135,4355


In [104]:
training_vic = training_vic.set_index('Time (UTC+10)')
training_tas = training_tas.set_index('Time (UTC+10)')
training_sa = training_sa.set_index('Time (UTC+10)')
training_nsw = training_nsw.set_index('Time (UTC+10)')

In [105]:
#Check causality
max_lags=3
y='Regions VIC Trading Price ($/MWh)'
for i in range(len(training_vic.columns)-1):
    result = grangercausalitytests(training_vic[[y,training_vic.columns[i+1]]],max_lags,verbose=False)
    p_value=[round(result[i+1][0]['ssr_ftest'][1],4) for i in range(max_lags)]
    print('Column - {}:p-values - {}'.format(training_vic.columns[i+1],p_value))
max_lags=3
y='Regions TAS Trading Price ($/MWh)'
for i in range(len(training_tas.columns)-1):
    result = grangercausalitytests(training_tas[[y,training_tas.columns[i+1]]],max_lags,verbose=False)
    p_value=[round(result[i+1][0]['ssr_ftest'][1],4) for i in range(max_lags)]
    print('Column - {}:p-values - {}'.format(training_tas.columns[i+1],p_value))
y='Regions SA Trading Price ($/MWh)'
for i in range(len(training_sa.columns)-1):
    result = grangercausalitytests(training_sa[[y,training_sa.columns[i+1]]],max_lags,verbose=False)
    p_value=[round(result[i+1][0]['ssr_ftest'][1],4) for i in range(max_lags)]
    print('Column - {}:p-values - {}'.format(training_sa.columns[i+1],p_value))
y='Regions NSW Trading Price ($/MWh)'
for i in range(len(training_nsw.columns)-1):
    result = grangercausalitytests(training_nsw[[y,training_nsw.columns[i+1]]],max_lags,verbose=False)
    p_value=[round(result[i+1][0]['ssr_ftest'][1],4) for i in range(max_lags)]
    print('Column - {}:p-values - {}'.format(training_nsw.columns[i+1],p_value))

Column - Regions VIC Trading Total Intermittent Generation (MW):p-values - [0.0003, 0.0006, 0.0026]
Column - Regions VIC Operational Demand (MW):p-values - [0.0, 0.0, 0.0]
Column - Regions TAS Trading Total Intermittent Generation (MW):p-values - [0.0, 0.0, 0.0]
Column - Regions TAS Operational Demand (MW):p-values - [0.0861, 0.0, 0.0]
Column - Regions SA Trading Total Intermittent Generation (MW):p-values - [0.0, 0.0, 0.0]
Column - Regions SA Operational Demand (MW):p-values - [0.0, 0.0, 0.0]
Column - Regions NSW Trading Total Intermittent Generation (MW):p-values - [0.844, 0.0953, 0.0716]
Column - Regions NSW Operational Demand (MW):p-values - [0.0, 0.0, 0.0]


In [107]:
## The P_value of 'Trading Total Intermittent Generation' & 'Operational Demand' < 0.05
## Use these two feature to fit model and find a lag which gives lowest aic & bic

model_vic=VAR(training_vic,freq="0.5H")
model_tas=VAR(training_tas,freq="0.5H")
model_sa=VAR(training_sa,freq="0.5H")
model_nsw=VAR(training_nsw,freq="0.5H")

In [89]:
##Select order of model by checking the AIC
model_vic.select_order(250).summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,34.45,34.45,9.158e+14,34.45
1.0,26.10,26.10,2.160e+11,26.10
2.0,24.91,24.91,6.583e+10,24.91
3.0,24.90,24.90,6.491e+10,24.90
4.0,24.89,24.90,6.476e+10,24.90
5.0,24.88,24.89,6.400e+10,24.88
6.0,24.88,24.89,6.381e+10,24.88
7.0,24.88,24.89,6.361e+10,24.88
8.0,24.87,24.89,6.341e+10,24.88


In [61]:
model_tas.select_order(80).summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,26.51,26.51,3.256e+11,26.51
1.0,20.04,20.04,5.038e+08,20.04
2.0,19.23,19.24,2.257e+08,19.24
3.0,19.19,19.20,2.160e+08,19.19
4.0,19.18,19.18,2.131e+08,19.18
5.0,19.17,19.17,2.106e+08,19.17
6.0,19.16,19.17,2.085e+08,19.16
7.0,19.15,19.17,2.082e+08,19.16
8.0,19.15,19.16,2.073e+08,19.15


In [62]:
model_sa.select_order(80).summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,31.83,31.83,6.669e+13,31.83
1.0,24.27,24.28,3.481e+10,24.27
2.0,23.62,23.62,1.804e+10,23.62
3.0,23.59,23.60,1.765e+10,23.60
4.0,23.58,23.59,1.749e+10,23.59
5.0,23.58,23.59,1.747e+10,23.59
6.0,23.58,23.59,1.746e+10,23.59
7.0,23.58,23.59,1.743e+10,23.58
8.0,23.57,23.59,1.730e+10,23.58


In [66]:
model_nsw.select_order(250).summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,32.78,32.78,1.725e+14,32.78
1.0,25.56,25.57,1.266e+11,25.56
2.0,23.90,23.90,2.395e+10,23.90
3.0,23.86,23.87,2.313e+10,23.87
4.0,23.79,23.80,2.150e+10,23.79
5.0,23.78,23.79,2.128e+10,23.78
6.0,23.78,23.79,2.121e+10,23.78
7.0,23.77,23.79,2.114e+10,23.78
8.0,23.77,23.78,2.103e+10,23.77


In [108]:
## choose the lag with lowest AIC and build model
result_vic=model_vic.fit(243)
result_tas=model_vic.fit(73)
result_sa=model_vic.fit(52)
result_nsw=model_vic.fit(248)

In [109]:
## I use last number of lag rows to predict future rows
## The reason might be the data that we have is not able to completely predict price, the price is pretty sporadic.
##pridict spot price from 2021/7/1 to 2021/8/11
pridict_vic=result_vic.forecast(training_vic.values[-243:],steps=2016)[:,0]
pridict_tas=result_tas.forecast(training_tas.values[-73:],steps=2016)[:,0]
pridict_sa=result_sa.forecast(training_sa.values[-52:],steps=2016)[:,0]
pridict_nsw=result_nsw.forecast(training_nsw.values[-248:],steps=2016)[:,0]
test['Regions VIC Trading Price ($/MWh)']=np.round(pridict_vic,2)
test['Regions TAS Trading Price ($/MWh)']=np.round(pridict_tas,2)
test['Regions SA Trading Price ($/MWh)']=np.round(pridict_sa,2)
test['Regions NSW Trading Price ($/MWh)']=np.round(pridict_nsw,2)

In [110]:
test.head(10)

Unnamed: 0,Time (UTC+10),Regions NSW Trading Total Intermittent Generation (MW),Regions SA Trading Total Intermittent Generation (MW),Regions TAS Trading Total Intermittent Generation (MW),Regions VIC Trading Total Intermittent Generation (MW),Regions NSW Operational Demand (MW),Regions SA Operational Demand (MW),Regions TAS Operational Demand (MW),Regions VIC Operational Demand (MW),Regions VIC Trading Price ($/MWh),Regions TAS Trading Price ($/MWh),Regions SA Trading Price ($/MWh),Regions NSW Trading Price ($/MWh)
0,2021-07-01 00:00:00,4.35,4.76,83.7,184.6,8521,1706,1197,5410,68.85,28.73,23.41,71.1
1,2021-07-01 00:30:00,5.26,4.92,77.71,175.53,8402,1679,1154,5192,44.0,-2.73,-16.48,62.41
2,2021-07-01 01:00:00,5.05,4.73,73.68,130.56,8256,1638,1161,5016,55.59,-22.52,-43.32,82.25
3,2021-07-01 01:30:00,4.62,7.19,76.4,120.55,8031,1566,1146,4837,59.82,-34.02,-58.02,71.95
4,2021-07-01 02:00:00,4.12,7.45,78.61,125.84,7765,1502,1117,4661,51.57,-44.24,-55.64,67.79
5,2021-07-01 02:30:00,3.3,4.63,80.06,108.33,7476,1457,1129,4523,29.34,-50.53,-60.67,74.14
6,2021-07-01 03:00:00,3.85,3.21,76.6,94.59,7207,1422,1126,4429,61.57,-63.24,-74.92,72.76
7,2021-07-01 03:30:00,4.39,1.7,77.45,96.93,7002,1375,1125,4363,106.87,-68.0,-61.38,61.88
8,2021-07-01 04:00:00,0.15,0.59,75.99,92.76,6905,1354,1110,4329,122.11,-74.28,-55.86,71.3
9,2021-07-01 04:30:00,-0.13,5.06,71.92,84.65,6890,1338,1135,4355,141.86,-63.28,-52.1,88.63


In [111]:
test.to_csv('../../data/Bonus_prediction.csv')