### Forecasting with SARIMAX for the whole dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape
import statsmodels.api as sm
import pickle

In [2]:
# SARIMAX hyper-parameters read by both forecasting functions in this notebook.
order = (
    5,  # p: autoregressive order
    1,  # d: differencing order
    1,  # q: moving-average order
)
seasonal_order = (
    1,  # P: seasonal autoregressive order
    1,  # D: seasonal differencing order
    1,  # Q: seasonal moving-average order
    7,  # m: number of observations per seasonal cycle
)

In [3]:
# Load the full dataset used by the rest of the notebook into `df`.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('all_data.pkl', 'rb') as f:
    df = pickle.load(f)
# Uncomment to preview the first rows:
# df.head(2)

In [4]:
def get_neighbors(data, n):
    """Return the neighbor entry stored for node ``n``.

    Selects the rows of ``data`` whose ``node`` column equals ``n`` and
    returns the ``neib_node`` value of the first matching row.

    Raises:
        IndexError: if no row of ``data`` has ``node == n``.
    """
    is_target_node = data['node'] == n
    neighbor_cells = data.loc[is_target_node, 'neib_node']
    as_list = neighbor_cells.values.flatten().tolist()
    return as_list[0]

In [5]:
def get_prices(data, n):
    """Return the prices of node ``n`` as one flat list.

    Every column from the fifth one onward is treated as a price column.
    The values of all rows whose ``node`` equals ``n`` are flattened row by
    row; an unknown node yields an empty list.
    """
    price_columns = data.columns[4:]
    node_rows = data.loc[data['node'] == n, price_columns]
    return node_rows.to_numpy().ravel().tolist()

In [6]:
def forecast_SARIMAX(data, forecast_days):
    """Fit SARIMAX on ``data['price']`` alone and score a hold-out forecast.

    The last ``forecast_days`` rows are held out as the test set and the
    model is fitted on all rows before them, using the module-level
    ``order`` and ``seasonal_order`` and no exogenous regressors.

    Args:
        data: DataFrame with a ``price`` column. The split is purely
            positional, so rows are assumed to be in chronological order.
        forecast_days: number of trailing rows to hold out and forecast;
            must be positive.

    Returns:
        Tuple ``(rmse, mape)``: RMSE rounded to 2 decimals and MAPE rounded
        to 3 decimals, both computed on the held-out rows.

    Raises:
        ValueError: if ``forecast_days`` is not positive.
    """
    # data[:-0] would be an empty training set, so reject it up front.
    if forecast_days <= 0:
        raise ValueError("forecast_days must be a positive integer")

    # test/train split (slices are only read, so no defensive copy is needed)
    train, test = data[:-forecast_days], data[-forecast_days:]

    # model
    model_fit = sm.tsa.statespace.SARIMAX(
        endog=train['price'],
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
    ).fit()
    yhat = model_fit.forecast(forecast_days)

    # accuracy metrics
    # RMSE is taken as sqrt(MSE): the `squared=False` flag was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so this form works on every version.
    rmse_res = mse(test['price'], yhat) ** 0.5
    mape_res = mape(test['price'], yhat)

    return round(rmse_res, 2), round(mape_res, 3)

In [7]:
def forecast_SARIMAX_lag(data, forecast_days, lag=366, neighbors=None):
    """Fit SARIMAX on ``data['price']`` with lagged neighbor prices as exog.

    Each column named in ``neighbors`` is shifted down by ``lag`` rows, so
    every target row is paired with the neighbor value observed ``lag`` rows
    earlier. Rows containing NaN in *any* column (which includes the first
    ``lag`` rows of the shifted columns) are dropped, then the last
    ``forecast_days`` rows are held out as the test set. The model uses the
    module-level ``order`` and ``seasonal_order``.

    Args:
        data: DataFrame with a ``price`` column and one column per entry in
            ``neighbors``. The caller's frame is not modified.
        forecast_days: number of trailing rows to hold out and forecast;
            must be positive.
        lag: number of rows by which the neighbor columns are shifted.
        neighbors: iterable of column names to use as exogenous regressors.
            ``None`` or an empty iterable fits the model without exog.

    Returns:
        Tuple ``(rmse, mape)``: RMSE rounded to 2 decimals and MAPE rounded
        to 3 decimals, both computed on the held-out rows.

    Raises:
        ValueError: if ``forecast_days`` is not positive.
    """
    # data[:-0] would be an empty training set, so reject it up front.
    if forecast_days <= 0:
        raise ValueError("forecast_days must be a positive integer")

    # A fresh list avoids the shared-mutable-default pitfall and guarantees
    # that `data[neighbors]` below selects columns rather than a single key.
    neighbors = [] if neighbors is None else list(neighbors)

    # The shifting below must not leak into the caller's frame.
    data = data.copy()

    # creating lag columns for neighbor prices by shifting them in place
    for neib in neighbors:
        data[neib] = data[neib].shift(lag)
    # removing the NaN rows introduced by the shift
    data.dropna(inplace=True)
    data.reset_index(inplace=True, drop=True)

    # test/train split
    train, test = data[:-forecast_days], data[-forecast_days:]

    # With no usable neighbors, pass exog=None instead of a zero-column frame.
    exog_train = train[neighbors] if neighbors else None
    exog_test = test[neighbors] if neighbors else None

    # model
    model_fit = sm.tsa.statespace.SARIMAX(
        endog=train['price'],
        exog=exog_train,
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
    ).fit()
    yhat = model_fit.forecast(forecast_days, exog=exog_test)

    # accuracy metrics
    # RMSE is taken as sqrt(MSE): the `squared=False` flag was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so this form works on every version.
    rmse_res = mse(test['price'], yhat) ** 0.5
    mape_res = mape(test['price'], yhat)

    return round(rmse_res, 2), round(mape_res, 3)

In [8]:
# Compare, node by node, the plain SARIMAX forecast against the one that uses
# lagged neighbor prices as exogenous regressors, accumulating the metrics in
# `result`. Nodes already present in the checkpoint file are skipped, so the
# cell can be interrupted and re-run.
warnings.filterwarnings('ignore')

# settings
number_of_neighbors = 10
list_of_nodes = df['node'].tolist()
n_periods = 1429  # expected length of every price series
dates = pd.date_range(start='2019-12-05', periods=n_periods)
forecast_size = 30
faulty_nodes = {402159, 403038, 403987}  # hard-coded nodes to skip

# Resume from the checkpoint if there is one; otherwise start from scratch.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
try:
    with open('all_data_SAR_10_neighb.pkl', 'rb') as f:
        result = pickle.load(f)
except FileNotFoundError:
    result = {'node': [], 'RMSE_1': [], 'MAPE_1': [], 'RMSE_2': [], 'MAPE_2': []}

# Set membership keeps the "already done?" check O(1) per node.
processed_nodes = set(result['node'])

balance = 0  # +1 when the neighbor model has the lower RMSE, -1 otherwise
k = len(list_of_nodes)  # nodes still to go (progress display only)

for node in list_of_nodes:
    if node in processed_nodes:
        print("skipping:", node)
        k -= 1
        continue
    if node in faulty_nodes:
        # skipping faulty node (still counted so that k reaches 0 at the end)
        k -= 1
        continue

    prices = get_prices(df, node)
    curr_df = pd.DataFrame(data={'date': dates, 'price': prices})

    # Add neighbor price columns, keeping only neighbors whose series has the
    # expected length. dict.fromkeys de-duplicates while preserving order.
    neighbors = []
    for neighbor in dict.fromkeys(get_neighbors(df, node)[:number_of_neighbors]):
        neighbor_prices = get_prices(df, neighbor)
        if len(neighbor_prices) == n_periods:
            curr_df[neighbor] = neighbor_prices
            neighbors.append(neighbor)

    # forecast w/o neighbors
    rmse_1, mape_1 = forecast_SARIMAX(curr_df, forecast_size)

    # forecast with neighbors
    rmse_2, mape_2 = forecast_SARIMAX_lag(curr_df, forecast_size, neighbors=neighbors)

    result['node'].append(node)
    result['RMSE_1'].append(rmse_1)
    result['MAPE_1'].append(mape_1)
    result['RMSE_2'].append(rmse_2)
    result['MAPE_2'].append(mape_2)
    processed_nodes.add(node)

    # interim results
    if rmse_1 > rmse_2:
        balance += 1
    else:
        balance -= 1

    k -= 1
    print("Node:", str(node) + " --> k:" + str(k) + " --> Success:" + str(balance) + "   ", end="\r")

skipping: 100001
skipping: 100003
skipping: 100004
skipping: 100005
skipping: 100006
skipping: 100007
skipping: 100008
skipping: 100009
skipping: 100010
skipping: 100011
skipping: 100012
skipping: 100013
skipping: 100015
skipping: 100016
skipping: 100017
skipping: 100018
skipping: 100019
skipping: 100020
skipping: 100021
skipping: 100022
skipping: 100023
skipping: 100024
skipping: 100025
skipping: 100026
skipping: 100027
skipping: 100028
skipping: 100029
skipping: 100030
skipping: 100031
skipping: 100032
skipping: 100033
skipping: 100035
skipping: 100036
skipping: 100037
skipping: 100038
skipping: 100039
skipping: 100040
skipping: 100041
skipping: 100042
skipping: 100043
skipping: 100044
skipping: 100045
skipping: 100046
skipping: 100047
skipping: 100048
skipping: 100049
skipping: 100050
skipping: 100051
skipping: 100052
skipping: 100053
skipping: 100054
skipping: 100055
skipping: 100056
skipping: 100057
skipping: 100058
skipping: 100059
skipping: 100060
skipping: 100061
skipping: 1000

skipping: 205571
skipping: 205572
skipping: 205573
skipping: 205574
skipping: 205575
skipping: 205576
skipping: 205577
skipping: 205578
skipping: 205579
skipping: 205580
skipping: 205582
skipping: 205584
skipping: 205586
skipping: 205590
skipping: 205591
skipping: 205592
skipping: 205593
skipping: 205594
skipping: 205596
skipping: 205601
skipping: 205602
skipping: 205603
skipping: 205604
skipping: 205605
skipping: 205606
skipping: 205608
skipping: 205609
skipping: 205610
skipping: 205612
skipping: 205615
skipping: 205618
skipping: 205619
skipping: 205620
skipping: 205622
skipping: 205624
skipping: 205625
skipping: 205627
skipping: 205630
skipping: 205631
skipping: 205636
skipping: 205643
skipping: 205647
skipping: 205666
skipping: 205690
skipping: 205698
skipping: 205701
skipping: 205702
skipping: 205703
skipping: 205704
skipping: 205705
skipping: 205706
skipping: 205708
skipping: 205709
skipping: 205710
skipping: 205711
skipping: 205712
skipping: 205713
skipping: 205715
skipping: 2057

skipping: 301249
skipping: 301251
skipping: 301252
skipping: 320001
skipping: 320002
skipping: 320003
skipping: 320101
skipping: 320102
skipping: 320111
skipping: 320119
skipping: 320120
skipping: 320121
skipping: 320125
skipping: 320126
skipping: 320130
skipping: 320131
skipping: 320132
skipping: 320133
skipping: 320134
skipping: 320135
skipping: 320137
skipping: 320138
skipping: 320143
skipping: 320144
skipping: 320146
skipping: 320147
skipping: 320170
skipping: 320176
skipping: 320410
skipping: 320411
skipping: 320419
skipping: 320436
skipping: 320453
skipping: 320454
skipping: 320455
skipping: 320456
skipping: 320457
skipping: 320576
skipping: 320577
skipping: 320578
skipping: 320579
skipping: 320580
skipping: 320581
skipping: 320582
skipping: 320584
skipping: 320585
skipping: 320586
skipping: 320587
skipping: 320589
skipping: 320590
skipping: 320591
skipping: 320593
skipping: 320594
skipping: 320596
skipping: 320598
skipping: 320599
skipping: 320701
skipping: 320702
skipping: 3207

skipping: 403394
skipping: 403395
skipping: 403400
skipping: 403401
skipping: 403402
skipping: 403403
skipping: 403404
skipping: 403405
skipping: 403406
skipping: 403407
skipping: 403409
skipping: 403410
skipping: 403411
skipping: 403412
skipping: 403413
skipping: 403414
skipping: 403416
skipping: 403417
skipping: 403418
skipping: 403419
skipping: 403420
skipping: 403421
skipping: 403422
skipping: 403423
skipping: 403424
skipping: 403425
skipping: 403426
skipping: 403427
skipping: 403428
skipping: 403429
skipping: 403430
skipping: 403432
skipping: 403433
skipping: 403435
skipping: 403436
skipping: 403437
skipping: 403470
skipping: 403471
skipping: 403482
skipping: 403483
skipping: 403484
skipping: 403495
skipping: 403496
skipping: 403497
skipping: 403498
skipping: 403499
skipping: 403500
skipping: 403501
skipping: 403510
skipping: 403512
skipping: 403514
skipping: 403515
skipping: 403528
skipping: 403530
skipping: 403531
skipping: 403533
skipping: 403534
skipping: 403535
skipping: 4035

KeyboardInterrupt: 

In [None]:
# Display `node`, the loop variable left over from the forecasting loop above
# (i.e. the node the loop had reached when it stopped).
node

In [9]:
# Write the accumulated `result` dict back to the file it was loaded from, so
# that a later run of the forecasting loop skips the nodes already processed.
with open('all_data_SAR_10_neighb.pkl', 'wb') as f:
    pickle.dump(result, f)

In [10]:
# Turn the per-node metrics dict into a DataFrame (one column per dict key).
# NOTE(review): this rebinds `df`, which until here held the dataset loaded
# from 'all_data.pkl'. Re-run the loading cell before re-running the
# forecasting loop, or the loop will read this metrics frame instead.
df = pd.DataFrame.from_dict(result)

In [11]:
# Mean RMSE over all processed nodes:
# (model without neighbors, model with lagged neighbor prices).
df['RMSE_1'].mean(), df['RMSE_2'].mean()

(213.23394729618448, 220.29946198188307)