### Forecasting with XGBoost for the whole dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pickle
import pandas as pd
import xgboost as xgb 
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

In [2]:
# Load the pickled dataset into `df`; later cells index it by its 'node' column.
with open('all_data.pkl', mode='rb') as pkl_in:
    df = pickle.load(pkl_in)

In [3]:
# Look up the neighbor entry stored for one node.
def get_neighbors(data, n):
    """Return the 'neib_node' value of the first row whose 'node' equals ``n``.

    ``data`` must provide the columns 'node' and 'neib_node'.  The stored
    value is returned as-is (presumably a list of neighbor node names --
    confirm against the dataset).

    Raises IndexError when no row matches ``n``.
    """
    neighbor_cells = data.loc[data['node'] == n, 'neib_node']
    as_list = neighbor_cells.values.flatten().tolist()
    return as_list[0]

In [4]:
# Collect the price values stored for one node.
def get_prices(data, n):
    """Return the values of every column from position 4 onward for node ``n``.

    Rows are selected by ``data['node'] == n``; the selected cells are
    flattened row by row into a plain Python list.  An empty list is
    returned when no row matches.
    """
    is_node = data['node'] == n
    price_columns = data.columns[4:]  # the first four columns are not prices
    selected = data.loc[is_node, price_columns]
    return selected.values.flatten().tolist()

In [5]:
# Forecasts and returns RMSE and MAPE for a provided dataframe
def forecast_XGBoost(data, forecast_days, lags=None, neighbors=None, neighbors_lag=366):
    """Fit an XGBoost regressor and score it on the last ``forecast_days`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'price' (target), 'day_of_week', 'month'
        and 'year', plus one column per entry of ``neighbors``.  The frame
        is copied, so the caller's object is left untouched.
    forecast_days : int
        Number of trailing rows held out as the test set (must be >= 1).
    lags : list of int, optional
        Every value ``k`` adds a feature 'price_lag_k' holding 'price'
        shifted by ``k`` rows.  Defaults to no lag features.
    neighbors : list, optional
        Names of columns in ``data`` used as extra features.  Each one is
        shifted by ``neighbors_lag`` rows before training.  Defaults to none.
    neighbors_lag : int
        Shift applied to every neighbor column.

    Returns
    -------
    tuple
        ``(rmse, mape)`` on the test rows, rounded to 2 and 3 decimals.

    Raises
    ------
    ValueError
        If ``forecast_days`` is smaller than 1, or if too few rows remain
        after dropping NaNs to leave at least one training row.
    """
    # None instead of mutable [] defaults; copies keep the caller's lists intact.
    lags = [] if lags is None else list(lags)
    neighbors = [] if neighbors is None else list(neighbors)
    if forecast_days < 1:
        raise ValueError("forecast_days must be at least 1")

    data = data.copy()
    # creating lag columns for the node's own price
    for i_lag in lags:
        data['price_lag_' + str(i_lag)] = data['price'].shift(i_lag)
    # lagging the neighbors' price columns in place
    for neib in neighbors:
        data[neib] = data[neib].shift(neighbors_lag)
    # Shifting introduces leading NaNs.  Note that rows with a NaN in ANY
    # column are dropped, including columns that are not used as features.
    data.dropna(inplace=True)
    if len(data) <= forecast_days:
        raise ValueError(
            "not enough rows left after dropping NaNs: "
            + str(len(data)) + " row(s) for forecast_days=" + str(forecast_days)
        )

    # chronological train/test split: the last `forecast_days` rows are the test set
    train, test = data[:-forecast_days], data[-forecast_days:]

    # preparing features list
    lags_col_names = ['price_lag_' + str(x) for x in lags]
    features = ['day_of_week', 'month', 'year'] + lags_col_names + neighbors
    target = 'price'

    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    # NOTE(review): eval_set is the training data itself, so early stopping
    # monitors the training error rather than a held-out error.  Kept as-is to
    # preserve results; consider a separate validation slice.
    model = xgb.XGBRegressor(booster='gbtree', base_score=0.5, max_depth=3,
                             early_stopping_rounds=50)
    model = model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train)],
                      verbose=False)
    yhat = model.predict(X_test)

    # accuracy metrics
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rmse_res = mse(y_test, yhat) ** 0.5
    mape_res = mape(y_test, yhat)

    return round(rmse_res, 2), round(mape_res, 3)

In [6]:
# all data file
# NOTE: pickle.load can execute arbitrary code; only load a trusted 'all_data.pkl'.
with open('all_data.pkl', 'rb') as f:
    df = pickle.load(f)

# settings
lags = [366]
number_of_neighbors = 1
list_of_nodes = df['node'].tolist() #[100419]
# Number of daily observations expected per node; used both for the date
# index and for validating neighbor series, so the two cannot drift apart.
n_days = 1429
dates = pd.date_range(start='2019-12-05', periods=n_days)
forecast_size = 30

# resulting dict and aux counters
result = {'node':[], 'RMSE_1':[], 'MAPE_1':[], 'RMSE_2':[], 'MAPE_2':[]}
balance = 0  # +1 when adding neighbors lowers the RMSE, -1 otherwise (ties count as -1)
k = len(list_of_nodes)  # nodes still to process, shown in the progress line

for node in list_of_nodes:
    prices = get_prices(df, node)
    # the 'data' column holds the dates; calendar features are derived from it
    curr_df = pd.DataFrame(data={'data': dates, 'price': prices})
    curr_df['day_of_week'] = curr_df['data'].dt.dayofweek
    curr_df['month'] = curr_df['data'].dt.month
    curr_df['year'] = curr_df['data'].dt.year

    # Add neighbor price columns.  Neighbors whose series does not have exactly
    # n_days values (e.g. unknown nodes, which yield an empty list) are skipped.
    # The kept list preserves the original neighbor order and drops duplicates,
    # so the feature column order is reproducible from run to run.
    neighbors = []
    for neighbor in get_neighbors(df, node)[:number_of_neighbors]:
        p = get_prices(df, neighbor)
        if len(p) == n_days and neighbor not in neighbors:
            curr_df[neighbor] = p
            neighbors.append(neighbor)

    # forecast with the node's own price lag(s) only
    # (a baseline without any lag is forecast_XGBoost(curr_df, forecast_size))
    rmse_1, mape_1 = forecast_XGBoost(curr_df, forecast_size, lags)

    # forecast with the node's own price lag(s) and the neighbors' prices
    rmse_2, mape_2 = forecast_XGBoost(curr_df, forecast_size, lags, neighbors=neighbors)

    result['node'].append(node)
    result['RMSE_1'].append(rmse_1)
    result['MAPE_1'].append(mape_1)
    result['RMSE_2'].append(rmse_2)
    result['MAPE_2'].append(mape_2)

    # interim results
    if rmse_1 > rmse_2:
        balance += 1
    else:
        balance -= 1

    k -= 1
    print("Npde:",str(node) + " --> k:"+ str(k) + " --> Success:"+str(balance)+"   ", end="\r")

Npde: 1006113 --> k:0 --> Success:-527     

In [7]:
# Persist the collected per-node metrics dictionary.
with open('all_data_XGB_1_neighb.pkl', mode='wb') as pkl_out:
    pickle.dump(result, pkl_out)

In [8]:
# Turn the metrics dictionary into a table, one column per key.
# NOTE: this rebinds `df`, which until here held the dataset from 'all_data.pkl'.
df = pd.DataFrame(result)

In [9]:
# Median RMSE over all nodes: lag-only model (RMSE_1) vs. lag + neighbors model (RMSE_2).
df['RMSE_1'].median(), df['RMSE_2'].median()

(135.09, 135.49)