# Baseline forecast: given data up to time T, the forecast for times T+1 to T+N is the value at time T

In [4]:
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
import pickle

# Import all previous files

In [5]:
# Load the pickled dictionaries; each one is iterated below as
# (district number -> pandas object) pairs.
train_df_dict = pd.read_pickle('data/train_df_dict.pkl')
train_df_dict_L = pd.read_pickle('data/train_df_dict_L.pkl')
test_df_dict = pd.read_pickle('data/test_df_dict.pkl')

# Drop missing values (NaN) from every district's training data.
train_df_dict_clean = {
    key: frame.dropna() for key, frame in train_df_dict.items()
}

# Apply the same clean-up to every district's test data.
test_df_dict_clean = {
    key: frame.dropna() for key, frame in test_df_dict.items()
}

In [7]:
def baseline_forecast(train_df, test_df):
    """Return a one-step-ahead persistence (naive) forecast for ``test_df``.

    The forecast for each test timestamp is the observation immediately
    before it: the previous row of ``test_df``, or, for the very first
    test timestamp, the last row of ``train_df``.

    Args:
        train_df: pandas Series/DataFrame of training observations; only
            its last row is used.
        test_df: pandas Series/DataFrame of test observations. It is not
            modified.

    Returns:
        A new object with the same index (and columns) as ``test_df``
        holding the forecasts. If ``test_df`` has no rows, the (empty)
        shifted copy is returned as-is.

    Raises:
        IndexError: if ``test_df`` has rows but ``train_df`` is empty, so
            there is no value to seed the first forecast with.
    """
    # forecast at t is the value at t-1; shift() returns a new object
    forecast = test_df.shift(1)

    # Nothing to seed when the horizon is empty: assigning to iloc[0]
    # would raise IndexError. len() is used because pandas objects have
    # no unambiguous truth value.
    if len(forecast) == 0:
        return forecast

    # first forecast of the test set is the last value of the train set
    forecast.iloc[0] = train_df.iloc[-1]
    return forecast

# Build the baseline forecast and its RMSE for every district in one pass.
baseline_forecast_dict = {}
baseline_rmse_dict = {}
for district_no, train_df in train_df_dict_clean.items():
    test_df = test_df_dict_clean[district_no]

    # forecast for this district, kept for later inspection
    forecast = baseline_forecast(train_df, test_df)
    baseline_forecast_dict[district_no] = forecast

    # RMSE = square root of the mean squared error against the actuals
    mse = mean_squared_error(test_df, forecast)
    baseline_rmse_dict[district_no] = math.sqrt(mse)


In [8]:
# Evaluate the bare name so the notebook displays the per-district RMSE mapping.
baseline_rmse_dict

{1: 80023.2125414529,
 2: 104978.30126586478,
 3: 50876.392390446345,
 4: 81127.68943770061,
 5: 89702.49092788152,
 7: 137234.25955642416,
 8: 77919.58190339577,
 10: 119792.10855756178,
 11: 43545.95273960601,
 12: 62698.45767374718,
 13: 112172.8240885465,
 14: 83652.92076988904,
 15: 87039.20968741857,
 16: 18111.34543134036,
 17: 33465.83223229328,
 18: 16284.70826739511,
 19: 7768.517898115016,
 20: 76508.00394947795,
 21: 265477.3989909499,
 22: 16070.31047200603,
 23: 11246.758792351391,
 25: 19014.59965395012,
 27: 12410.117445052645,
 28: 16806.114561869043}

In [9]:
# Persist the per-district baseline RMSE values as a pickle file.
with open("eval/baseline_rmse_dict.pkl", mode="wb") as rmse_file:
    pickle.dump(baseline_rmse_dict, rmse_file)
