# Baseline forecast: given data up to time T, the forecast for every step from T+1 to T+N is the value observed at time T

In [2]:
import pandas as pd
import math
from sklearn.metrics import mean_squared_error
import pickle

# Load the previously saved data files

In [24]:
# load the pickled dictionaries (keyed by district number)
train_df_dict = pd.read_pickle('data/train_df_dict.pkl')
train_df_dict_L = pd.read_pickle('data/train_df_dict_L.pkl')
test_df_dict = pd.read_pickle('data/test_df_dict.pkl')

# train set with the NaN values dropped, district by district
train_df_dict_clean = {
    key: frame.dropna() for key, frame in train_df_dict.items()
}

# test set with the NaN values dropped, district by district
test_df_dict_clean = {
    key: frame.dropna() for key, frame in test_df_dict.items()
}

In [46]:
# the forecast for the whole test set is the last value of the train set
def baseline_forecast(train_df, test_df):
    """Return a naive last-value forecast aligned with the test index.

    Every forecast step repeats the final observation of ``train_df``.

    Parameters
    ----------
    train_df : pandas.Series or pandas.DataFrame
        Training observations; only the last row/value is used.
    test_df : pandas.Series or pandas.DataFrame
        Test observations; only its index and length are used.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        A Series indexed like ``test_df`` when ``train_df`` is a Series or a
        single-column DataFrame. For a DataFrame with any other number of
        columns, a DataFrame indexed like ``test_df`` with the columns of
        ``train_df`` and the last training row repeated.

    Raises
    ------
    IndexError
        If ``train_df`` is empty (there is no last value to repeat).
    """
    # get the last value of the train set
    last_value = train_df.iloc[-1]

    # For a DataFrame, ``iloc[-1]`` is a whole row (a Series), not a scalar.
    # Repeating that row inside a Series would produce an object-dtype Series
    # of Series objects, so it is handled explicitly here.
    if isinstance(last_value, pd.Series):
        if len(last_value) == 1:
            # single column: collapse the row to its only (scalar) value
            last_value = last_value.iloc[0]
        else:
            # several columns: repeat the last row for every test timestamp
            return pd.DataFrame(
                [last_value.tolist()] * len(test_df),
                index=test_df.index,
                columns=train_df.columns,
            )

    # create a series with the same index as the test set
    forecast = pd.Series(
        [last_value] * len(test_df), index=test_df.index
    )
    return forecast

# build, in a single pass over the districts, both the forecast dictionary
# and the RMSE dictionary
baseline_forecast_dict = {}
baseline_rmse_dict = {}
for district_no, train_df in train_df_dict_clean.items():
    test_df = test_df_dict_clean[district_no]

    # forecast for this district
    forecast = baseline_forecast(train_df, test_df)
    baseline_forecast_dict[district_no] = forecast

    # RMSE = square root of the mean squared error against the test set
    mse = mean_squared_error(test_df, forecast)
    baseline_rmse_dict[district_no] = math.sqrt(mse)


In [47]:
baseline_rmse_dict

{1: 80923.33914379595,
 2: 87935.36286007657,
 3: 38424.43430075885,
 4: 103630.48474266633,
 5: 72840.12645971084,
 7: 94444.8247708682,
 8: 68113.84854296029,
 10: 151364.91368433658,
 11: 58730.31585135568,
 12: 125802.90423515667,
 13: 87425.76833710604,
 14: 81622.03891515249,
 15: 60028.15602290659,
 16: 19384.003284495524,
 17: 66370.08980439049,
 18: 49620.40099606629,
 19: 60155.540494344044,
 20: 65188.051892198775,
 21: 304881.4826084064,
 22: 46597.66876143054,
 23: 54434.40241856859,
 25: 72655.85546487862,
 27: 55369.09644226943,
 28: 69407.1083487371}

In [50]:
# serialize the baseline rmse dictionary and write it to disk
with open("eval/baseline_rmse_dict.pkl", "wb") as f:
    f.write(pickle.dumps(baseline_rmse_dict))
