# 2 - Baseline model

- A rather naive baseline is to predict d(t+1), d(t+2) .. d(t+5) as the value d(t)

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_pickle('../data/0-df.pkl')

In [3]:
train_df = df[df.day <= 50]
eval_df = df[df.day > 50]

In [4]:
def calculate_rmse(x, y):
    return np.sum(np.absolute(x - y))/x.shape[0]

def calculate_smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.nanmean(diff)

In [5]:
# USE PREVIOUS DATAPOINT AS PREDICT VAL
baseline_rmse = []
for gh in eval_df.geohash6.unique():
    demand = eval_df[eval_df.geohash6 == gh].demand.values
    
    rmse_t_1 = calculate_rmse(demand[:-5], demand[1:-4])
    rmse_t_2 = calculate_rmse(demand[:-5], demand[2:-3])
    rmse_t_3 = calculate_rmse(demand[:-5], demand[3:-2])
    rmse_t_4 = calculate_rmse(demand[:-5], demand[4:-1])
    rmse_t_5 = calculate_rmse(demand[:-5], demand[5:])
    
    smape_t_1 = calculate_smape(demand[:-5], demand[1:-4])
    smape_t_2 = calculate_smape(demand[:-5], demand[2:-3])
    smape_t_3 = calculate_smape(demand[:-5], demand[3:-2])
    smape_t_4 = calculate_smape(demand[:-5], demand[4:-1])
    smape_t_5 = calculate_smape(demand[:-5], demand[5:])
    
    baseline_rmse.append({
        'geohash6': gh,
        'rmse(t+1)': rmse_t_1,
        'rmse(t+2)': rmse_t_2,
        'rmse(t+3)': rmse_t_3,
        'rmse(t+4)': rmse_t_4,
        'rmse(t+5)': rmse_t_5,
        'smape(t+1)': smape_t_1,
        'smape(t+2)': smape_t_2,
        'smape(t+3)': smape_t_3,
        'smape(t+4)': smape_t_4,
        'smape(t+5)': smape_t_5,
    })

baseline_rmse_df = pd.DataFrame(baseline_rmse)
baseline_rmse_df.describe()

  


Unnamed: 0,rmse(t+1),rmse(t+2),rmse(t+3),rmse(t+4),rmse(t+5),smape(t+1),smape(t+2),smape(t+3),smape(t+4),smape(t+5)
count,1329.0,1329.0,1329.0,1329.0,1329.0,1329.0,1329.0,1329.0,1329.0,1329.0
mean,0.013834,0.016883,0.018968,0.020827,0.022719,42.098473,47.293172,50.143613,52.520402,54.797069
std,0.012506,0.015951,0.018837,0.021662,0.024632,22.140788,24.159773,25.002974,25.662177,26.237707
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.003495,0.004101,0.00429,0.004392,0.004505,25.949403,30.750037,32.94461,35.290293,37.799933
50%,0.011329,0.013164,0.014004,0.014685,0.01537,44.076585,50.214083,54.137186,57.64773,61.374586
75%,0.020779,0.025035,0.027513,0.029919,0.032117,59.777084,66.124484,69.072297,71.632403,74.284731
max,0.06716,0.086754,0.106129,0.124682,0.144874,97.76401,105.237471,105.171687,110.197537,113.192732


In [6]:
baseline_rmse_df.to_csv('../metrics/baseline_using_prev_datapoint.csv', index=False)