In [2]:
import ast
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
import seaborn as sns

%matplotlib inline

# Import all data

In [3]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# grouped price data, one pickle file per dataset
resale_flat_transactions_df_grouped_dict = _read_pickle(
    "data/resale_flat_transactions_df_grouped_dict.pkl"
)
private_rental_df_grouped_dict = _read_pickle(
    "data/private_rental_df_grouped_dict.pkl"
)
private_transactions_df_grouped_dict = _read_pickle(
    "data/private_transactions_df_grouped_dict.pkl"
)
flat_rental_df_grouped_dict = _read_pickle(
    "data/flat_rental_df_grouped_dict.pkl"
)

# dict of dict of dataframes, keyed by dataset name
data_dict = dict(
    resale_flat_transactions=resale_flat_transactions_df_grouped_dict,
    private_rental=private_rental_df_grouped_dict,
    private_transactions=private_transactions_df_grouped_dict,
    flat_rental=flat_rental_df_grouped_dict,
)

# load models
SARIMAX_models = _read_pickle("models/SARIMAX_models.pkl")
SARIMA_models = _read_pickle("models/SARIMA_models.pkl")


# find out which models are the best for each district

In [4]:
# Load the model evaluation table; the first CSV column is used as the index.
evaluations_df = pd.read_csv('eval/SARIMA_SARIMAX_LSTM_evaluations.csv', index_col=0)

# convert Best model to list
# The column is expected to hold the text of a Python list literal. Parse it
# with ast.literal_eval, which only accepts literals, instead of eval(), which
# would execute any expression that happens to be in the CSV.
evaluations_df['Best Model'] = evaluations_df['Best Model'].apply(ast.literal_eval)

evaluations_df

Unnamed: 0,RMSE_SARIMAX,RMSE_SARIMAX_L,RMSE_SARIMA,RMSE_SARIMA_L,RMSE_Baseline,Best Model,LSTM,LSTM_LI
1,55011.41962,80023.212541,80701.954465,80023.212541,80023.212541,[RMSE_SARIMAX],452249.188317,149832.667778
2,165305.580841,169804.96623,165305.580841,169804.96623,104978.301266,[RMSE_Baseline],942151.447461,481880.597194
3,46309.954068,46309.954068,46309.954068,46309.954068,50876.39239,"[RMSE_SARIMAX, RMSE_SARIMAX_L, RMSE_SARIMA, RM...",187180.716545,187180.716545
4,71366.794416,71366.794416,71366.794416,71366.794416,81127.689438,"[RMSE_SARIMAX, RMSE_SARIMAX_L, RMSE_SARIMA, RM...",270948.225586,270948.225586
5,89702.490928,89702.490928,89702.490928,89702.490928,89702.490928,[RMSE_Baseline],244527.719401,244527.719401
7,114492.833878,116480.290194,114492.833878,116480.290194,137234.259556,"[RMSE_SARIMAX, RMSE_SARIMA]",478284.915731,188147.553663
8,66238.380103,69236.24035,66238.380103,69236.24035,77919.581903,"[RMSE_SARIMAX, RMSE_SARIMA]",572042.354448,187852.819913
10,140841.580233,140841.580233,140841.580233,140841.580233,119792.108558,[RMSE_Baseline],275993.260684,275993.260684
12,57111.964315,57111.964315,57111.964315,57111.964315,62698.457674,"[RMSE_SARIMAX, RMSE_SARIMAX_L, RMSE_SARIMA, RM...",346430.901811,346430.901811
13,78583.628941,78583.628941,78583.628941,78583.628941,112172.824089,"[RMSE_SARIMAX, RMSE_SARIMAX_L, RMSE_SARIMA, RM...",223126.427789,223126.427789


In [5]:
# when several models tie for best, favor the simpler one (Baseline first, then SARIMA over SARIMAX)

def pick_simple_model(x):
    """Return the most preferred model name found in ``x``.

    Parameters
    ----------
    x : list of str, or str
        Candidate model names. A single name passed as a plain string is
        treated as a one-element list.

    Returns
    -------
    str or None
        The first entry of the preference order below that is present in
        ``x``, or ``None`` when none of them is.
    """
    # Preference order, most preferred first.
    rank = ['RMSE_Baseline', 'RMSE_SARIMA', 'RMSE_SARIMAX', 'RMSE_SARIMA_L', 'RMSE_SARIMAX_L', 'RMSE_LSTM']
    # On a bare string, `in` is a substring test, so 'RMSE_SARIMA' would match
    # 'RMSE_SARIMAX' (e.g. when this is applied again to an already collapsed
    # column). Wrapping the string makes the membership test use whole names.
    candidates = [x] if isinstance(x, str) else x
    for i in rank:
        if i in candidates:
            return i
    # Nothing in the preference order matched.
    return None

# Collapse each row's list of candidate best models into the single name
# chosen by pick_simple_model.
tied_best_models = evaluations_df['Best Model']
evaluations_df['Best Model'] = tied_best_models.apply(pick_simple_model)  # type: ignore

evaluations_df

Unnamed: 0,RMSE_SARIMAX,RMSE_SARIMAX_L,RMSE_SARIMA,RMSE_SARIMA_L,RMSE_Baseline,Best Model,LSTM,LSTM_LI
1,55011.41962,80023.212541,80701.954465,80023.212541,80023.212541,RMSE_SARIMAX,452249.188317,149832.667778
2,165305.580841,169804.96623,165305.580841,169804.96623,104978.301266,RMSE_Baseline,942151.447461,481880.597194
3,46309.954068,46309.954068,46309.954068,46309.954068,50876.39239,RMSE_SARIMA,187180.716545,187180.716545
4,71366.794416,71366.794416,71366.794416,71366.794416,81127.689438,RMSE_SARIMA,270948.225586,270948.225586
5,89702.490928,89702.490928,89702.490928,89702.490928,89702.490928,RMSE_Baseline,244527.719401,244527.719401
7,114492.833878,116480.290194,114492.833878,116480.290194,137234.259556,RMSE_SARIMA,478284.915731,188147.553663
8,66238.380103,69236.24035,66238.380103,69236.24035,77919.581903,RMSE_SARIMA,572042.354448,187852.819913
10,140841.580233,140841.580233,140841.580233,140841.580233,119792.108558,RMSE_Baseline,275993.260684,275993.260684
12,57111.964315,57111.964315,57111.964315,57111.964315,62698.457674,RMSE_SARIMA,346430.901811,346430.901811
13,78583.628941,78583.628941,78583.628941,78583.628941,112172.824089,RMSE_SARIMA,223126.427789,223126.427789


In [6]:
# Number of rows (districts) assigned to each model type.
best_model_counts = evaluations_df['Best Model'].value_counts()
best_model_counts

RMSE_Baseline    13
RMSE_SARIMA       8
RMSE_SARIMAX      2
Name: Best Model, dtype: int64

In [7]:
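# one model per district per dataset: 28 * 4 = 112 models in total
# NOTE(review): the value_counts output above adds up to 23 districts (23 * 4 = 92) - confirm the district count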

# Model names in preference order (same order as the list inside pick_simple_model).
rank = list((
    "RMSE_Baseline", "RMSE_SARIMA", "RMSE_SARIMAX",
    "RMSE_SARIMA_L", "RMSE_SARIMAX_L", "RMSE_LSTM",
))

# fit data with sarima of defined order (and optional seasonal order) and return model
def sarima_fitter(data, order, seasonal_order=(0, 0, 0, 0)):
    """Fit a ``pm.arima.ARIMA`` model to ``data`` and return it.

    Parameters
    ----------
    data
        Observations to fit; passed unchanged to ``model.fit``.
    order : tuple
        Non-seasonal order (p, d, q).
    seasonal_order : tuple, optional
        Seasonal order (P, D, Q, s). Defaults to (0, 0, 0, 0), the value this
        function used to hard-code, so two-argument calls behave as before.

    Returns
    -------
    The fitted ``pm.arima.ARIMA`` instance.
    """
    # initialize model with order and seasonal order
    model = pm.arima.ARIMA(
        order=order,
        seasonal_order=seasonal_order
    )
    model.fit(data)
    return model

# fit data with sarimax of defined order and seasonal order and return model
def sarimax_fitter(data, exo, order, seasonal_order):
    """Fit a ``pm.arima.ARIMA`` model with exogenous regressors and return it.

    ``data`` is the series to fit and ``exo`` is handed to ``fit`` as ``X``.
    ``order`` is (p, d, q) and ``seasonal_order`` is (P, D, Q, s).
    """
    hyperparams = {"order": order, "seasonal_order": seasonal_order}
    fitted = pm.arima.ARIMA(**hyperparams)
    fitted.fit(data, X=exo)
    return fitted

# baseline fitter returns a baseline model
def baseline_fitter(data):
    """Fit the baseline model, ARIMA(0, 1, 0) with seasonal order (0, 0, 0, 0), to ``data`` and return it."""
    baseline_order = (0, 1, 0)
    no_seasonal_terms = (0, 0, 0, 0)
    baseline = pm.arima.ARIMA(order=baseline_order, seasonal_order=no_seasonal_terms)
    baseline.fit(data)
    return baseline


In [8]:
# create a dictionary of district and best model type
district_best_model = dict(zip(evaluations_df.index, evaluations_df['Best Model']))

# Model types the loop below can fit. Any other value used to fall through
# every branch and a None "model" was pickled without an error, so check all
# districts up front and fail before anything is fitted or written.
supported_models = ("RMSE_Baseline", "RMSE_SARIMA", "RMSE_SARIMAX")
unsupported = {
    district: best_model
    for district, best_model in district_best_model.items()
    if best_model not in supported_models
}
if unsupported:
    raise ValueError(
        f"No fitter available for the best model of these districts: {unsupported}"
    )

# for all districts and for all data, fit the best model and save the model
for district, best_model in district_best_model.items():
    for price_name, price_dict in data_dict.items():
        # get the data for the district
        data = price_dict[district]
        prices = data['price']
        # every column other than the price is passed as an exogenous feature
        features = data.drop(columns='price')
        # fit the model
        if best_model == "RMSE_Baseline":
            model = baseline_fitter(prices)
        elif best_model == "RMSE_SARIMA":
            order = SARIMA_models[district]['order']
            model = sarima_fitter(prices, order)
        else:  # "RMSE_SARIMAX" (the only remaining supported value)
            order = SARIMAX_models[district]['order']
            seasonal_order = SARIMAX_models[district]['seasonal_order']
            model = sarimax_fitter(prices, features, order, seasonal_order)
        # save the model, creating the per-dataset folder on first use
        model_path = f"final_models/{price_name}/{district}_{price_name}_{best_model}.pkl"
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, "wb") as f:
            pickle.dump(model, f)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_date