## Exploring forecasting of various home price metrics with PyTorch
See: https://github.com/jdb78/pytorch-forecasting

In [29]:
# imports for training
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

# import dataset, network to train and metric to optimize
from pytorch_forecasting import (
    TimeSeriesDataSet,
    TemporalFusionTransformer,
    QuantileLoss,
)

# standard imports
import warnings

import numpy as np
import pandas as pd

# project-local helper that returns the database connection
from capston_db_conn import db_conn

# connection object handed to pandas when querying
conn = db_conn()

warnings.filterwarnings("ignore")  # avoid printing out absolute paths

In [30]:
# Query text: every column of the redfin_county_full table (columns are narrowed
# down later in pandas).
# NOTE: add filters on SQL query once determined they are needed
rf_sql = """
select
*
from redfin_county_full
"""

# Load the query result into a DataFrame, parsing both period columns as datetimes.
redfin = pd.read_sql(
    sql=rf_sql,
    con=conn,
    parse_dates=["period_start", "period_end"],
)
# The connection is not closed here (conn.close() is not called in this cell).


In [31]:
# redfin.head()

In [32]:
# Columns selected from the Redfin frame for modelling, in the order they are
# kept in the resulting DataFrame.
cols = (
    # series identifiers and observation date ("property_type_id" is currently left out)
    ["county_fips", "period_end", "property_type"]
    # median price columns
    + ["median_sale_price", "median_list_price", "median_ppsf", "median_list_ppsf"]
    # sales volume and supply columns
    + ["homes_sold", "pending_sales", "new_listings", "inventory", "months_of_supply"]
    # remaining market columns
    + [
        "median_dom",
        "avg_sale_to_list",
        "sold_above_list",
        "price_drops",
        "off_market_in_two_weeks",
    ]
)

In [36]:
# load data: this is a pandas dataframe with at least a column for
# * the target (what you want to predict)
# * the timeseries ID (which should be a unique string to identify each timeseries)
# * the time of the observation (which should be a monotonically increasing integer)
#
# .copy() makes `data` an independent frame. Adding columns to the bare
# `redfin[cols]` slice is the chained-assignment pattern that raises pandas'
# SettingWithCopyWarning (silenced in this notebook by the global warnings filter).
data = redfin[cols].copy()

# county_fips is part of the timeseries ID and is used as a categorical feature;
# pytorch-forecasting expects categoricals as strings, so cast it explicitly.
# NOTE(review): if the FIPS codes were loaded as integers, leading zeros are
# already gone — confirm that is acceptable for this analysis.
data["county_fips"] = data["county_fips"].astype(str)

# add time index: month count (year * 12 + month), shifted so the earliest
# period_end in the data maps to 0
data["time_idx"] = data["period_end"].dt.year * 12 + data["period_end"].dt.month
data["time_idx"] -= data["time_idx"].min()
# calendar month as a string-backed categorical column
data["month"] = data["period_end"].dt.month.astype(str).astype("category")


In [37]:
# preview the first five rows of the prepared frame (includes time_idx and month)
data.head(n=5)

Unnamed: 0,county_fips,period_end,property_type,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,months_of_supply,median_dom,avg_sale_to_list,sold_above_list,price_drops,off_market_in_two_weeks,time_idx,month
0,42129,2015-02-28,All Residential,118000.0,129900.0,60.869565,55.444444,199.0,56.0,239.0,,11.1,158.0,0.941476,0.095477,0.066093,0.053571,37,2
1,51057,2012-08-31,All Residential,104750.0,159925.0,74.496753,102.758612,4.0,1.0,10.0,51.0,12.8,152.0,0.926464,0.0,,0.0,7,8
2,21203,2021-01-31,Single Family Residential,123500.0,216950.0,93.489583,101.938339,6.0,5.0,4.0,28.0,4.7,51.0,0.948879,0.0,,0.0,108,1
3,34039,2021-08-31,Condo/Co-op,347500.0,449949.5,267.456359,291.364705,6.0,8.0,14.0,45.0,7.5,45.0,0.989805,0.166667,0.111111,0.0,115,8
4,37063,2019-11-30,Townhouse,255500.0,267635.0,148.613518,147.895603,89.0,70.0,63.0,253.0,2.8,47.0,0.995308,0.292135,0.189723,0.128571,94,11


In [None]:
from pytorch_forecasting.data import TimeSeriesDataSet, GroupNormalizer

# define the dataset, i.e. add metadata to pandas dataframe for the model to understand it
max_encoder_length = 36  # time steps of history fed to the encoder
max_prediction_length = 12  # time steps to forecast
# time_idx counts months, so this holds out the last `max_prediction_length`
# months of the data from training
training_cutoff = data["time_idx"].max() - max_prediction_length

# NOTE(review): the rendered data.head() output shows NaN in some metric columns
# (e.g. inventory, price_drops). pytorch-forecasting rejects NaN in real-valued
# inputs, so only the target is listed as an unknown real below; impute or drop
# missing values before adding further metrics.
# NOTE(review): categoricals must be string/category dtype — confirm county_fips
# is not numeric. If some series skip months, allow_missing_timesteps=True is
# presumably needed as well — verify against the data.
training = TimeSeriesDataSet(
    # filter on the integer time index (the frame has no `date` column)
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",  # column name of time of observation
    target="median_sale_price",  # column name of target to predict
    group_ids=["property_type", "county_fips"],  # column name(s) for timeseries IDs
    max_encoder_length=max_encoder_length,  # how much history to use
    max_prediction_length=max_prediction_length,  # how far to predict into future
    # covariates static for a timeseries ID
    static_categoricals=["property_type", "county_fips"],
    static_reals=[],
    # covariates known and unknown in the future to inform prediction
    # (the "month" column built alongside time_idx could be listed as a known categorical)
    time_varying_known_categoricals=[],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=["median_sale_price"],
)
