## Exploring forecasting various home price metric with pytorch
see: https://github.com/jdb78/pytorch-forecasting

In [24]:
# imports for training
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

# import dataset, network to train and metric to optimize
from pytorch_forecasting import (
    TimeSeriesDataSet,
    TemporalFusionTransformer,
    QuantileLoss,
)

from capston_db_conn import db_conn

conn = db_conn()
# print(conn)

# standard imports
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")  # avoid printing out absolute paths

In [25]:
rf_sql = """
select
*
from redfin_county_full
"""
# NOTE: add filters on SQL query once determined they are needed

redfin = pd.read_sql(rf_sql, con=conn, parse_dates=["period_start", "period_end"])
# conn.close()


In [26]:
# redfin.head()

In [27]:
cols = [
    "county_fips",
    "period_end",
    "property_type",
    #"property_type_id",
    "median_sale_price",
    "median_list_price",
    "median_ppsf",
    "median_list_ppsf",
    "homes_sold",
    "pending_sales",
    "new_listings",
    "inventory",
    "months_of_supply",
    "median_dom",
    "avg_sale_to_list",
    "sold_above_list",
    "price_drops",
    "off_market_in_two_weeks",
]

In [28]:
# load data: this is pandas dataframe with at least a column for
# * the target (what you want to predict)
# * the timeseries ID (which should be a unique string to identify each timeseries)
# * the time of the observation (which should be a monotonically increasing integer)
data = redfin[cols]

# add time index
data["time_idx"] = data["period_end"].dt.year * 12 + data["period_end"].dt.month
data["time_idx"] -= data["time_idx"].min()
data["month"] = data.date.dt.month.astype(str).astype("category")


AttributeError: 'DataFrame' object has no attribute 'date'

In [None]:
# data.head()

In [None]:
from pytorch_forecasting.data import TimeSeriesDataSet, GroupNormalizer

# define the dataset, i.e. add metadata to pandas dataframe for the model to understand it
max_encoder_length = 36
max_prediction_length = 12
training_cutoff = data["time_idx"].max() - max_prediction_length  # day for cutoff

training = TimeSeriesDataSet(
    data[lambda x: x.date <= training_cutoff],
    time_idx="time_idx"  # column name of time of observation
    target="median_sale_price",  # column name of target to predict
    group_ids=["property_type", "county_fips"],  # column name(s) for timeseries IDs
    max_encoder_length=max_encoder_length,  # how much history to use
    max_prediction_length=max_prediction_length,  # how far to predict into future
    # covariates static for a timeseries ID
    static_categoricals=["property_type", "county_fips"],
    static_reals=[...],
    # covariates known and unknown in the future to inform prediction
    time_varying_known_categoricals=[],
    time_varying_known_reals=[...],
    time_varying_unknown_categoricals=[...],
    time_varying_unknown_reals=[...],
)
