In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import math
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from pprint import pprint
import pickle
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from torch.utils.data import Dataset, DataLoader

# map district name to district number
district_mapping_path = "../airflow/dags/data/districts.xlsx"

district_mapping = pd.read_excel(district_mapping_path)


def get_district_name(district_no):
    return district_mapping[district_mapping["Postal District"] == district_no][
        "General Location"
    ].values[0]

# Import all previous files

In [144]:
# open pickle
train_df_dict = pd.read_pickle('data/train_df_dict.pkl')
train_df_dict_L = pd.read_pickle('data/train_df_dict_L.pkl')
test_df_dict = pd.read_pickle('data/test_df_dict.pkl')

# import resale_flat_transactions_clean
resale_flat_transactions_clean = pd.read_csv('data/resale_flat_transactions_clean.csv')

# import features 
all_district_var_ts = pd.read_pickle('data/all_district_var_ts.pkl')

# remove the NaN values from train set
# train_df_dict_clean = {}
# for district_no, district_df in train_df_dict.items():
#     train_df_dict_clean[district_no] = district_df.dropna()

# hopefully lstm can handle the NaN values

# merge the train set with the features
train_lstm_df_dict = {}
for district_no, district_df in train_df_dict.items():
    train_lstm_df_dict[district_no] = district_df.to_frame().merge(all_district_var_ts[district_no], left_index=True, right_index=True, how='left')

train_lstm_df_dict_L = {}
for district_no, district_df in train_df_dict_L.items():
    train_lstm_df_dict_L[district_no] = district_df.to_frame().merge(all_district_var_ts[district_no], left_index=True, right_index=True, how='left')

# merge the test set with the features
test_lstm_df_dict = {}
for district_no, district_df in test_df_dict.items():
    test_lstm_df_dict[district_no] = district_df.to_frame().merge(all_district_var_ts[district_no], left_index=True, right_index=True, how='left')


In [145]:
train_lstm_df_dict[1].head()

Unnamed: 0_level_0,resale_price,resale_price_std,floor_area_sqm_median,remaining_lease_years_median,max_floor_lvl_median,precinct_pavilion_sum,commercial_sum,market_hawker_sum,miscellaneous_sum
month_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01,430000.0,,60.0,61.666667,18.0,0.0,0.0,0.0,0.0
2018-02-01,,,,,,,,,
2018-03-01,515000.0,63639.610307,98.5,58.375,20.5,0.0,0.0,0.0,0.0
2018-04-01,420000.0,137790.904393,65.0,64.916667,21.0,0.0,0.0,0.0,0.0
2018-05-01,447500.0,85559.920524,67.5,58.208333,20.5,0.0,0.0,0.0,0.0


# Normalize the data

In [149]:
# create scalers for each district
scaler_dict = {}
for district_no, district_df in train_lstm_df_dict.items():
    scaler_dict[district_no] = MinMaxScaler(feature_range=(-1, 1))
    scaler_dict[district_no].fit(district_df)

# create scalers for each district
scaler_dict_L = {}
for district_no, district_df in train_lstm_df_dict_L.items():
    scaler_dict_L[district_no] = MinMaxScaler(feature_range=(-1, 1))
    scaler_dict_L[district_no].fit(district_df)

# transform the train set
train_lstm_df_dict_scaled = {}
for district_no, district_df in train_lstm_df_dict.items():
    train_lstm_df_dict_scaled[district_no] = pd.DataFrame(
        scaler_dict[district_no].transform(district_df),
        columns=district_df.columns,
        index=district_df.index,
    )

# transform the train set
train_lstm_df_dict_scaled_L = {}
for district_no, district_df in train_lstm_df_dict_L.items():
    train_lstm_df_dict_scaled_L[district_no] = pd.DataFrame(
        scaler_dict_L[district_no].transform(district_df),
        columns=district_df.columns,
        index=district_df.index,
    )

# transform the test set
test_lstm_df_dict_scaled = {}
for district_no, district_df in test_lstm_df_dict.items():
    test_lstm_df_dict_scaled[district_no] = pd.DataFrame(
        scaler_dict[district_no].transform(district_df),
        columns=district_df.columns,
        index=district_df.index,
    )


In [151]:
train_lstm_df_dict_scaled[1].head()

Unnamed: 0_level_0,resale_price,resale_price_std,floor_area_sqm_median,remaining_lease_years_median,max_floor_lvl_median,precinct_pavilion_sum,commercial_sum,market_hawker_sum,miscellaneous_sum
month_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01,-0.434276,,-0.811765,0.4,-0.217391,-1.0,-1.0,-1.0,-1.0
2018-02-01,,,,,,,,,
2018-03-01,0.131448,-0.369697,1.0,-0.207692,0.217391,-1.0,-1.0,-1.0,-1.0
2018-04-01,-0.500832,0.746614,-0.576471,1.0,0.304348,-1.0,-1.0,-1.0,-1.0
2018-05-01,-0.317804,-0.039697,-0.458824,-0.238462,0.217391,-1.0,-1.0,-1.0,-1.0


In [167]:
train_lstm_df_dict_scaled_L[1].head()

Unnamed: 0_level_0,resale_price,resale_price_std,floor_area_sqm_median,remaining_lease_years_median,max_floor_lvl_median,precinct_pavilion_sum,commercial_sum,market_hawker_sum,miscellaneous_sum
month_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01,-0.434276,,-0.811765,0.4,-0.217391,-1.0,-1.0,-1.0,-1.0
2018-02-01,-0.151414,,,,,,,,
2018-03-01,0.131448,-0.369697,1.0,-0.207692,0.217391,-1.0,-1.0,-1.0,-1.0
2018-04-01,-0.500832,0.746614,-0.576471,1.0,0.304348,-1.0,-1.0,-1.0,-1.0
2018-05-01,-0.317804,-0.039697,-0.458824,-0.238462,0.217391,-1.0,-1.0,-1.0,-1.0


# Prepare input and output for LSTM

In [152]:
# prepare data for LSTM

# seq consists of the features and previous observations N through T where T-N is the lookback period
# lookback period is 12 months

# target consists of the observations T+1 through T+K where K is the number of steps ahead to predict (lookahead period)
# lookahead period is 3 months

# create a special dataset for LSTM
def create_dataset(X, Y, target_col, look_back):
    # dataX, dataY = [], []
    seqs = []
    for i in range(len(X)-look_back-2):
        curr_seq = (X[i:(i + look_back)])
        # dataY.append(Y[(i + look_back):(i + look_back)])
        label = Y.iloc[(i + look_back): (i+ look_back + 3)][target_col].values
        seqs.append((curr_seq, label))
    return seqs

In [153]:
train_lstm_seqs = {}
for district_no, district_df in train_lstm_df_dict.items():
    train_lstm_seqs[district_no] = create_dataset(district_df, district_df, 'resale_price', 12)

train_lstm_seqs_L = {}
for district_no, district_df in train_lstm_df_dict_L.items():
    train_lstm_seqs_L[district_no] = create_dataset(district_df, district_df, 'resale_price', 12)

test_lstm_seqs = {}
for district_no, district_df in test_lstm_df_dict.items():
    test_lstm_seqs[district_no] = create_dataset(district_df, district_df, 'resale_price', 12)


In [154]:
print(train_lstm_seqs[1][0][0].shape) # 12 time steps lookback, 9 features 

(12, 9)


In [155]:
print(train_lstm_seqs[1][0][1].shape) # 3 steps ahead

(3,)


In [156]:
print(len(train_lstm_seqs[1])) # 36 sequences

25


# Pytorch Dataset

In [157]:
class PriceDataset(Dataset):
    def __init__(self, seqs):
        self.seqs = seqs

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        seq, label = self.seqs[idx]
        return dict(
            seq=torch.tensor(seq.to_numpy(), dtype=torch.float),
            label=torch.tensor(label[-3:], dtype=torch.float)
        )

In [158]:
class PriceDataModule(pl.LightningDataModule):
    def __init__(self, train_seqs, test_seqs, batch_size=8):
        super().__init__()
        self.batch_size = batch_size
        self.train_seqs = train_seqs
        self.test_seqs = test_seqs

    def setup(self, stage=None):
        self.train_dataset = PriceDataset(self.train_seqs)
        self.test_dataset = PriceDataset(self.test_seqs)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# Pytorch Model

In [164]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        self.lstm.flatten_parameters() 
        # init hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        # init cell state 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_()
        # detach hidden state and cell state from the graph to prevent backpropagation
        out, _ = self.lstm(x, (h0.detach(), c0.detach()))
        # keep the last 3 time step of each sequence in the batch to predict the next 3 time steps
        out = self.fc(out[:, -3:, :])
        print(out)
        return out

class LSTMTrainer(pl.LightningModule):
    def __init__(self, input_size, hidden_size, num_layers, output_size, learning_rate):
        super().__init__()
        self.save_hyperparameters()
        self.model = LSTM(input_size, hidden_size, num_layers, output_size)
        self.learning_rate = learning_rate

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        seq, label = batch["seq"], batch["label"]
        # pass the sequence through the model
        out = self.model(seq)
        loss = torch.sqrt(F.mse_loss(out, label))
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        seq, label = batch["seq"], batch["label"]
        out = self.model(seq)
        # loss function RMSE 
        # compute the RMSE of the last time step of each sequence in the batch where the label is the next 3 time steps
        loss = torch.sqrt(F.mse_loss(out, label))
        self.log("val_loss", loss)

    def test_step(self, batch, batch_idx):
        seq, label = batch["seq"], batch["label"]
        out = self.model(seq)
        # loss function RMSE 
        # compute the RMSE of the last time step of each sequence in the batch where the label is the next 3 time steps
        loss = torch.sqrt(F.mse_loss(out, label))
        self.log("test_loss", loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

# Start Training

In [165]:
N_EPOCHS = 8
BATCH_SIZE = 8
INPUT_SIZE = 9
HIDDEN_SIZE = 128
NUM_LAYERS = 2
OUTPUT_SIZE = 3
LEARNING_RATE = 0.001

# initialize data module
data_module = PriceDataModule(train_lstm_seqs[1], test_lstm_seqs[1], batch_size=BATCH_SIZE)
data_module.setup()

train_dataset = PriceDataset(train_lstm_seqs[1])

# sanity check
a = iter(train_dataset)
b = next(a)
print("Sequence Shape: ", b["seq"].shape)
print("Label: {} and Label Shape: {}".format(b["label"], b["label"].shape))

# initialize the model
model = LSTMTrainer(input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, output_size=OUTPUT_SIZE, learning_rate=LEARNING_RATE)

Sequence Shape:  torch.Size([12, 9])
Label: tensor([485000., 368000., 451000.]) and Label Shape: torch.Size([3])


In [163]:
n_features = b["seq"].shape[1]
n_features

9

In [166]:
# initialize the trainer
trainer = pl.Trainer(max_epochs=N_EPOCHS)

# start training
trainer.fit(model, data_module)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
  rank_zero_warn(

  | Name  | Type | Params
-------------------------------
0 | model | LSTM | 203 K 
-------------------------------
203 K     Trainable params
0         Non-trainable params
203 K     Total params
0.815     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

tensor([[[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],

        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],

        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],

        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],

        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],

        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],

        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]],

        [[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]]], grad_fn=<AddBackward0>)


  loss = torch.sqrt(F.mse_loss(out, label))


RuntimeError: The size of tensor a (3) must match the size of tensor b (8) at non-singleton dimension 1