# Predicting the market trend using LSTM
In this notebook, I will use LSTM to predict the daily average selling price for different properties in Boston. I will compare the performance of the LSTM to the baseline model, which is to always use the previous daily average to predict the next daily average.

In [99]:
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import sys
sys.path.append('../functions/')

from lstm_functions import data_splitting,data_scaler,data_reshaping,create_dataset,\
    build_lstm
from sklearn.metrics import r2_score,mean_squared_error
import matplotlib.pyplot as plt


property_type = 'townhouse'

df_features = pd.read_csv('../data/processed/' + 'Boston_%s_feature_matrix.csv'%property_type,index_col=0)
df_features = df_features.sort_values('SOLD DATE')

feature_names = df_features.columns
print(feature_names)

df_features['$/SQUARE FEET'] = df_features['SOLD PRICE']/df_features['SQUARE FEET']

Index(['convenience', 'supermarket', 'park', 'school', 'station',
       'stop_position', 'LIST PRICE', 'SQUARE FEET', 'YEAR BUILT', 'BEDS',
       'BATHS', 'LOT SIZE', 'HOA/MONTH', 'EST $ TREND', 'REMARKS_1',
       'REMARKS_2', 'REMARKS_3', 'REMARKS_4', 'REMARKS_5', 'REMARKS_6',
       'REMARKS_7', 'REMARKS_8', 'REMARKS_9', 'REMARKS_10', 'SOLD PRICE',
       'SOLD DATE'],
      dtype='object')


In [100]:
N = len(df)
train_size = int(0.6 * N)
test_size = N - train_size

print(df_features[['SOLD DATE','$/SQUARE FEET']])
df = df_features[['SOLD DATE','$/SQUARE FEET']].groupby('SOLD DATE').mean()


train, test = data_splitting(df['$/SQUARE FEET'], test_size=0.4)
train_index = train.index
test_index = test.index

train = train.values.reshape(-1,1)
test = test.values.reshape(-1,1)

scaler = data_scaler(train)
train = scaler.transform(train)
test = scaler.transform(test)

look_back = 5
Xtrain,Ytrain = create_dataset(train,look_back=look_back)
Xtest,Ytest = create_dataset(test,look_back=look_back)

Xtrain = data_reshaping(Xtrain,extend_axis='timestep')
Xtest = data_reshaping(Xtest,extend_axis='timestep')

lstm = build_lstm(10, look_back, 1)
lstm.fit(Xtrain, Ytrain, epochs=20, batch_size=1, verbose=2)

Ytrain_pred = lstm.predict(Xtrain)
Ytest_pred = lstm.predict(Xtest)

Ytrain_pred = scaler.inverse_transform(Ytrain_pred.reshape(-1, 1)).reshape(-1,)
Ytest_pred = scaler.inverse_transform(Ytest_pred.reshape(-1, 1)).reshape(-1,)
Ytrain = scaler.inverse_transform(Ytrain.reshape(-1, 1)).reshape(-1,)
Ytest = scaler.inverse_transform(Ytest.reshape(-1, 1)).reshape(-1,)

       SOLD DATE  $/SQUARE FEET
4793  2017-01-20     660.858816
6665  2017-01-31     459.183673
9092  2017-01-31     432.002384
6314  2017-02-01     490.463215
7148  2017-02-07     277.737226
...          ...            ...
4623  2020-01-03     391.280045
4584  2020-01-03     371.075167
4545  2020-01-10     346.607670
4300  2020-01-16     370.707779
4316  2020-01-17     176.402545

[545 rows x 2 columns]
Epoch 1/20
 - 2s - loss: 0.0632
Epoch 2/20
 - 1s - loss: 0.0557
Epoch 3/20
 - 1s - loss: 0.0543
Epoch 4/20
 - 1s - loss: 0.0542
Epoch 5/20
 - 1s - loss: 0.0533
Epoch 6/20
 - 1s - loss: 0.0502
Epoch 7/20
 - 1s - loss: 0.0517
Epoch 8/20
 - 1s - loss: 0.0513
Epoch 9/20
 - 1s - loss: 0.0517
Epoch 10/20
 - 1s - loss: 0.0512
Epoch 11/20
 - 1s - loss: 0.0503
Epoch 12/20
 - 1s - loss: 0.0505
Epoch 13/20
 - 1s - loss: 0.0512
Epoch 14/20
 - 1s - loss: 0.0506
Epoch 15/20
 - 1s - loss: 0.0507
Epoch 16/20
 - 1s - loss: 0.0506
Epoch 17/20
 - 1s - loss: 0.0503
Epoch 18/20
 - 1s - loss: 0.0506
Epoch 1

In [101]:
def predict_daily_price(df_est_price,df_property):

    dict_date_price = {df_est_price.index[i]:
                  price for i,price in enumerate((df_est_price))}

    df_est_price = df_property['SOLD DATE'].apply(
        lambda x: dict_date_price[x] if x in dict_date_price else None)
    df_est_price.name = 'EST $/SQUARE FEET'

    df_joint = df_property[['SOLD PRICE','SQUARE FEET']].join(df_est_price,how='inner')
    df_joint['EST PRICE'] = df_joint['SQUARE FEET']*df_joint['EST $/SQUARE FEET']

    df_joint = df_joint.dropna()

    return df_joint['EST PRICE'].values, df_joint['SOLD PRICE'].values


new_test_index = test_index[look_back+1:]

df_Ytest = pd.DataFrame(Ytest,new_test_index,columns=['EST $/SQUARE FEET'])
df_Ypred = pd.DataFrame(Ytest_pred,new_test_index,columns=['EST $/SQUARE FEET'])

Ytest_lstm,Ytest_pred_lstm = predict_daily_price(df_Ypred['EST $/SQUARE FEET'],df_features)

print('LSTM')
print('r2 score: ',r2_score(Ytest_pred_lstm,Ytest_lstm))
print('MSE: ',mean_squared_error(Ytest_pred_lstm,Ytest_lstm))
print('MAE: ',np.mean(np.abs(Ytest_pred_lstm-Ytest_lstm)/Ytest_lstm))

Ytest_pred_naive = df_features['SOLD PRICE'][:-1].values
Ytest_naive = df_features['SOLD PRICE'][1:].values

print('Baseline')
print('r2 score: ',r2_score(Ytest_naive,Ytest_pred_naive))
print('MSE: ',mean_squared_error(Ytest_naive,Ytest_pred_naive))
print('MAE: ',np.mean(np.abs(Ytest_naive-Ytest_pred_naive)/Ytest_naive))

LSTM
r2 score:  -187.7392507824864
MSE:  16288600755080.67
MAE:  0.4004682970186796
Baseline
r2 score:  -0.9726182026257582
MSE:  161402049723.86765
MAE:  0.5079202636275907


The MAE for condo is 0.45 (baseline) and 0.29 (LSTM).

The MAE for single family residential is 0.36 (baseline) and 0.22 (LSTM).

The MAE for townhouse is 0.51 (baseline) and 0.40 (LSTM).

Despite a significant lift compared to baseline, LSTM predictions for all property types have a negative r2 score.