# Backtesting

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2

In [2]:
# Names of the top-ranked features. NOTE(review): this list is not referenced
# by any of the cells visible in this notebook — confirm whether it is still needed.
top_features = [
    'median_days_to_close_yoy',
    'median_days_to_close',
    'median_sale_ppsf',
    'median_new_listing_ppsf_yoy',
    'ZHVI',
    'average_sale_to_list_ratio_yoy',
    'active_listings_yoy',
    'age_of_inventory',
    'active_listings',
    'ZORI',
    'median_sale_ppsf_yoy',
    'age_of_inventory_yoy',
    'average_sale_to_list_ratio',
    'off_market_in_two_weeks',
    'average_pending_sales_listing_updates',
]

### Data Preparation

In [3]:
# Load the dataset and tidy it in a single chain:
#   - discard the leftover 'Unnamed: 0' column from the CSV,
#   - correct the misspelled '3monhth_treasury_yield' header,
#   - order rows chronologically by 'date'.
df = (
    pd.read_csv('../data/final_data.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'3monhth_treasury_yield': '3month_treasury_yield'})
    .sort_values(by='date')
)
df

Unnamed: 0,region_name,adjusted_average_new_listings,adjusted_average_new_listings_yoy,average_pending_sales_listing_updates,average_pending_sales_listing_updates_yoy,off_market_in_two_weeks,off_market_in_two_weeks_yoy,adjusted_average_homes_sold,adjusted_average_homes_sold_yoy,median_new_listing_price,...,AVB_adj_close_1month,AVB_adj_close_3month,INVH_adj_close,INVH_adj_close_1week,INVH_adj_close_1month,INVH_adj_close_3month,unemployment,fed_funds,3month_treasury_yield,cpi
501,Bernalillo County,232.0,0.154229,341.0,1.092025,51.0,0.040816,135.0,0.097561,225000.0,...,137.0456,149.7439,17.5995,17.7084,17.9177,17.8088,0.046,0.000104,0.0066,2.43603
3796,Multnomah County,212.0,-0.148594,279.0,-0.010638,133.0,-0.119205,193.0,0.206250,395000.0,...,137.0456,149.7439,17.5995,17.7084,17.9177,17.8088,0.046,0.000104,0.0066,2.43603
3795,Butler County,110.0,0.000000,121.0,-0.047244,5.0,4.000000,55.0,-0.179104,171650.0,...,137.0456,149.7439,17.5995,17.7084,17.9177,17.8088,0.046,0.000104,0.0066,2.43603
3781,Napa County,37.0,0.156250,21.0,-0.275862,1.0,-0.666667,23.0,-0.206897,689000.0,...,137.0456,149.7439,17.5995,17.7084,17.9177,17.8088,0.046,0.000104,0.0066,2.43603
1079,Hood County,25.0,-0.285714,35.0,0.666667,6.0,-0.142857,20.0,0.818182,232900.0,...,137.0456,149.7439,17.5995,17.7084,17.9177,17.8088,0.046,0.000104,0.0066,2.43603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109773,Yuma County,41.0,-0.127660,46.0,0.277778,5.0,-0.375000,27.0,0.421053,359777.0,...,219.6200,226.5500,36.7263,35.7540,33.7500,33.4900,0.041,0.000950,0.0533,3.15301
109772,King County,704.0,0.211704,621.0,0.205825,338.0,0.149660,457.0,0.011062,882475.0,...,219.6200,226.5500,36.7263,35.7540,33.7500,33.4900,0.041,0.000950,0.0533,3.15301
109477,Thomas County,12.0,-0.076923,10.0,-0.166667,3.0,0.000000,8.0,-0.111111,262450.0,...,219.6200,226.5500,36.7263,35.7540,33.7500,33.4900,0.041,0.000950,0.0533,3.15301
109475,Litchfield County,56.0,-0.111111,78.0,0.322034,4.0,-0.428571,49.0,0.020833,399900.0,...,219.6200,226.5500,36.7263,35.7540,33.7500,33.4900,0.041,0.000950,0.0533,3.15301


### Drop Macroeconomic and Market Indicators

In [4]:
# Remove the market-wide / macroeconomic columns (they are not county-level housing data).
# Note: re-running this cell without re-loading `df` raises KeyError, since the columns are already gone.
macro_columns = ['SPY_adj_close', 'unemployment', 'fed_funds', '3month_treasury_yield', 'cpi']
df = df.drop(columns=macro_columns)

### Regional Filter

In [5]:
# Build the working frame by removing the 3-month and 1-month close columns of every REIT.
reit_tickers = ('EQR', 'ESS', 'AVB', 'INVH')
longer_horizon_cols = [
    f'{ticker}_adj_close_{horizon}'
    for horizon in ('3month', '1month')
    for ticker in reit_tickers
]
X = df.drop(columns=longer_horizon_cols)

In [6]:
# counties for each REIT
X_AVB = X[
    (X['State'] == 'CA') | (X['State'] == 'CO') | (X['State'] == 'CT') | (X['State'] == 'DC') | (
            X['State'] == 'FL') | (X['State'] == 'MD') | (X['State'] == 'MA') | (
            X['State'] == 'NJ') | (
            X['State'] == 'NY') | (X['State'] == 'NC') | (X['State'] == 'TX') | (
            X['State'] == 'VA') | (
            X['State'] == 'WA')]
X_EQR = X[
    (X['State'] == 'CA') | (X['State'] == 'CO') | (X['State'] == 'DC') | (X['State'] == 'MA') | (
            X['State'] == 'NY') | (X['State'] == 'WA')]
X_ESS = X[(X['State'] == 'CA') | (X['State'] == 'WA')]
X_INVH = X[
    (X['State'] == 'CA') | (X['State'] == 'CO') | (X['State'] == 'TX') | (X['State'] == 'NC') | (
            X['State'] == 'SC') | (X['State'] == 'GA') | (X['State'] == 'IL') | (
            X['State'] == 'FL') | (X['State'] == 'NV') | (X['State'] == 'MN') | (
            X['State'] == 'TN') | (X['State'] == 'AZ') | (X['State'] == 'WA')]

X_AVB = X_AVB.drop(columns=['State'])
X_EQR = X_EQR.drop(columns=['State'])
X_ESS = X_ESS.drop(columns=['State'])
X_INVH = X_INVH.drop(columns=['State'])

### Dummy Encoding

In [7]:
def _one_hot_regions(frame):
    """Replace 'region_name' with integer 0/1 dummy columns, dropping the first level."""
    return pd.get_dummies(frame, columns=['region_name'], drop_first=True, dtype=int)


# one hot encoding, applied separately to each REIT's frame
X_AVB = _one_hot_regions(X_AVB)
X_EQR = _one_hot_regions(X_EQR)
X_ESS = _one_hot_regions(X_ESS)
X_INVH = _one_hot_regions(X_INVH)

### Find Backtesting Period

In [8]:
def _holdout_tail(frame):
    """Return the test part of an unshuffled 80/20 split, i.e. the trailing rows of `frame`."""
    _, tail = train_test_split(frame, test_size=0.2, shuffle=False)
    return tail


# filter backtesting period of reits (last 20% of data);
# the split is by row position, so it relies on the rows already being in date order
X_test_AVB = _holdout_tail(X_AVB)
X_test_EQR = _holdout_tail(X_EQR)
X_test_ESS = _holdout_tail(X_ESS)
X_test_INVH = _holdout_tail(X_INVH)

In [9]:
# Distinct dates in the AVB hold-out slice, in order of first appearance
# (ascending as long as the frame is still sorted by date).
avb_test_dates = X_test_AVB['date'].unique()
# trim the first and the last date
backtest_dates = avb_test_dates[1:-1]
# report the resulting window and the number of dates it contains
first_date, last_date = backtest_dates[0], backtest_dates[-1]
print(f"Backtesting period: {first_date} to {last_date}")
print(len(backtest_dates))

Backtesting period: 2023-09-22 to 2024-09-06
50


In [10]:
# Output frame starts with one row per backtesting date; later cells merge columns onto it.
output = pd.DataFrame({'date': backtest_dates})

## Backtesting

### Load Models

In [11]:
# load models: one pickled estimator per REIT, keyed by ticker
# NOTE: pickle.load can execute arbitrary code — only load model files from a trusted source.
model_files = {
    'AVB': '../models/AVB_ab.pkl',
    'EQR': '../models/EQR_ada.pkl',
    'ESS': '../models/ESS_rf.pkl',
    'INVH': '../models/INVH_xgb.pkl',
}

models = {}
for reit, path in model_files.items():
    with open(path, 'rb') as f:
        models[reit] = pickle.load(f)

In [12]:
# Predict and aggregate for each REIT.
#
# For every REIT this cell:
#   1. predicts one value per row of that REIT's hold-out frame,
#   2. collapses the rows of each date into a single value, weighting by 'active_listings',
#   3. appends `<REIT>_adj_close`, `<REIT>_adj_close_1week` and `<REIT>_pred` to `output`.
#
# The cell is safe to re-run: `output` is reset to its 'date' column first, and a
# prediction column left over from a previous run is never passed to the model.

# Columns excluded from the model input: the date key and every REIT close column.
non_feature_cols = ['date', 'EQR_adj_close', 'ESS_adj_close', 'AVB_adj_close', 'INVH_adj_close',
                    'EQR_adj_close_1week', 'ESS_adj_close_1week', 'AVB_adj_close_1week',
                    'INVH_adj_close_1week']

# Hold-out frame (already filtered to the REIT's counties) for each ticker.
test_sets = {'AVB': X_test_AVB, 'EQR': X_test_EQR, 'ESS': X_test_ESS, 'INVH': X_test_INVH}

# Start from the bare date column so repeated runs do not stack `_x` / `_y` duplicates.
output = output[['date']]

for reit, model in models.items():
    print(f"Predicting {reit}...")

    X_test_reit = test_sets[reit]
    pred_col = f"{reit}_pred"
    close_cols = [f'{reit}_adj_close', f'{reit}_adj_close_1week']

    # Predict for each county. A stale prediction column from an earlier run is removed
    # first; otherwise it would be handed to the model as an extra feature.
    features = (
        X_test_reit
        .drop(columns=[pred_col], errors='ignore')
        .drop(columns=non_feature_cols)
    )
    # Kept on the hold-out frame (as before) so row-level predictions remain inspectable.
    X_test_reit[pred_col] = model.predict(features)

    # Weighted average of predictions by 'active_listings', one value per date
    weekly_predictions = (
        X_test_reit.groupby("date")
        .apply(
            lambda week: np.average(week[pred_col], weights=week["active_listings"]),
            include_groups=False
        )
        .rename(pred_col)
        .reset_index()
    )

    # Actual closing prices for comparison. Only the two needed columns are averaged
    # instead of taking the mean of every (dummy) column and then picking one.
    actual_closes = X_test_reit.groupby('date')[close_cols].mean().reset_index()

    # Left joins keep every backtesting date; dates absent from this REIT's
    # hold-out frame end up as NaN.
    output = (
        output
        .merge(actual_closes, on='date', how='left')
        .merge(weekly_predictions, on='date', how='left')
    )

Predicting AVB...
Predicting EQR...
Predicting ESS...
Predicting INVH...


In [13]:
# Write the combined actual / predicted series to disk, without the row index
predictions_path = "../data/REIT_predictions.csv"
output.to_csv(predictions_path, index=False)

print("Predictions saved to 'REIT_predictions.csv'")

Predictions saved to 'REIT_predictions.csv'
