In [12]:
# Importing required libraries
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import yfinance as yf
import pyfredapi as pf
import functions as fun
import importlib as imp
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

# Setting a path to the directory
directory = ''

# Reading the FRED API key and the preprocessing parameters from config.cfg
from configparser import ConfigParser
config = ConfigParser()
# ConfigParser.read() silently skips missing files, so a wrong path is reported explicitly
if not config.read(directory + 'config.cfg'):
    raise FileNotFoundError(f"Configuration file not found: {directory + 'config.cfg'}")
api_key = config.get('fred', 'api_key')
# `lags` is stored in the config as a JSON value (it is iterated over and passed to shift() below)
lags = json.loads(config.get('params', 'lags'))
# getboolean() parses 'true'/'false', 'yes'/'no', 'on'/'off' and '1'/'0' (case-insensitive).
# bool() on the raw string was True for every non-empty value, including 'False'.
log = config.getboolean('params', 'log')
st_sc = config.getboolean('params', 'st_sc')

In [13]:
# Re-executing the local `functions` module (imported as `fun`) so that edits to it
# are picked up without restarting the kernel
imp.reload(fun)

<module 'functions' from 'c:\\Users\\user\\OneDrive\\PD\\Диплом М\\real_estate_price_prediction\\functions.py'>

### Target preparation

In [36]:
# https://haus.com/resources/the-common-haus-price-index  ->  Download full data series

# Loading the raw CHPI file; 'msaname' is split once on ', ' into City and StatArea,
# 'date' is parsed to datetime and the 'CHPI' column becomes 'target'
target = pd.read_csv(directory + 'Data/metro_chpi.csv')
name_parts = target['msaname'].str.split(', ', n = 1, expand = True)
target[['City', 'StatArea']] = name_parts
target['date'] = pd.to_datetime(target['date'])
target = target.rename(columns = {'CHPI': 'target'})

# Creating the statarea dictionary: one row per (msa, msaname) pair together with the
# first date, last date and number of dated entries of that msa in the raw target data
msas = target.drop_duplicates(['msa', 'msaname'])[['msa', 'msaname', 'City', 'StatArea']].copy()
# A single grouped aggregation replaces three full scans of `target` per msa
date_stats = target.groupby('msa')['date'].agg(['min', 'max', 'count'])
msas['MinDate'] = msas['msa'].map(date_stats['min'])
msas['MaxDate'] = msas['msa'].map(date_stats['max'])
msas['NumberOfEntries'] = msas['msa'].map(date_stats['count'])
msas.to_parquet(directory + 'Data/statareas.parquet', index = False)

# Building the target dataset msa by msa: week-over-week ratio, anomaly cut-off,
# lagged log dynamics and future target values; the result is saved to parquet
target_parts = []
for msa in tqdm(msas['msa'].unique()):
    # .loc returns an independent frame, so the column assignments below never touch `target`;
    # shift() is positional, hence the explicit chronological order
    target_cl_msa = target.loc[target['msa'] == msa, ['date', 'msa', 'target']].sort_values('date')
    target_cl_msa['dyn'] = target_cl_msa['target'] / target_cl_msa['target'].shift(1)
    # A row is anomalous when the ratio to the previous row leaves [0.9, 1.1]; NaN ratios are not flagged
    target_cl_msa['anom'] = ((target_cl_msa['dyn'] < 0.9) | (target_cl_msa['dyn'] > 1.1)).astype(int)
    # `1 in series` checks the index labels rather than the values, so the cut-off
    # has to be driven by an explicit test on the values
    is_anomaly = target_cl_msa['anom'] == 1
    if is_anomaly.any():
        # Keeping only the history that follows the most recent anomaly
        last_anomaly_date = target_cl_msa.loc[is_anomaly, 'date'].max()
        target_cl_msa = target_cl_msa[target_cl_msa['date'] > last_anomaly_date].copy()
    for lag in lags:
        target_cl_msa[f'target_{lag}_week_logdyn'] = np.log(target_cl_msa['target'] / target_cl_msa['target'].shift(lag))
        target_cl_msa[f'target_{lag}_week_fut'] = target_cl_msa['target'].shift(-lag)
    target_parts.append(target_cl_msa)
# One concat at the end instead of re-copying the accumulated frame on every iteration
target_cl = pd.concat(target_parts) if target_parts else pd.DataFrame()
# Rows need at least (number of columns - len(lags) + 1) non-null values to be kept;
# the column count still includes the helper columns 'dyn' and 'anom' at this point
target_cl = target_cl.dropna(thresh = len(target_cl.columns) - len(lags) + 1)
target_cl = target_cl.drop(columns = ['dyn', 'anom'])
target_cl.to_parquet(directory + 'Data/target.parquet', index = False)
target_cl

100%|██████████| 100/100 [00:00<00:00, 151.57it/s]


Unnamed: 0,date,msa,target,target_4_week_logdyn,target_4_week_fut,target_13_week_logdyn,target_13_week_fut,target_26_week_logdyn,target_26_week_fut
313,2010-02-19,10740,176278.88,-0.000794,185638.86,,186543.42,,173994.03
392,2010-02-26,10740,174968.95,0.004701,177941.73,,185887.05,,172995.00
469,2010-03-05,10740,176013.13,0.010878,177228.06,,185373.48,,172917.64
547,2010-03-12,10740,178143.97,0.009791,179397.91,,184779.03,,172888.78
626,2010-03-19,10740,185638.86,0.051736,181148.92,,199206.63,,172072.75
...,...,...,...,...,...,...,...,...,...
66165,2023-01-27,41700,283514.75,-0.020169,287313.13,-0.007741,,-0.048545,
66265,2023-02-03,41700,283894.34,-0.018878,289456.66,-0.005988,,-0.042751,
66365,2023-02-10,41700,283960.13,-0.018610,289882.09,-0.005581,,-0.035205,
66465,2023-02-17,41700,286668.03,-0.004662,291308.75,0.002661,,-0.025309,


### Macro data preparation

In [33]:
# Column name -> series id for everything downloaded from FRED
ids_FRED = {
    'KeyRate': 'DFF',
    'CPI': 'CPIAUCSL',
    'VIX': 'VIXCLS',
    'PPI': 'PCU44414441',
    'MortgageRate30': 'MORTGAGE30US',
    'Electricity': 'CUSR0000SEHF01',
    'Water': 'CUSR0000SEHG',
    'Plywood': 'WPU083',
    'Steel': 'WPU101',
    'Glass': 'PCU3272132721',
    'Concrete': 'PCU32733273',
    'Unemployment': 'UNRATE',
    'Yield10Y': 'DGS10',
    'Case-Shiller': 'SPCS20RSA',
}
# Column name -> ticker for everything downloaded from Yahoo Finance
ids_YFINANCE = {
    'DJI': '^DJI',
    'S&P500': '^GSPC',
}
# Column names in dictionary order; the download loops iterate over these lists
FRED_keys = list(ids_FRED)
YFINANCE_keys = list(ids_YFINANCE)

# Downloading every FRED series and outer-joining them on 'date', one column per key
FRED = pd.DataFrame(columns = ['date'])
for key in FRED_keys:
    FRED_key = (
        pf.get_series(series_id = ids_FRED[key], api_key = api_key)[['date', 'value']]
        .rename(columns = {'value': key})
    )
    FRED = FRED.merge(FRED_key, on = 'date', how = 'outer')
# Chronological order, history before 2003-01-01 discarded
FRED = FRED.sort_values('date', ascending = True)
FRED = FRED[FRED['date'] >= pd.Timestamp('2003-01-01')]

# Downloading the daily history of every Yahoo Finance ticker (maximum available period)
# and outer-joining the 'Close' values on 'date', one column per key
YFINANCE = pd.DataFrame(columns = ['date'])
for key in YFINANCE_keys:
    YFINANCE_key = yf.download(tickers = ids_YFINANCE[key], period = "max", interval = "1d", group_by = 'ticker', auto_adjust = True, progress = False, threads = True)
    YFINANCE_key = (
        YFINANCE_key.reset_index()[['Date', 'Close']]
        .rename(columns = {'Date': 'date', 'Close': key})
    )
    YFINANCE = YFINANCE.merge(YFINANCE_key, on = 'date', how = 'outer')
# Chronological order, history before 2003-01-01 discarded
YFINANCE = YFINANCE.sort_values('date', ascending = True)
YFINANCE = YFINANCE[YFINANCE['date'] >= pd.Timestamp('2003-01-01')]

# Merging all dataframes in one, interpolating nans and saving to parquet
# Linear interpolation fills gaps between neighbouring rows, so the rows are put in
# chronological order before interpolating rather than after; the row order of an
# outer merge is not relied upon
macro = FRED.merge(YFINANCE, on = 'date', how = 'outer').sort_values('date', ascending = True)
value_cols = [col for col in macro.columns if col != 'date']
# limit_area = 'inside' fills only gaps surrounded by valid values; leading and trailing
# NaNs stay in place and those rows are removed by dropna() below.
# NOTE(review): values between two observations are derived from the later observation
# as well — confirm this is acceptable for the downstream forecasting setup.
macro[value_cols] = macro[value_cols].interpolate(limit_direction = 'forward', limit_area = 'inside', axis = 0)
macro = macro.dropna()
macro.to_parquet(directory + 'Data/macro.parquet', index = False)
macro

Unnamed: 0,date,KeyRate,CPI,VIX,PPI,MortgageRate30,Electricity,Water,Plywood,Steel,Glass,Concrete,Unemployment,Yield10Y,Case-Shiller,DJI,S&P500
334,2003-12-01,1.03,185.500000,16.7700,100.000000,5.945714,140.500000,120.000000,192.100000,128.400000,100.000000,100.000000,5.7,4.4000,150.758971,9899.049805,1070.119995
335,2003-12-02,0.97,185.525806,16.2700,100.080645,5.964286,140.500000,120.019355,191.529032,128.580645,100.006452,100.029032,5.7,4.3800,150.819168,9853.639648,1066.619995
336,2003-12-03,0.98,185.551613,16.6300,100.161290,5.982857,140.500000,120.038710,190.958065,128.761290,100.012903,100.058065,5.7,4.4100,150.879366,9873.419922,1064.729980
337,2003-12-04,0.99,185.577419,16.3000,100.241935,6.001429,140.500000,120.058065,190.387097,128.941935,100.019355,100.087097,5.7,4.3800,150.939563,9930.820312,1069.719971
338,2003-12-05,0.98,185.603226,17.0900,100.322581,6.020000,140.500000,120.077419,189.816129,129.122581,100.025806,100.116129,5.7,4.2300,150.999760,9862.679688,1061.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7666,2023-12-28,5.33,309.563323,12.4700,205.678129,6.610000,276.524097,296.279161,284.229452,335.777839,168.484032,243.107774,3.7,3.8400,322.164732,37710.101562,4783.350098
7667,2023-12-29,5.33,309.593742,12.4500,205.656097,6.611429,276.631323,296.382871,284.259839,336.142129,168.525774,243.268581,3.7,3.8800,322.179396,37689.539062,4769.830078
7668,2023-12-30,5.33,309.624161,12.6375,205.634065,6.612857,276.738548,296.486581,284.290226,336.506419,168.567516,243.429387,3.7,3.8975,322.194059,37695.914062,4763.080078
7669,2023-12-31,5.33,309.654581,12.8250,205.612032,6.614286,276.845774,296.590290,284.320613,336.870710,168.609258,243.590194,3.7,3.9150,322.208722,37702.289062,4756.330078


### Final datasets creation

In [19]:
# Loading the prepared target and macro datasets
target = pd.read_parquet(directory + 'Data/target.parquet')
macro = pd.read_parquet(directory + 'Data/macro.parquet')

# The Case-Shiller column, multiplied by 1000, becomes the target of a separate dataset
# and is removed from the macro features
target_2008 = (
    macro[['date', 'Case-Shiller']]
    .rename(columns = {'Case-Shiller': 'target'})
    .assign(target = lambda frame: frame['target'] * 1000)
)
macro = macro.drop(columns = ['Case-Shiller'])

# Macro features restricted to the dates that occur in the target dataset
target_dates = pd.DataFrame(target['date'].unique(), columns = ['date'])
macro_dyn = target_dates.merge(macro, on = 'date', how = 'inner')

# Filtering weekly dates for 2008 dataset
# The weekly grid is anchored on the latest date of macro_dyn and spans 1499 weeks in
# both directions. Offset 0 is included so that the anchor date itself is kept: leaving
# it out removes one week from the series and puts a two-week gap between its
# neighbours, which breaks the "one row = one week" assumption of the shift-based lags.
start_date = macro_dyn['date'].max()
CS_dates = [start_date + pd.Timedelta(weeks = i) for i in range(-1499, 1500)]
target_2008 = target_2008[target_2008['date'].isin(CS_dates)]
macro_dyn_2008 = target_2008.merge(macro, on = 'date', how = 'inner')

# Calculating log dynamics of the macro data with different lags
# shift() moves values by row position, so both frames are put in chronological order
# before any lag is computed. macro_dyn is built from the dates in the order in which
# they first appear in the target data, which is not guaranteed to be chronological.
macro_dyn = macro_dyn.sort_values('date', ascending = True)
macro_dyn_2008 = macro_dyn_2008.sort_values('date', ascending = True)

# Every column except the first one ('date') gets a log-dynamics column per lag
cols = macro_dyn.columns[1:].copy()
cols_2008 = macro_dyn_2008.columns[1:].copy()
for lag in lags:
    for col in cols:
        macro_dyn[f'{col}_{lag}_week_logdyn'] = np.log(macro_dyn[col] / macro_dyn[col].shift(lag))
    for col in cols_2008:
        macro_dyn_2008[f'{col}_{lag}_week_logdyn'] = np.log(macro_dyn_2008[col] / macro_dyn_2008[col].shift(lag))
    # Future value of the 2008 target, `lag` rows ahead
    macro_dyn_2008[f'target_{lag}_week_fut'] = macro_dyn_2008['target'].shift(-lag)

# Rows need at least (number of columns - len(lags) + 1) non-null values to be kept
macro_dyn = macro_dyn.dropna(thresh = len(macro_dyn.columns) - len(lags) + 1)
macro_dyn_2008 = macro_dyn_2008.dropna(thresh = len(macro_dyn_2008.columns) - len(lags) + 1)
macro_dyn_2008 = macro_dyn_2008.set_index('date', drop = True)
macro_dyn_2008.to_parquet(directory + 'Data_for_models/final_CS.parquet')

# Joining the per-msa targets with the macro features on 'date', in chronological order
final_full = target.merge(macro_dyn, on = 'date', how = 'inner').sort_values('date', ascending = True)

# EDA dataset: every column that is neither 'msa' nor target-related,
# reduced to the first row of each date
eda_cols = [col for col in final_full.columns if not ('target' in col or col == 'msa')]
final_eda = final_full[eda_cols].drop_duplicates(subset = 'date', keep = 'first')
final_eda.to_parquet(directory + 'Data_for_models/final_eda.parquet', index = False)

# Indexing by (date, msa), then optionally log-transforming the targets and
# standardising every remaining column
final_full = final_full.set_index(['date', 'msa'], drop = True)

if log:
    # Natural log of the future targets and of the current target
    log_cols = [name for name in final_full.columns if '_week_fut' in name]
    log_cols.append('target')
    for name in log_cols:
        final_full[name] = np.log(final_full[name])

if st_sc:
    # Each non-target column is standardised on its own with a freshly fitted scaler
    st_sc_cols = [name for name in final_full.columns if not ('_week_fut' in name or name == 'target')]
    for name in st_sc_cols:
        column_values = np.asarray(final_full[name]).reshape(-1, 1)
        final_full[name] = StandardScaler().fit_transform(column_values)

# Saving the final dataset
final_full.to_parquet(directory + 'Data_for_models/final_full.parquet')
final_full

Unnamed: 0_level_0,Unnamed: 1_level_0,target,target_4_week_logdyn,target_4_week_fut,target_13_week_logdyn,target_13_week_fut,target_26_week_logdyn,target_26_week_fut,KeyRate,CPI,VIX,...,Electricity_26_week_logdyn,Water_26_week_logdyn,Plywood_26_week_logdyn,Steel_26_week_logdyn,Glass_26_week_logdyn,Concrete_26_week_logdyn,Unemployment_26_week_logdyn,Yield10Y_26_week_logdyn,DJI_26_week_logdyn,S&P500_26_week_logdyn
date,msa,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010-08-20,10740,12.066776,-0.089359,12.055673,-0.288174,12.035499,-0.115910,11.949718,-0.534456,-1.512318,1.029194,...,-0.274348,1.002680,0.030556,0.101659,-0.991492,-1.520634,0.007936,-1.089438,-0.781337,-0.986385
2010-08-20,45300,11.796558,-0.031438,11.759653,-0.174514,11.812320,-0.272759,11.778194,-0.534456,-1.512318,1.029194,...,-0.274348,1.002680,0.030556,0.101659,-0.991492,-1.520634,0.007936,-1.089438,-0.781337,-0.986385
2010-08-20,45104,12.277917,-0.137031,12.264042,-0.120749,12.216511,-0.143138,12.144896,-0.534456,-1.512318,1.029194,...,-0.274348,1.002680,0.030556,0.101659,-0.991492,-1.520634,0.007936,-1.089438,-0.781337,-0.986385
2010-08-20,43524,12.750246,-0.007326,12.738260,-0.076108,12.826968,0.062541,12.661470,-0.534456,-1.512318,1.029194,...,-0.274348,1.002680,0.030556,0.101659,-0.991492,-1.520634,0.007936,-1.089438,-0.781337,-0.986385
2010-08-20,42644,12.615869,-0.079345,12.600277,-0.568073,12.553511,-0.485546,12.472153,-0.534456,-1.512318,1.029194,...,-0.274348,1.002680,0.030556,0.101659,-0.991492,-1.520634,0.007936,-1.089438,-0.781337,-0.986385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-24,28940,12.659915,-0.002408,12.674565,0.093557,,0.043758,,3.955267,2.656706,0.489101,...,0.481604,1.430438,-1.053942,-0.532474,3.146075,1.862004,0.153935,0.751682,-0.351973,-0.838725
2023-02-24,28140,12.546964,0.063136,12.574778,0.121979,,0.132220,,3.955267,2.656706,0.489101,...,0.481604,1.430438,-1.053942,-0.532474,3.146075,1.862004,0.153935,0.751682,-0.351973,-0.838725
2023-02-24,27260,12.694879,0.008202,12.696138,0.033015,,-0.127806,,3.955267,2.656706,0.489101,...,0.481604,1.430438,-1.053942,-0.532474,3.146075,1.862004,0.153935,0.751682,-0.351973,-0.838725
2023-02-24,33340,12.638028,-0.024842,12.619926,0.072534,,-0.011351,,3.955267,2.656706,0.489101,...,0.481604,1.430438,-1.053942,-0.532474,3.146075,1.862004,0.153935,0.751682,-0.351973,-0.838725


In [18]:
# Update statareas dictionary based on the final data
# The path is built from `directory`, like every other read and write in this notebook
statareas_path = directory + 'Data/statareas.parquet'
msas = pd.read_parquet(statareas_path)

# First date and number of rows of every msa present in final_full, computed in one pass
# over its (date, msa) index instead of filtering the whole frame once per msa
final_index = final_full.index.to_frame(index = False)
final_stats = final_index.groupby('msa')['date'].agg(['min', 'size'])
msas['FinalMinDate'] = msas['msa'].map(final_stats['min'])
# An msa without any row in final_full gets 0 entries (and a missing FinalMinDate)
msas['FinalNumberOfEntries'] = msas['msa'].map(final_stats['size']).fillna(0).astype(int)
msas.to_parquet(statareas_path)
msas

Unnamed: 0,msa,msaname,City,StatArea,MinDate,MaxDate,NumberOfEntries,FinalMinDate,FinalNumberOfEntries
0,10740,"Albuquerque, NM",Albuquerque,NM,2010-01-22,2023-03-24,688,2010-08-20,654
1,10900,"Allentown-Bethlehem-Easton, PA-NJ",Allentown-Bethlehem-Easton,PA-NJ,2010-01-22,2023-03-24,688,NaT,0
2,11244,"Anaheim-Santa Ana-Irvine, CA",Anaheim-Santa Ana-Irvine,CA,2010-01-22,2023-03-24,688,2010-08-20,654
3,12420,"Austin-Round Rock, TX",Austin-Round Rock,TX,2010-01-22,2023-03-24,688,2010-08-20,654
4,12580,"Baltimore-Columbia-Towson, MD",Baltimore-Columbia-Towson,MD,2010-01-22,2023-03-24,688,2010-08-20,654
...,...,...,...,...,...,...,...,...,...
95,12940,"Baton Rouge, LA",Baton Rouge,LA,2010-09-17,2023-03-24,654,2010-10-15,646
96,49660,"Youngstown-Warren-Boardman, OH-PA",Youngstown-Warren-Boardman,OH-PA,2010-09-17,2023-03-24,654,2010-10-15,646
97,10420,"Akron, OH",Akron,OH,2011-01-07,2023-03-24,638,2011-02-04,630
98,13820,"Birmingham-Hoover, AL",Birmingham-Hoover,AL,2012-02-10,2023-03-24,581,2012-03-09,573
