## DC House Price

#### Project goal: 

- Predict the median house price in DC
- use the right housing statistics and macro variables: (1) time series, (2) acf/pacf plots, (3) stationary test and (4) autocorrelation test
- Modeling: time series linear regression: feature selection in R-squared, AIC, etc.
- Cross validation: Out-of-time performance
- Residual analysis: (1) normality test, (2) Q-Q plot, (3) MAE, (4) MAPE, (5) Homoscedastic test, (6) stationary test and (7) autocorrelation test 

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# tests for stationary 
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss

import statsmodels.api as sm

#tests for autocorrelation
from statsmodels.stats.diagnostic import acorr_breusch_godfrey
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import acorr_ljungbox

#test of Heteroscedasticity
from statsmodels.stats.diagnostic import het_breuschpagan

#test for cointegration
from statsmodels.tsa.vector_ar.vecm import coint_johansen 

In [56]:
df = pd.read_csv("../data/full_data.csv")

In [57]:
df_test = df.drop(columns=['Time', 'DATE', 'Median Sale Price MoM', 'New Listings MoM', 'Homes Sold MoM', 'Inventory MoM', 'Days on Market MoM', 'Average Sale To List MoM', 'DC_PER'])
col_name = df_test.columns

In [58]:
# create a log transformed variable
for i in range(len(col_name)):
    df[col_name[i] + '_log'] =  np.log(df[col_name[i]])

In [59]:
df_test = df.drop(columns=['Time', 'DATE'])
col_name = df_test.columns
col_name

Index(['Median Sale Price', 'Median Sale Price MoM', 'Homes Sold',
       'Homes Sold MoM', 'New Listings', 'New Listings MoM', 'Inventory',
       'Inventory MoM', 'Days on Market', 'Days on Market MoM',
       'Average Sale To List%', 'Average Sale To List MoM', 'WDXRSA', 'US_UR',
       'DMV_UR', 'DC_UR', 'FED_EM', 'FIN_EM', 'HOS_EM', 'DC_PER', 'DMV_PER',
       'DMV_PER_1', 'Median Sale Price_log', 'Homes Sold_log',
       'New Listings_log', 'Inventory_log', 'Days on Market_log',
       'Average Sale To List%_log', 'WDXRSA_log', 'US_UR_log', 'DMV_UR_log',
       'DC_UR_log', 'FED_EM_log', 'FIN_EM_log', 'HOS_EM_log', 'DMV_PER_log',
       'DMV_PER_1_log'],
      dtype='object')

In [None]:
df.isnull().sum()

In [None]:
df.isna().sum()

In [43]:
adfuller(df[col_name[36]], regression = 'ct')

(-2.3698560420501322,
 0.3957621546615447,
 5,
 111,
 {'1%': -4.042734832692444,
  '5%': -3.4508097122615675,
  '10%': -3.1506786195618557},
 28.131971163658108)

#### Only focus on KPSS and DW test

In [61]:
#statistic, p-value for adfuller
p_val_adf = []

# statistic and p-value for kpss
p_val_kpss = []

# statistic and p-value for acorr_breusch_godfrey 
p_val_bg = []

# statistic and p-value for durbin_watson
p_val_dw = []

for i in range(len(col_name)):
    temp = adfuller(df[col_name[i]], regression = 'ct')
    p_val_adf.append(temp[1])
    
    temp = kpss(df[col_name[i]], regression = 'ct')
    p_val_kpss.append(temp[1])
    
    temp = durbin_watson(df[col_name[i]])
    p_val_dw.append(temp)

test_matrix = pd.DataFrame(list(zip(col_name, p_val_adf, p_val_kpss, p_val_dw)), 
               columns =['Features','p_adftest', 'p_kpsstest','p_dwtest'])
    



In [62]:
test_matrix

Unnamed: 0,Features,p_adftest,p_kpsstest,p_dwtest
0,Median Sale Price,0.001852913,0.048158,0.001460217
1,Median Sale Price MoM,4.77449e-07,0.1,2.351027
2,Homes Sold,0.9371101,0.1,0.02136036
3,Homes Sold MoM,0.07690392,0.1,2.040086
4,New Listings,0.9164763,0.1,0.04232778
5,New Listings MoM,0.4836519,0.1,1.955755
6,Inventory,0.01227744,0.085247,0.00510529
7,Inventory MoM,0.9186213,0.1,1.057099
8,Days on Market,0.04751578,0.1,0.04371311
9,Days on Market MoM,0.2668512,0.1,1.471667


In [63]:
# Create a function to detect the hypothesis testing results. 1: reject null, 0: fail to reject null
def test_p_value(p = .05, name = 'ttest'):
    name_list = []
    for i in range(len(test_matrix)):
        if test_matrix["p_" + name][i] < p:
            name_list.append(1) 
        else:
            name_list.append(0)   
    test_matrix['index_' + name] = name_list

In [64]:
test_p_value(p = .05, name = 'adftest')
test_p_value(p = .05, name = 'kpsstest')
test_p_value(p = .05, name = 'dwtest')
test_matrix

Unnamed: 0,Features,p_adftest,p_kpsstest,p_dwtest,index_adftest,index_kpsstest,index_dwtest
0,Median Sale Price,0.001852913,0.048158,0.001460217,1,1,1
1,Median Sale Price MoM,4.77449e-07,0.1,2.351027,1,0,0
2,Homes Sold,0.9371101,0.1,0.02136036,0,0,1
3,Homes Sold MoM,0.07690392,0.1,2.040086,0,0,0
4,New Listings,0.9164763,0.1,0.04232778,0,0,1
5,New Listings MoM,0.4836519,0.1,1.955755,0,0,0
6,Inventory,0.01227744,0.085247,0.00510529,1,0,1
7,Inventory MoM,0.9186213,0.1,1.057099,0,0,0
8,Days on Market,0.04751578,0.1,0.04371311,1,0,1
9,Days on Market MoM,0.2668512,0.1,1.471667,0,0,0


In [None]:
#ACF plot

sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40)
sm.graphics.tsa.plot_pacf(dta.values.squeeze(), lags=40)