In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import requests
import time

In [3]:
import csv  # no longer used by this cell; kept in case cells outside this view rely on it

# Load the processed table; the first CSV column is used as the row index.
# The previous version also opened the same file with csv.DictReader, but the
# reader was never iterated and its file handle was closed on leaving the
# `with` block, so that step did nothing and has been dropped.
# The path is relative to the kernel's working directory.
dataset = pd.read_csv('../indice/ura_data_processed.csv', index_col=0)

In [4]:
# Collect column labels by dtype: object columns are only reported,
# boolean columns are recoded below.
object_cols = dataset.select_dtypes(include='object').columns
bool_cols = dataset.select_dtypes(include='bool').columns

# Recode True/False columns as integers 1/0 (mutates `dataset` in place).
dataset[bool_cols] = dataset[bool_cols].astype(int)

# Bare expression: not rendered, because only a cell's last expression is shown.
dataset.dtypes

# Displayed as the cell output.
object_cols

Index([], dtype='object')

In [5]:
# Pooled OLS over the whole sample: regress log_price_psf on every column
# except the Period_* indicator columns and the target itself, plus an intercept.
#
# The former `if __name__ == '__main__':` guard was removed: the per-period
# cell further down reads `y` unconditionally, so guarding its definition only
# made that dependency conditional.
y = dataset['log_price_psf']
X_columns = [
    c for c in dataset.columns
    if not c.startswith('Period_') and c != 'log_price_psf'
]
# add_constant prepends the 'const' column that supplies the intercept term.
X = sm.add_constant(dataset[X_columns])

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:          log_price_psf   R-squared:                       0.721
Model:                            OLS   Adj. R-squared:                  0.721
Method:                 Least Squares   F-statistic:                     7715.
Date:                Tue, 26 Mar 2024   Prob (F-statistic):               0.00
Time:                        16:44:04   Log-Likelihood:                 31412.
No. Observations:               92622   AIC:                        -6.276e+04
Df Residuals:                   92590   BIC:                        -6.246e+04
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 6.7135      0.01

In [6]:
# Hedonic regression per period: build the quarter labels ('2016Q1' ... '2021Q4')
# that the per-period fits iterate over.
period_list = [
    f'{yr}Q{q}'
    for yr in range(2016, 2022)
    for q in (1, 2, 3, 4)
]
print(period_list)

# Discard the final three quarters so the list ends at '2021Q1'.
# NOTE(review): presumably those quarters have no (or incomplete) data - confirm.
del period_list[-3:]
print(period_list)

['2016Q1', '2016Q2', '2016Q3', '2016Q4', '2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2', '2018Q3', '2018Q4', '2019Q1', '2019Q2', '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4', '2021Q1', '2021Q2', '2021Q3', '2021Q4']
['2016Q1', '2016Q2', '2016Q3', '2016Q4', '2017Q1', '2017Q2', '2017Q3', '2017Q4', '2018Q1', '2018Q2', '2018Q3', '2018Q4', '2019Q1', '2019Q2', '2019Q3', '2019Q4', '2020Q1', '2020Q2', '2020Q3', '2020Q4', '2021Q1']


In [26]:
# Fit one OLS per quarter in `period_list` and collect, for each fit, a single
# Series holding: the coefficients, their p-values (labels prefixed with
# 'pvalue_'), and the fit statistics rsquared / rsquared_adj / nobs.
# Each fitted result is also pickled to '<period>.pkl' in the working directory.

# Regressors: every column except the Period_* indicators and the target.
# This does not depend on the period, so it is built once outside the loop.
X_columns = [
    c for c in dataset.columns
    if not c.startswith('Period_') and c != 'log_price_psf'
]

results_list = []
for period in period_list:
    # Rows belonging to this quarter.
    period_filter = dataset['Period_%s' % period] == 1

    # Slice target and regressors straight from `dataset`, so this cell does
    # not rely on a `y` left behind by an earlier cell.
    y_target = dataset.loc[period_filter, 'log_price_psf']
    # NOTE(review): add_constant defaults to has_constant='skip'; if some
    # regressor is constant within a quarter's subset, no 'const' column is
    # added for that quarter - confirm this cannot happen with this data.
    X_target = sm.add_constant(dataset.loc[period_filter, X_columns])

    model = sm.OLS(y_target, X_target)
    results = model.fit()

    result_series = results.params
    pvalue_series = results.pvalues.add_prefix('pvalue_')

    # Fit-level statistics appended after the coefficients and p-values.
    additional_data = pd.Series(
        [results.rsquared, results.rsquared_adj, results.nobs],
        index=['rsquared', 'rsquared_adj', 'nobs']
    )
    result_series = pd.concat([result_series, pvalue_series, additional_data])
    results_list.append(result_series)
    results.save("%s.pkl" % period)

In [30]:
# Print the installed pandas version, then dump the collected per-period
# Series only when that version is exactly '2.2.1'; any other version prints
# nothing further.
pandas_version = pd.__version__
print(pandas_version)
if pandas_version == '2.2.1':
    print(results_list)

2.2.1
[const                        6.605541e+00
log_area_sqft               -1.642629e-01
Type_Condominium             5.920770e-02
Building_age                -1.279582e-03
Relative_tenure              1.913557e+00
                                ...      
pvalue_SaleType_Resale      2.954825e-124
pvalue_SaleType_Sub Sale     2.699910e-03
rsquared                     8.427857e-01
rsquared_adj                 8.400477e-01
nobs                         1.812000e+03
Length: 67, dtype: float64, const                        6.649431e+00
log_area_sqft               -1.672939e-01
Type_Condominium             7.904737e-02
Building_age                -1.423064e-03
Relative_tenure              1.867723e+00
                                ...      
pvalue_SaleType_Resale      2.878147e-173
pvalue_SaleType_Sub Sale     2.227536e-07
rsquared                     7.842218e-01
rsquared_adj                 7.825795e-01
nobs                         4.105000e+03
Length: 67, dtype: float64, const        

In [31]:
# Place the per-period Series side by side (one column per Series, rows aligned
# on their labels), then cast every column to the generic object dtype.
results_df = pd.concat(results_list, axis='columns').astype(object)

In [32]:
# Label the columns with the entries of `period_list`, then write the table to
# 'per_period.csv' in the working directory and confirm on stdout.
results_df = results_df.set_axis(period_list, axis=1)
results_df.to_csv('per_period.csv')
print('>>> SAVED TO CSV FORMATTING')

>>> SAVED TO CSV FORMATTING
