In [485]:
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize
from statsmodels.formula.api import ols
from tabulate import tabulate
from linearmodels import PanelOLS

In [486]:
def load_material_price():
    df = pd.read_excel('data/Construction_material_market_prices(1999-2024).xlsx', 
                      sheet_name='Price')
    
    used_cols = []
    cols = list(df.columns)
    
    for col in cols:
        if 'Data Series' not in col:
            used_cols.append(col)
            
    df = df.melt(id_vars=['Data Series'],
    value_vars=used_cols,
    var_name='Time', value_name='Price')
    
    df['year'] = df['Time'].apply(lambda x: int(x.split()[0]))
    df = df.groupby(['Data Series', 'year']).Price.mean().reset_index()
    df = df.rename(columns={'Data Series': 'material'})
    
    df = pd.pivot_table(df, values = 'Price', index=['year'], columns = 'material').reset_index()
    
    df = df.rename(columns={'Cement In Bulk (Ordinary Portland Cement) (Dollar Per Tonne)': 'cement',
                           'Concreting Sand (Dollar Per Tonne)': 'concreting_sand',
                           'Granite (20mm Aggregate) (Dollar Per Tonne)': 'granite',
                           'Ready Mixed Concrete (Dollar Per Cubic Metre)': 'mixed_concrete',
                           'Steel Reinforcement Bars (16-32mm High Tensile) (Dollar Per Tonne)': 'steel'})
    return df

In [487]:
def load_hdb_resale_price():
    used_columns = ['year','street_name','town', 'post_covid', 'in_covid', 'flat_type', 'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price']
    df1 = pd.read_csv('data/ResaleFlatPricesBasedonRegistrationDateFromJan2015toDec2016.csv')
    df2 = pd.read_csv('data/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv')
    df = pd.concat([df1, df2])
    df['year'] = df['month'].apply(lambda x: int(x.split('-')[0]))
    df['month'] = df['month'].apply(lambda x: int(x.split('-')[-1]))
    df['flat_type'] = df['flat_type'].apply(lambda x: x.lower())
    df['town'] = df['town'].apply(lambda x: x.lower())
    df['post_covid'] = df.apply(lambda x: 1 if x['year']>=2021 and x['month']>11 else 0, axis=1)
    df['in_covid'] = df.apply(lambda x: 1 if (x['year']>=2020 and x['month']>4) and (x['year']<=2021 and x['month']<11) else 0, axis=1)
    return df[used_columns]

def load_hdb_rental_price():
    hdb_rental_price = pd.read_excel('data/HDB Rental Price (from 2007-3qtr to 2023-3qtr).xlsx')
    hdb_rental_price.columns = hdb_rental_price.iloc[0]
    hdb_rental_price = hdb_rental_price[1:]
    del hdb_rental_price[hdb_rental_price.columns[0]]
      
    hdb_rental_price = hdb_rental_price.melt(
    id_vars=['Town', 'Year '],
    value_vars=['1-Room', '2-Room', '3-Room', '4-Room','5-Room', 'Executive'],
    var_name='Room Type', value_name='Price')
    
    hdb_rental_price = hdb_rental_price[(hdb_rental_price['Price'] != '-') & (hdb_rental_price['Price'] != '*')]
    hdb_rental_price['Room Type'] = hdb_rental_price['Room Type'].apply(lambda x: x.replace('-', ' ').lower())
    hdb_rental_price = hdb_rental_price.rename(columns={'Year ': 'Year'})
    hdb_rental_price['Price'] = hdb_rental_price['Price'].apply(lambda x: int(x.replace('$', '').replace('.','').replace(',','')))
    hdb_rental_price['Town'] = hdb_rental_price['Town'].apply(lambda x: x.lower())
    
    df = hdb_rental_price.groupby(['Town','Year', 'Room Type']).mean()
    df = df.reset_index()
    
    return df[df['Year']>=2015]

def combine_rental_and_resale_price():
#   central defined by MRT station. If it could reach central area within 3 stops, we considered it central area
    central = ['bishan', 'bukit merah', 'bukit timah', 'geylang', 'kallang', 'marine parade', 'queenstown', 'toa payoh']   
    
    resale = load_hdb_resale_price()
    rental = load_hdb_rental_price()
    
    combined_df = resale.merge(rental, left_on=['year', 'town', 'flat_type'], right_on=['Year', 'Town', 'Room Type'])
    
    used_columns=list(resale.columns)
    used_columns.append('rental_price')
    combined_df = combined_df.rename(columns={'Price': 'rental_price'})
    combined_df = combined_df[used_columns]
    
    combined_df['central_marker'] = combined_df['town'].apply(lambda x: 1 if x in central else 0)
    return combined_df

In [488]:
res = ols('resale_price ~ in_covid + post_covid + floor_area_sqm + central_marker + flat_type + lease_commence_date', data=df).fit()
with open('resale_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

res = ols('rental_price ~ in_covid + post_covid + central_marker + flat_type', data=df).fit()
with open('rental_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           resale_price   R-squared:                       0.614
Model:                            OLS   Adj. R-squared:                  0.614
Method:                 Least Squares   F-statistic:                 3.406e+04
Date:                Fri, 19 Apr 2024   Prob (F-statistic):               0.00
Time:                        01:50:08   Log-Likelihood:            -2.4831e+06
No. Observations:              192523   AIC:                         4.966e+06
Df Residuals:                  192513   BIC:                         4.966e+06
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept              -9.15

In [489]:
res = ols('resale_price ~ in_covid + central_marker + in_covid*central_marker + lease_commence_date + flat_type + floor_area_sqm', data=df).fit()
with open('resale_did_during_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

res = ols('resale_price ~ post_covid + central_marker + post_covid*central_marker + lease_commence_date + flat_type + floor_area_sqm', data=df).fit()
with open('resale_did_post_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

res = ols('rental_price ~ in_covid + central_marker + in_covid*central_marker + flat_type', data=df).fit()
with open('rental_did_during_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

res = ols('rental_price ~ post_covid + central_marker + post_covid*central_marker + flat_type', data=df).fit()
with open('rental_did_post_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:           resale_price   R-squared:                       0.608
Model:                            OLS   Adj. R-squared:                  0.608
Method:                 Least Squares   F-statistic:                 3.321e+04
Date:                Fri, 19 Apr 2024   Prob (F-statistic):               0.00
Time:                        01:50:10   Log-Likelihood:            -2.4846e+06
No. Observations:              192523   AIC:                         4.969e+06
Df Residuals:                  192513   BIC:                         4.969e+06
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept               -9

In [490]:
time_fe_df = df.set_index(['street_name', 'year'])

# Regression
result = PanelOLS(time_fe_df.resale_price, time_fe_df[['in_covid', 'post_covid', 'central_marker', 'lease_commence_date', 'flat_type', 'floor_area_sqm']],
              time_effects=True,
              entity_effects=False
              ).fit()
with open('resale_time_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(result)

                          PanelOLS Estimation Summary                           
Dep. Variable:           resale_price   R-squared:                        0.6627
Estimator:                   PanelOLS   R-squared (Between):             -244.63
No. Observations:              192523   R-squared (Within):               0.6654
Date:                Fri, Apr 19 2024   R-squared (Overall):             -254.24
Time:                        01:50:15   Log-likelihood                -2.459e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   4.203e+04
Entities:                         503   P-value                           0.0000
Avg Obs:                       382.75   Distribution:                F(9,192505)
Min Obs:                       1.0000                                           
Max Obs:                       3086.0   F-statistic (robust):          4.203e+04
                            

In [491]:
import pandas as pd
from linearmodels import PanelOLS
import numpy as np

time_fe_df = df.set_index(['street_name', 'year'])
time_fe_df['post_covid_central_marker'] = time_fe_df['post_covid'] * time_fe_df['central_marker']
time_fe_df['in_covid_central_marker'] = time_fe_df['in_covid'] * time_fe_df['central_marker']

# Regression
result = PanelOLS(time_fe_df.resale_price, time_fe_df[['post_covid', 'post_covid_central_marker', 'lease_commence_date', 'central_marker', 'flat_type', 'floor_area_sqm']],
              time_effects=True,
              entity_effects=False).fit()
with open('resale_did_post_covid_time_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(result)

# Regression
result = PanelOLS(time_fe_df.resale_price, time_fe_df[['in_covid', 'in_covid_central_marker', 'lease_commence_date', 'central_marker', 'flat_type', 'floor_area_sqm']],
              time_effects=True,
              entity_effects=False).fit()
with open('resale_did_post_covid_time_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(result)

# Regression
result = PanelOLS(time_fe_df.rental_price, time_fe_df[['post_covid', 'post_covid_central_marker', 'lease_commence_date', 'central_marker', 'flat_type', 'floor_area_sqm']],
              time_effects=True,
              entity_effects=False).fit()
with open('rental_did_post_covid_time_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(result)

# Regression
result = PanelOLS(time_fe_df.rental_price, time_fe_df[['in_covid', 'in_covid_central_marker', 'lease_commence_date', 'central_marker', 'flat_type', 'floor_area_sqm']],
              time_effects=True,
              entity_effects=False).fit()
with open('rental_did_during_covid_time_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(result)

                          PanelOLS Estimation Summary                           
Dep. Variable:           resale_price   R-squared:                        0.6627
Estimator:                   PanelOLS   R-squared (Between):             -244.55
No. Observations:              192523   R-squared (Within):               0.6664
Date:                Fri, Apr 19 2024   R-squared (Overall):             -254.16
Time:                        01:50:16   Log-likelihood                -2.459e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   4.203e+04
Entities:                         503   P-value                           0.0000
Avg Obs:                       382.75   Distribution:                F(9,192505)
Min Obs:                       1.0000                                           
Max Obs:                       3086.0   F-statistic (robust):          4.203e+04
                            

                          PanelOLS Estimation Summary                           
Dep. Variable:           rental_price   R-squared:                        0.5936
Estimator:                   PanelOLS   R-squared (Between):             -1.3964
No. Observations:              192523   R-squared (Within):               0.1652
Date:                Fri, Apr 19 2024   R-squared (Overall):             -1.4236
Time:                        01:50:21   Log-likelihood                -1.287e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   3.124e+04
Entities:                         503   P-value                           0.0000
Avg Obs:                       382.75   Distribution:                F(9,192505)
Min Obs:                       1.0000                                           
Max Obs:                       3086.0   F-statistic (robust):          3.124e+04
                            