In [505]:
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize
from statsmodels.formula.api import ols
from tabulate import tabulate
from linearmodels import PanelOLS

In [506]:
def load_material_price():
    df = pd.read_excel('data/Construction_material_market_prices(1999-2024).xlsx', 
                      sheet_name='Price')
    
    used_cols = []
    cols = list(df.columns)
    
    for col in cols:
        if 'Data Series' not in col:
            used_cols.append(col)
            
    df = df.melt(id_vars=['Data Series'],
    value_vars=used_cols,
    var_name='Time', value_name='Price')
    
    df['year'] = df['Time'].apply(lambda x: int(x.split()[0]))
    df = df.groupby(['Data Series', 'year']).Price.mean().reset_index()
    df = df.rename(columns={'Data Series': 'material'})
    
    df = pd.pivot_table(df, values = 'Price', index=['year'], columns = 'material').reset_index()
    
    df = df.rename(columns={'Cement In Bulk (Ordinary Portland Cement) (Dollar Per Tonne)': 'cement',
                           'Concreting Sand (Dollar Per Tonne)': 'concreting_sand',
                           'Granite (20mm Aggregate) (Dollar Per Tonne)': 'granite',
                           'Ready Mixed Concrete (Dollar Per Cubic Metre)': 'mixed_concrete',
                           'Steel Reinforcement Bars (16-32mm High Tensile) (Dollar Per Tonne)': 'steel'})
    return df

In [523]:
def load_hdb_resale_price():
    used_columns = ['year','street_name','town', 'post_covid', 'in_covid', 'flat_type', 'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price']
    df1 = pd.read_csv('data/ResaleFlatPricesBasedonRegistrationDateFromJan2015toDec2016.csv')
    df2 = pd.read_csv('data/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv')
    df = pd.concat([df1, df2])
    df['year'] = df['month'].apply(lambda x: int(x.split('-')[0]))
    df['month'] = df['month'].apply(lambda x: int(x.split('-')[-1]))
    df['flat_type'] = df['flat_type'].apply(lambda x: x.lower())
    df['town'] = df['town'].apply(lambda x: x.lower())
    df['post_covid'] = df.apply(lambda x: 1 if (x['year']>=2021 and x['month']>=8) or (x['year']>=2022) else 0, axis=1)
    df['in_covid'] = df.apply(lambda x: 1 if (x['year']>=2020 and x['month']>=4) and (x['year']<=2021 and x['month']<8) else 0, axis=1)
    return df[used_columns]

def load_hdb_rental_price():
    hdb_rental_price = pd.read_excel('data/HDB Rental Price (from 2007-3qtr to 2023-3qtr).xlsx')
    hdb_rental_price.columns = hdb_rental_price.iloc[0]
    hdb_rental_price = hdb_rental_price[1:]
    del hdb_rental_price[hdb_rental_price.columns[0]]
      
    hdb_rental_price = hdb_rental_price.melt(
    id_vars=['Town', 'Year '],
    value_vars=['1-Room', '2-Room', '3-Room', '4-Room','5-Room', 'Executive'],
    var_name='Room Type', value_name='Price')
    
    hdb_rental_price = hdb_rental_price[(hdb_rental_price['Price'] != '-') & (hdb_rental_price['Price'] != '*')]
    hdb_rental_price['Room Type'] = hdb_rental_price['Room Type'].apply(lambda x: x.replace('-', ' ').lower())
    hdb_rental_price = hdb_rental_price.rename(columns={'Year ': 'Year'})
    hdb_rental_price['Price'] = hdb_rental_price['Price'].apply(lambda x: int(x.replace('$', '').replace('.','').replace(',','')))
    hdb_rental_price['Town'] = hdb_rental_price['Town'].apply(lambda x: x.lower())
    
    df = hdb_rental_price.groupby(['Town','Year', 'Room Type']).mean()
    df = df.reset_index()
    
    return df[df['Year']>=2015]

def combine_rental_and_resale_price():
#   central defined by MRT station. If it could reach central area within 3 stops, we considered it central area
    central = ['bishan', 'bukit merah', 'bukit timah', 'geylang', 'kallang', 'marine parade', 'queenstown', 'toa payoh']   
    
    resale = load_hdb_resale_price()
    rental = load_hdb_rental_price()
    
    combined_df = resale.merge(rental, left_on=['year', 'town', 'flat_type'], right_on=['Year', 'Town', 'Room Type'])
    
    used_columns=list(resale.columns)
    used_columns.append('rental_price')
    combined_df = combined_df.rename(columns={'Price': 'rental_price'})
    combined_df = combined_df[used_columns]
    
    combined_df['central_marker'] = combined_df['town'].apply(lambda x: 1 if x in central else 0)
    return combined_df

In [524]:
hdb_resale_price = load_hdb_resale_price()
hdb_rental_price = load_hdb_rental_price()
hdb_rental_price = hdb_rental_price.rename(columns={'Price': 'rental_price', 'Town': 'town', 'Year': 'year', 'Room Type': 'flat_type'})

material_price = load_material_price()
df = combine_rental_and_resale_price()

# combine with material price
df = df.merge(material_price, left_on=['year'], right_on=['year'])

In [527]:
df

Unnamed: 0,year,street_name,town,post_covid,in_covid,flat_type,floor_area_sqm,flat_model,lease_commence_date,resale_price,rental_price,central_marker,cement,concreting_sand,granite,mixed_concrete,steel
0,2015,ANG MO KIO AVE 4,ang mo kio,0,0,3 room,60.0,Improved,1986,255000.0,1925.0,0,92.966667,22.683333,19.708333,99.466667,501.400000
1,2015,ANG MO KIO AVE 10,ang mo kio,0,0,3 room,68.0,New Generation,1981,275000.0,1925.0,0,92.966667,22.683333,19.708333,99.466667,501.400000
2,2015,ANG MO KIO AVE 4,ang mo kio,0,0,3 room,69.0,New Generation,1980,285000.0,1925.0,0,92.966667,22.683333,19.708333,99.466667,501.400000
3,2015,ANG MO KIO AVE 10,ang mo kio,0,0,3 room,68.0,New Generation,1979,290000.0,1925.0,0,92.966667,22.683333,19.708333,99.466667,501.400000
4,2015,ANG MO KIO AVE 10,ang mo kio,0,0,3 room,68.0,New Generation,1980,290000.0,1925.0,0,92.966667,22.683333,19.708333,99.466667,501.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192518,2023,YISHUN ST 61,yishun,1,0,executive,142.0,Apartment,1987,788888.0,3225.0,0,115.583333,28.983333,18.983333,116.758333,879.616667
192519,2023,YISHUN ST 61,yishun,1,0,executive,146.0,Maisonette,1987,838000.0,3225.0,0,115.583333,28.983333,18.983333,116.758333,879.616667
192520,2023,YISHUN ST 61,yishun,1,0,executive,142.0,Apartment,1987,755000.0,3225.0,0,115.583333,28.983333,18.983333,116.758333,879.616667
192521,2023,YISHUN ST 81,yishun,1,0,executive,142.0,Apartment,1988,780000.0,3225.0,0,115.583333,28.983333,18.983333,116.758333,879.616667


In [525]:
# Resale price during COVID with DID
tmp_df = df[df['post_covid']==0]

res = ols('resale_price ~ in_covid + central_marker + in_covid*central_marker + lease_commence_date + flat_type + floor_area_sqm', data=tmp_df).fit()
with open('resale_did_during_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

# Apply time FE
time_fe_df = tmp_df.set_index(['street_name', 'year'])
time_fe_df['in_covid_central_marker'] = time_fe_df['in_covid'] * time_fe_df['central_marker']

result = PanelOLS(time_fe_df.resale_price, time_fe_df[['in_covid', 'in_covid_central_marker', 'central_marker', 'lease_commence_date', 'flat_type', 'floor_area_sqm']],
              time_effects=True,
              entity_effects=False
              ).fit()
with open('resale_did_during_covid_time_fe.txt', 'w') as fh:
    fh.write(result.summary.as_latex())
print(result)


# Rental price during COVID with DID
res = ols('rental_price ~ in_covid + central_marker + in_covid*central_marker + flat_type', data=tmp_df).fit()
with open('rental_did_during_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

# Apply time FE
result = PanelOLS(time_fe_df.rental_price, time_fe_df[['in_covid', 'in_covid_central_marker', 'central_marker', 'flat_type']],
              time_effects=True,
              entity_effects=False
              ).fit()
with open('rental_did_during_covid_time_fe.txt', 'w') as fh:
    fh.write(result.summary.as_latex())
print(result)

                            OLS Regression Results                            
Dep. Variable:           resale_price   R-squared:                       0.645
Model:                            OLS   Adj. R-squared:                  0.645
Method:                 Least Squares   F-statistic:                 2.668e+04
Date:                Fri, 19 Apr 2024   Prob (F-statistic):               0.00
Time:                        17:09:37   Log-Likelihood:            -1.6840e+06
No. Observations:              131913   AIC:                         3.368e+06
Df Residuals:                  131903   BIC:                         3.368e+06
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept               -7

In [526]:
# Resale price post COVID with DID
res = ols('resale_price ~ post_covid + central_marker + post_covid*central_marker + lease_commence_date + flat_type + floor_area_sqm', data=df).fit()
with open('resale_did_post_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

# Apply time FE
time_fe_df = df.set_index(['street_name', 'year'])
time_fe_df['post_covid_central_marker'] = time_fe_df['post_covid'] * time_fe_df['central_marker']

result = PanelOLS(time_fe_df.resale_price, time_fe_df[['post_covid', 'post_covid_central_marker', 'central_marker', 'lease_commence_date', 'flat_type', 'floor_area_sqm']],
              time_effects=True,
              entity_effects=False
              ).fit()
with open('resale_did_post_covid_time_fe.txt', 'w') as fh:
    fh.write(result.summary.as_latex())
print(result)


# Rental price post COVID with DID
res = ols('rental_price ~ post_covid + central_marker + post_covid*central_marker + flat_type', data=df).fit()
with open('rental_did_post_covid_no_fe.txt', 'w') as fh:
    fh.write(res.summary().as_text())
print(res.summary())

# Apply time FE
result = PanelOLS(time_fe_df.rental_price, time_fe_df[['post_covid', 'post_covid_central_marker', 'central_marker', 'flat_type']],
              time_effects=True,
              entity_effects=False
              ).fit()
with open('rental_did_post_covid_time_fe.txt', 'w') as fh:
    fh.write(result.summary.as_latex())
print(result)

                            OLS Regression Results                            
Dep. Variable:           resale_price   R-squared:                       0.689
Model:                            OLS   Adj. R-squared:                  0.689
Method:                 Least Squares   F-statistic:                 4.749e+04
Date:                Fri, 19 Apr 2024   Prob (F-statistic):               0.00
Time:                        17:09:41   Log-Likelihood:            -2.4622e+06
No. Observations:              192523   AIC:                         4.924e+06
Df Residuals:                  192513   BIC:                         4.925e+06
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

                          PanelOLS Estimation Summary                           
Dep. Variable:           rental_price   R-squared:                        0.5919
Estimator:                   PanelOLS   R-squared (Between):              0.5979
No. Observations:              192523   R-squared (Within):               0.1642
Date:                Fri, Apr 19 2024   R-squared (Overall):              0.5783
Time:                        17:09:43   Log-likelihood                -1.287e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   3.988e+04
Entities:                         503   P-value                           0.0000
Avg Obs:                       382.75   Distribution:                F(7,192507)
Min Obs:                       1.0000                                           
Max Obs:                       3086.0   F-statistic (robust):          3.988e+04
                            

In [None]:
import matplotlib.pyplot as plt

x_axis = ["0", "1"]
y_axis = ["value_1", "value_2", "value_3", ...]

plt.plot(x_axis, y_axis)
plt.title("title name")
plt.xlabel("x_axis name")
plt.ylabel("y_axis name")
plt.show()