In [70]:
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize
from statsmodels.formula.api import ols
from tabulate import tabulate

In [242]:
def load_hdb_resale_price():
    used_columns = ['year','town', 'covid','flat_type', 'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price']
    df1 = pd.read_csv('data/ResaleFlatPricesBasedonRegistrationDateFromJan2015toDec2016.csv')
    df2 = pd.read_csv('data/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv')
    df = pd.concat([df1, df2])
    df['year'] = df['month'].apply(lambda x: int(x.split('-')[0]))
    df['month'] = df['month'].apply(lambda x: int(x.split('-')[-1]))
    df['flat_type'] = df['flat_type'].apply(lambda x: x.lower())
    df['town'] = df['town'].apply(lambda x: x.lower())
    df['covid'] = df.apply(lambda x: 1 if x['year']>=2020 and x['month']>4 else 0, axis=1)
    return df[used_columns]

def load_hdb_rental_price():
    hdb_rental_price = pd.read_excel('data/HDB Rental Price (from 2007-3qtr to 2023-3qtr).xlsx')
    hdb_rental_price.columns = hdb_rental_price.iloc[0]
    hdb_rental_price = hdb_rental_price[1:]
    del hdb_rental_price[hdb_rental_price.columns[0]]
      
    hdb_rental_price = hdb_rental_price.melt(
    id_vars=['Town', 'Year '],
    value_vars=['1-Room', '2-Room', '3-Room', '4-Room','5-Room', 'Executive'],
    var_name='Room Type', value_name='Price')
    
    hdb_rental_price = hdb_rental_price[(hdb_rental_price['Price'] != '-') & (hdb_rental_price['Price'] != '*')]
    hdb_rental_price['Room Type'] = hdb_rental_price['Room Type'].apply(lambda x: x.replace('-', ' ').lower())
    hdb_rental_price = hdb_rental_price.rename(columns={'Year ': 'Year'})
    hdb_rental_price['Price'] = hdb_rental_price['Price'].apply(lambda x: int(x.replace('$', '').replace('.','').replace(',','')))
    hdb_rental_price['Town'] = hdb_rental_price['Town'].apply(lambda x: x.lower())
    
    df = hdb_rental_price.groupby(['Town','Year', 'Room Type']).mean()
    df = df.reset_index()
    
    return df[df['Year']>=2015]

def combine_rental_and_resale_price():
    resale = load_hdb_resale_price()
    rental = load_hdb_rental_price()
    
    combined_df = resale.merge(rental, left_on=['year', 'town', 'flat_type'], right_on=['Year', 'Town', 'Room Type'])
    
    used_columns=list(resale.columns)
    used_columns.append('rental_price')
    combined_df = combined_df.rename(columns={'Price': 'rental_price'})
    return combined_df[used_columns]

In [243]:
hdb_resale_price = load_hdb_resale_price()
hdb_rental_price = load_hdb_rental_price()
df = combine_rental_and_resale_price()

In [244]:
df

Unnamed: 0,year,town,covid,flat_type,floor_area_sqm,flat_model,lease_commence_date,resale_price,rental_price
0,2015,ang mo kio,0,3 room,60.0,Improved,1986,255000.0,1925.0
1,2015,ang mo kio,0,3 room,68.0,New Generation,1981,275000.0,1925.0
2,2015,ang mo kio,0,3 room,69.0,New Generation,1980,285000.0,1925.0
3,2015,ang mo kio,0,3 room,68.0,New Generation,1979,290000.0,1925.0
4,2015,ang mo kio,0,3 room,68.0,New Generation,1980,290000.0,1925.0
...,...,...,...,...,...,...,...,...,...
192518,2023,yishun,1,executive,142.0,Apartment,1987,788888.0,3225.0
192519,2023,yishun,1,executive,146.0,Maisonette,1987,838000.0,3225.0
192520,2023,yishun,1,executive,142.0,Apartment,1987,755000.0,3225.0
192521,2023,yishun,1,executive,142.0,Apartment,1988,780000.0,3225.0


In [245]:
res = ols('resale_price ~ covid + town + flat_type + floor_area_sqm', data=df).fit()

In [246]:
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           resale_price   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.613
Method:                 Least Squares   F-statistic:                 1.087e+04
Date:                Sun, 14 Apr 2024   Prob (F-statistic):               0.00
Time:                        14:42:27   Log-Likelihood:            -2.4835e+06
No. Observations:              192523   AIC:                         4.967e+06
Df Residuals:                  192494   BIC:                         4.967e+06
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept               1.38