In [1]:
# Import all dependencies for multiple linear regression modeling. Full sample code from ChatGPT
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Read in the data.
# The CSV location defaults to the original author's local path; set the
# REAL_ESTATE_CSV environment variable to point at the file on another machine
# instead of editing this cell.
DATA_PATH = os.environ.get(
    'REAL_ESTATE_CSV',
    '/Users/helenamabey/Stats_Spring_2025/Real_estate.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,No,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [5]:
# Update date format
from datetime import datetime, timedelta

def decimal_year_to_date(decimal_year):
    """Convert a decimal year such as 2013.5 into a 'YYYY-MM-DD' string.

    The fractional part is read as the fraction of that calendar year that
    has elapsed, so the length of the year (365 or 366 days) is taken into
    account. Any time-of-day component is dropped by the date formatting.

    Parameters
    ----------
    decimal_year : float
        Year with a fractional part, e.g. 2012.917.

    Returns
    -------
    str
        The corresponding calendar date formatted as 'YYYY-MM-DD'.
    """
    whole_year = int(decimal_year)
    jan_first = datetime(whole_year, 1, 1)
    next_jan_first = datetime(whole_year + 1, 1, 1)
    # Leap years have 366 days, so measure the year instead of assuming 365
    year_length_days = (next_jan_first - jan_first).days
    elapsed = timedelta(days=(decimal_year - whole_year) * year_length_days)
    return (jan_first + elapsed).strftime("%Y-%m-%d")

# Convert only while the column still holds decimal years (a numeric dtype).
# Once it has been converted to strings/datetimes, re-running this cell is a
# no-op instead of raising inside decimal_year_to_date.
if pd.api.types.is_numeric_dtype(df['Transaction date']):
    df['Transaction date'] = [decimal_year_to_date(d) for d in df['Transaction date']]

df.head()

Unnamed: 0,No,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,1,2012-12-01,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012-12-01,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013-08-01,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013-07-02,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012-10-31,5.0,390.5684,5,24.97937,121.54245,43.1


In [7]:
# Correct date data type: parse the 'YYYY-MM-DD' strings produced by
# decimal_year_to_date into pandas datetime values
df['Transaction date'] = pd.to_datetime(df['Transaction date'])
df.head()

Unnamed: 0,No,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,1,2012-12-01,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012-12-01,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013-08-01,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013-07-02,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012-10-31,5.0,390.5684,5,24.97937,121.54245,43.1


In [9]:
# Obtain the summary statistics (count, mean, std, min, quartiles, max) on the full data set
df.describe()

Unnamed: 0,No,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,207.5,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,119.655756,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,1.0,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,104.25,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,207.5,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,310.75,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,414.0,43.8,6488.021,10.0,25.01459,121.56627,117.5


## Multiple Linear Regression Model: Question 2 #9

In [11]:
# Obtain the summary statistics for the requested comparison features only:
# house age, house price of unit area, and distance to the nearest MRT station
df[['House age','House price of unit area','Distance to the nearest MRT station']].describe()

Unnamed: 0,House age,House price of unit area,Distance to the nearest MRT station
count,414.0,414.0,414.0
mean,17.71256,37.980193,1083.885689
std,11.392485,13.606488,1262.109595
min,0.0,7.6,23.38284
25%,9.025,27.7,289.3248
50%,16.1,38.45,492.2313
75%,28.15,46.6,1454.279
max,43.8,117.5,6488.021


In [13]:
# Compute the pairwise correlation matrix between age, price, and distance
# (DataFrame.corr uses the Pearson coefficient by default)
correlation = df[['House age', 'House price of unit area','Distance to the nearest MRT station']].corr()
correlation

Unnamed: 0,House age,House price of unit area,Distance to the nearest MRT station
House age,1.0,-0.210567,0.025622
House price of unit area,-0.210567,1.0,-0.673613
Distance to the nearest MRT station,0.025622,-0.673613,1.0


In [15]:
# Define X (Independent Variables) and y (Target Variable):
# unit-area price is modeled from house age and distance to the nearest MRT station
X = df[['House age', 'Distance to the nearest MRT station']]
y = df['House price of unit area']

In [17]:
# Split data into training (70%) and testing (30%) sets.
# random_state is fixed so the same split is reproduced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [19]:
# Initialize the Linear Regression model and fit it on the training split only,
# leaving the test split unseen for evaluation
LR = LinearRegression()
LR.fit(X_train, y_train)

In [21]:
# Assign variables for Predictions: fitted values on the training split and
# predictions on the held-out test split
LR_Predictions_Train = LR.predict(X_train)
LR_Predictions_Test = LR.predict(X_test)

In [23]:
# Compute residuals (observed minus predicted) for each split and assign variables
residuals_train = y_train - LR_Predictions_Train
residuals_test = y_test - LR_Predictions_Test

In [25]:
# Model Performance Metrics
# R^2 (coefficient of determination) for the training and test predictions
r2_train = r2_score(y_train, LR_Predictions_Train)
r2_test = r2_score(y_test, LR_Predictions_Test)

In [27]:
# Number of training rows (n_train) and predictors (k), plus the number of
# test rows (n_test); these feed the adjusted R^2 calculation
n_train, k = X_train.shape
n_test = X_test.shape[0]

In [29]:
# Obtain adjusted R^2 values for test and train:
#   adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
# where n is the number of rows in the split and k the number of predictors
adj_r2_train = 1 - (1 - r2_train) * ((n_train - 1) / (n_train - k - 1))
adj_r2_test = 1 - (1 - r2_test) * ((n_test - 1) / (n_test - k - 1))

In [31]:
# Obtain Mean Square Error (average squared prediction error) for test and train
mse_train = mean_squared_error(y_train, LR_Predictions_Train)
mse_test = mean_squared_error(y_test, LR_Predictions_Test)

In [33]:
# Obtain Root Mean Square Error for test and train: the square root of the MSE,
# which puts the error back in the same units as the target
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

In [35]:
# Fit the same training data with statsmodels to get the full OLS summary table.
# sm.OLS does not add an intercept on its own, so add a constant column first.
X_train_with_const = sm.add_constant(X_train)
ols_model = sm.OLS(y_train, X_train_with_const).fit()
summary_table = ols_model.summary()

## Regression Model Comparison: Question 2 #10

### Multiple linear regression model results

In [37]:
# Print summary results all together
def _print_fit_metrics(heading, r2, adj_r2, mse, rmse):
    """Print one heading line followed by the four fit metrics, each to 4 decimals."""
    print(heading)
    print(f"R-squared: {r2:.4f}")
    print(f"Adjusted R-squared: {adj_r2:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


# Fitted scikit-learn model: intercept and one coefficient per predictor
print("Scikit-learn Linear Regression Summary:")
print(f"Intercept: {LR.intercept_:.4f}")
print("Coefficients:")
coefficients = pd.Series(LR.coef_, index=X_train.columns)
print(coefficients)

# Same four metrics for each split, training first
_print_fit_metrics("Training Set Performance:", r2_train, adj_r2_train, mse_train, rmse_train)
_print_fit_metrics("Test Set Performance:", r2_test, adj_r2_test, mse_test, rmse_test)

# Display full statsmodels-style summary
print("Statsmodels OLS Summary (Training Data):")
print()
print(summary_table)

Scikit-learn Linear Regression Summary:
Intercept: 50.2065
Coefficients:
House age                             -0.244615
Distance to the nearest MRT station   -0.007079
dtype: float64
Training Set Performance:
R-squared: 0.4750
Adjusted R-squared: 0.4713
Mean Squared Error (MSE): 106.8896
Root Mean Squared Error (RMSE): 10.3387
Test Set Performance:
R-squared: 0.5401
Adjusted R-squared: 0.5326
Mean Squared Error (MSE): 64.4566
Root Mean Squared Error (RMSE): 8.0285
Statsmodels OLS Summary (Training Data):

                               OLS Regression Results                               
Dep. Variable:     House price of unit area   R-squared:                       0.475
Model:                                  OLS   Adj. R-squared:                  0.471
Method:                       Least Squares   F-statistic:                     129.4
Date:                      Sun, 02 Mar 2025   Prob (F-statistic):           9.71e-41
Time:                              08:21:08   Log-Likelihood:  

In [41]:
# Variance Inflation Factor (VIF) check for multicollinearity between the predictors.
# The VIFs printed for the two features are close to 1, i.e. no sign of
# multicollinearity among the applicable features.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# The VIF calculation is run on the predictors plus an explicit constant column
X_with_const = sm.add_constant(X)
design_matrix = X_with_const.values

# One VIF per column of the design matrix (the constant column included)
vif_data = pd.DataFrame({
    "Feature": X_with_const.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})

# Display VIF values
print(vif_data)

                               Feature       VIF
0                                const  4.095876
1                            House age  1.000657
2  Distance to the nearest MRT station  1.000657


In [45]:
from scipy import stats

# Two-sided t-tests for the two regression slopes (H0: coefficient = 0).

# Given values (from statsmodels output)
# NOTE(review): these were copied by hand from printed output, so they are only
# as precise as what was displayed; ols_model.params and ols_model.bse hold the
# fitted values and could be read directly instead.
beta_1 = -0.244615  # House Age
std_err_1 = 0.053
beta_2 = -0.007079  # Distance to MRT
std_err_2 = 0.00047

# Residual degrees of freedom: n - k - 1 (the extra 1 is for the intercept).
# Stored as `dof`, not `df`, so the DataFrame `df` loaded at the top of the
# notebook is not overwritten by an integer.
n_obs, n_predictors = X_train.shape
dof = n_obs - n_predictors - 1

# Compute t-statistic and p-value for both predictors.
# The survival function sf(x) = 1 - cdf(x) is used for the tail probability
# because it stays accurate for very small p-values.
t_statistic_1 = beta_1 / std_err_1
p_value_1 = 2 * stats.t.sf(abs(t_statistic_1), dof)

t_statistic_2 = beta_2 / std_err_2
p_value_2 = 2 * stats.t.sf(abs(t_statistic_2), dof)

print("Multiple Regression - House Age & Distance to MRT")
print(f"House Age: T-statistic = {t_statistic_1:.3f}, P-value = {p_value_1:.6f}")
print(f"Distance to MRT: T-statistic = {t_statistic_2:.3f}, P-value = {p_value_2:.6f}")

Multiple Regression - House Age & Distance to MRT
House Age: T-statistic = -4.615, P-value = 0.000006
Distance to MRT: T-statistic = -15.062, P-value = 0.000000
