In [7]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# --- Load data ---------------------------------------------------------------
# The full dataset plus the separate train / test CSV files.
housing_data = pd.read_csv('kc_house_data.csv')
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# --- Remove columns whose name starts with "unnamed" (case-insensitive) ------
# Presumably leftover index columns written by an earlier to_csv() -- confirm.
is_unnamed_train = train_data.columns.str.contains("^unnamed", case=False)
train_data = train_data.loc[:, ~is_unnamed_train]
is_unnamed_test = test_data.columns.str.contains("^unnamed", case=False)
test_data = test_data.loc[:, ~is_unnamed_test]

# --- Features and targets ----------------------------------------------------
# Features are every column except the ones listed here; errors='ignore' lets
# the drop succeed even if one of them is missing from a file.
non_feature_columns = ['id', 'date', 'zipcode', 'price']
X_train = train_data.drop(columns=non_feature_columns, errors='ignore')
X_test = test_data.drop(columns=non_feature_columns, errors='ignore')

# Targets are the price divided by 1000.
Y_train = train_data['price'].div(1000)
Y_test = test_data['price'].div(1000)

# --- Standardise features ----------------------------------------------------
# The scaler is fitted on the training features only and then applied to both
# partitions, so the test set does not influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
def add_intercept(X):
    """Prepend a column of ones to ``X`` so a linear model can learn a bias term.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute, 2-D
        Feature matrix; one row per sample.

    Returns
    -------
    numpy.ndarray
        Matrix with one extra leading column filled with 1.0.
    """
    n_rows = X.shape[0]
    bias_column = np.ones((n_rows, 1))
    # Joining along axis 1 places the bias column to the left of the features.
    return np.concatenate((bias_column, X), axis=1)

# Design matrices for the closed-form fit: the scaled features with a leading
# column of ones added by add_intercept (train first, then test).
X_train_closeform, X_test_closeform = map(add_intercept, (X_train_scaled, X_test_scaled))

def closed_form_solution(X, y):
    """Return the least-squares coefficients ``theta`` for ``X @ theta ~ y``.

    The problem is solved with ``np.linalg.lstsq`` instead of evaluating
    ``np.linalg.inv(X.T @ X) @ X.T @ y`` directly: the explicit-inverse form
    gave results very different from the Problem 2 model (presumably because
    ``X.T @ X`` is ill-conditioned for these features -- not verified here).
    The switch to ``lstsq`` was made with AI assistance.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Design matrix; include the intercept column if a bias term is wanted.
    y : array-like of shape (n_samples,) or (n_samples, k)
        Target values, e.g. a pandas Series or a NumPy array.

    Returns
    -------
    numpy.ndarray
        The least-squares solution returned by ``np.linalg.lstsq``.
    """
    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution is needed. np.asarray accepts Series, lists and arrays alike.
    theta, *_ = np.linalg.lstsq(X, np.asarray(y), rcond=None)
    return theta


# Fit once at cell level so that `theta` is defined for the predictions below.
# (This call used to sit inside the function body, which made the function
# call itself without end and left `theta` undefined in the notebook.)
theta = closed_form_solution(X_train_closeform, Y_train)

def predict(X, theta):
    """Return the linear-model predictions ``X @ theta``.

    Parameters
    ----------
    X : matrix-like
        Design matrix; its column count must match the length of ``theta``.
    theta : vector-like
        Model coefficients.

    Returns
    -------
    The matrix product of ``X`` and ``theta``.
    """
    predictions = X @ theta
    return predictions

# Predictions of the closed-form model on both partitions.
y_train_pred_closeform = predict(X_train_closeform, theta)
y_test_pred_closeform = predict(X_test_closeform, theta)

# Training-set metrics.
train_mse_closeform = mean_squared_error(y_true=Y_train, y_pred=y_train_pred_closeform)
train_r2_closeform = r2_score(y_true=Y_train, y_pred=y_train_pred_closeform)

# Test-set metrics.
test_mse_closeform = mean_squared_error(y_true=Y_test, y_pred=y_test_pred_closeform)
test_r2_closeform = r2_score(y_true=Y_test, y_pred=y_test_pred_closeform)

# Report: training figures first, then test figures.
print('Closed form training MSE: ', train_mse_closeform)
print('Closed form training R^2: ', train_r2_closeform)
print('Closed form test MSE: ', test_mse_closeform)
print('Closed form test R^2: ', test_r2_closeform)


Closed form training MSE:  31486.16777579488
Closed form training R^2:  0.7265334318706018
Closed form test MSE:  57628.1547056704
Closed form test R^2:  0.6543560876120953


Comparison to Problem 2

In [None]:
# Reference model: scikit-learn's LinearRegression fitted on the same scaled
# training features, so its metrics can be compared with the closed-form ones.
model = LinearRegression()
model.fit(X_train_scaled, Y_train)

# Training-set metrics.
y_train_pred = model.predict(X_train_scaled)
train_mse = mean_squared_error(Y_train, y_train_pred)
train_r2 = r2_score(Y_train, y_train_pred)

# Plain string literals: the labels have no placeholders, so no f-prefix.
print('Training MSE: ', train_mse)
print('Training R^2: ', train_r2)

# Test-set metrics.
y_test_pred = model.predict(X_test_scaled)
test_mse = mean_squared_error(Y_test, y_test_pred)
test_r2 = r2_score(Y_test, y_test_pred)

print('Test MSE: ', test_mse)
print('Test R^2: ', test_r2)


Training MSE:  31486.16777579488
Training R^2:  0.7265334318706018
Test MSE:  57628.154705670386
Test R^2:  0.6543560876120954
