In [13]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

housing_data = pd.read_csv('kc_house_data.csv')
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

train_data = train_data.loc[:, ~train_data.columns.str.contains("^unnamed", case=False)]
test_data  = test_data.loc[:, ~test_data.columns.str.contains("^unnamed", case=False)]


X_train = train_data['sqft_living'].to_numpy().reshape(-1, 1)
Y_train = train_data['price']/1000

X_test = test_data['sqft_living'].to_numpy().reshape(-1, 1)
Y_test = test_data['price']/1000

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [15]:
def make_poly_design(x, p):
    x = np.asarray(x).reshape(-1)
    cols = [np.ones_like(x)]
    cols += [x**k for k in range(1, p+1)]
    return np.column_stack(cols)

def fit_least_squares(X, y):
    theta, *_ = np.linalg.lstsq(X, y, rcond=None)
    return theta

def predict(X, theta):
    return X @ theta

degrees = [1, 2, 3, 4, 5] 
rows = []

for p in degrees:
    Xtrain = make_poly_design(X_train, p)
    Xtest = make_poly_design(X_test,  p)

    theta = fit_least_squares(Xtrain, Y_train)

    Ytrain_pred = predict(Xtrain, theta)
    Ytest_pred = predict(Xtest, theta)

    rows.append({
        "p": p,
        "Train MSE": mean_squared_error(Y_train, Ytrain_pred),
        "Train R^2": r2_score(Y_train, Ytrain_pred),
        "Test MSE": mean_squared_error(Y_test, Ytest_pred),
        "Test R^2": r2_score(Y_test, Ytest_pred),
    })

results_df = pd.DataFrame(rows)
print(results_df)

   p     Train MSE  Train R^2      Test MSE    Test R^2
0  1  57947.526161   0.496709  8.857598e+04    0.468736
1  2  54822.665116   0.523849  7.179168e+04    0.569406
2  3  53785.194716   0.532860  9.983348e+04    0.401216
3  4  52795.850024   0.541453  2.497221e+05   -0.497791
4  5  54114.912599   0.529996  9.326506e+07 -558.388062
