In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Read the full dataset plus the pre-made train/test splits from the working directory.
housing_data = pd.read_csv('kc_house_data.csv')
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

# Keep only columns whose name does not start with "unnamed" (case-insensitive).
# NOTE(review): presumably these are index columns left over from an earlier to_csv — confirm.
train_data = train_data.loc[
    :, ~train_data.columns.str.contains("^unnamed", case=False)
]
test_data = test_data.loc[
    :, ~test_data.columns.str.contains("^unnamed", case=False)
]

# Single feature (sqft_living) as an (n, 1) array; target is price divided by 1000.
X_train = train_data[['sqft_living']].to_numpy()
Y_train = train_data['price'].div(1000)

X_test = test_data[['sqft_living']].to_numpy()
Y_test = test_data['price'].div(1000)

# The scaler is fitted on the training feature only and then applied to both
# splits, so the test split does not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Question 2

In [2]:
def ridge_gradient_descent(X, y, alpha, num_iters, lam, theta0=None, penalize_intercept=False):
    """Estimate ridge-regression coefficients with batch gradient descent.

    Every iteration takes one step of size ``alpha`` along the negative
    gradient of

        (1/m) * ||X @ theta - y||^2  +  lam * sum_j theta_j^2

    where the penalty sum skips ``theta[0]`` unless ``penalize_intercept`` is
    True. Note that the data term is averaged over the ``m`` rows while the
    penalty term is not.

    Parameters
    ----------
    X : array-like of shape (m, d)
        Design matrix. When ``penalize_intercept`` is False, column 0 is
        treated as the intercept column and left unpenalized.
    y : array-like with m elements
        Targets. A 1-D sequence, a column vector of shape (m, 1) or a pandas
        Series are all accepted; the values are flattened to shape (m,).
    alpha : float
        Step size (learning rate).
    num_iters : int
        Number of gradient steps to take.
    lam : float
        Ridge penalty strength.
    theta0 : array-like with d elements, optional
        Starting point. Defaults to all zeros. The caller's object is never
        modified.
    penalize_intercept : bool, default False
        If True, ``theta[0]`` is penalized like every other coefficient.

    Returns
    -------
    numpy.ndarray of shape (d,)
        Coefficients after ``num_iters`` updates.

    Raises
    ------
    ValueError
        If ``X`` is not a non-empty 2-D array, or if ``y`` / ``theta0`` do not
        have the number of elements implied by ``X``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D, got an array with {X.ndim} dimension(s)")
    m, d = X.shape
    if m == 0:
        raise ValueError("X must contain at least one row")

    # Flatten y so an (m, 1) column vector cannot silently broadcast
    # `preds - y` into an (m, m) matrix.
    y = np.asarray(y, dtype=float).reshape(-1)
    if y.shape[0] != m:
        raise ValueError(f"y has {y.shape[0]} elements but X has {m} rows")

    if theta0 is None:
        theta = np.zeros(d)
    else:
        # np.array copies, so the caller's starting point is left untouched.
        theta = np.array(theta0, dtype=float).reshape(-1)
        if theta.shape[0] != d:
            raise ValueError(f"theta0 has {theta.shape[0]} elements but X has {d} columns")

    for _ in range(num_iters):
        residual = X @ theta - y
        grad = (2 / m) * (X.T @ residual)

        reg_grad = 2 * lam * theta
        if not penalize_intercept:
            reg_grad[0] = 0  # don't penalize intercept

        theta = theta - alpha * (grad + reg_grad)

    return theta

Question 3

In [3]:
# Seed NumPy's global RNG so the simulated sample is identical on every run.
np.random.seed(0)

N = 1000
# The draw order is part of the reproducibility contract: X first, then the noise.
X = np.random.uniform(low=-2, high=2, size=N)
e = np.random.normal(loc=0, scale=np.sqrt(2), size=N)  # standard deviation sqrt(2)
Y = 1 + 2 * X + e  # generating line: intercept 1, slope 2

# Design matrix with a leading column of ones for the intercept term.
X_design = np.stack((np.ones(N), X), axis=1)
y = Y

# Ordinary least-squares fit; only the coefficient vector is kept.
theta_ols, *_ = np.linalg.lstsq(X_design, y, rcond=None)

def ridge_closed_form(X, y, lam, penalize_intercept=False):
    """Solve the ridge normal equations ``(X.T X + lam * L) theta = X.T y``.

    ``L`` is the d x d identity matrix; unless ``penalize_intercept`` is True
    its top-left entry is zeroed so that the coefficient of column 0 (treated
    as the intercept column) carries no penalty.

    Parameters
    ----------
    X : numpy.ndarray of shape (m, d)
        Design matrix.
    y : numpy.ndarray
        Targets with m rows.
    lam : float
        Ridge penalty strength multiplying ``L``.
    penalize_intercept : bool, default False
        If True, the coefficient of column 0 is penalized as well.

    Returns
    -------
    numpy.ndarray
        Solution returned by ``np.linalg.solve`` for the system above.
    """
    n_features = X.shape[1]

    penalty = np.eye(n_features)
    if not penalize_intercept:
        # Remove the penalty on the first coefficient only.
        penalty[0, 0] = 0

    gram = X.T @ X
    rhs = X.T @ y
    return np.linalg.solve(gram + lam * penalty, rhs)

# Penalty strengths to compare; 0 stands for the unpenalized OLS fit.
lambdas = [0, 1, 10, 100, 1000, 10000]
rows = []

for lam in lambdas:
    if lam != 0:
        # Closed-form ridge solution with the intercept left unpenalized.
        label = f"Ridge λ={lam}"
        theta = ridge_closed_form(X_design, y, lam, penalize_intercept=False)
    else:
        # Reuse the least-squares coefficients computed earlier.
        label = "OLS"
        theta = theta_ols

    # Metrics are computed on the same sample the coefficients were fitted to.
    y_pred = X_design @ theta
    rows.append(
        {
            "Model": label,
            "Intercept": theta[0],
            "Slope": theta[1],
            "MSE": mean_squared_error(y, y_pred),
            "R^2": r2_score(y, y_pred),
        }
    )

# One row per model, in the order of `lambdas`.
results = pd.DataFrame(rows)
print(results)

           Model  Intercept     Slope       MSE       R^2
0            OLS   1.040515  1.965692  1.865374  0.736759
1      Ridge λ=1   1.040491  1.964238  1.865377  0.736759
2     Ridge λ=10   1.040279  1.951251  1.865656  0.736720
3    Ridge λ=100   1.038305  1.830236  1.890166  0.733261
4   Ridge λ=1000   1.026876  1.129641  2.809812  0.603481
5  Ridge λ=10000   1.012264  0.233982  5.917267  0.164958
