In [5]:
import numpy as np 
import pandas as pd 

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score

In [3]:
# Location of the housing dataset; despite the name this is a relative
# file path, resolved against the notebook's working directory.
url = "USA_Housing.csv"

# Read the whole CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=url)

In [9]:
# Target vector: the "Price" column.
Y = data["Price"]

# Feature matrix: every remaining column of `data`.
X = data.drop("Price", axis="columns")

In [11]:
# Standardise each feature column (zero mean, unit variance).
# Note: the scaler is fitted on every row of X, i.e. before any
# train/test split is made further down.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [19]:
# Best-so-far trackers for the cross-validation loop: start below any
# achievable score so the first fold always becomes the current best.
best_beta = None
best_r2 = float("-inf")

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [29]:
print("---- 5-Fold Cross Validation ----")

# Reset the best-so-far trackers inside this cell so that re-running the
# cell on its own does not compare against a stale best from an earlier run.
best_r2 = -np.inf
best_beta = None

for fold, (train_index, test_index) in enumerate(kf.split(X_scaled), start=1):
    # Slice this fold's train/test rows (X_scaled is positional, Y uses iloc).
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Prepend a column of ones so beta[0] acts as the intercept.
    X_train_b = np.c_[np.ones(X_train.shape[0]), X_train]
    X_test_b = np.c_[np.ones(X_test.shape[0]), X_test]

    # Step (d): Least Squares Regression.
    # Mathematically beta = (X^T X)^(-1) X^T y, but explicitly inverting
    # X^T X is numerically fragile and breaks on collinear columns.
    # lstsq solves the same least-squares problem without forming the inverse.
    beta, *_ = np.linalg.lstsq(
        X_train_b, np.asarray(Y_train, dtype=float), rcond=None
    )

    # Score the fitted coefficients on the held-out fold.
    Y_pred = X_test_b @ beta
    r2 = r2_score(Y_test, Y_pred)

    print(f"Fold {fold}: R2 Score = {r2:.4f}")

    # Keep the coefficients from the fold with the highest held-out R2.
    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

print("\nBest R2 Score:", best_r2)
print("Best Beta Matrix (coefficients):\n", best_beta)

---- 5-Fold Cross Validation ----
Fold 1: R2 Score = 0.9180
Fold 2: R2 Score = 0.9146
Fold 3: R2 Score = 0.9116
Fold 4: R2 Score = 0.9193
Fold 5: R2 Score = 0.9244

Best R2 Score: 0.9243869413350317
Best Beta Matrix (coefficients):
 [1.23161736e+06 2.30225051e+05 1.63956839e+05 1.21115120e+05
 7.83467170e+02 1.50662447e+05]


In [21]:
# Hold out 30% of the rows for the final evaluation; the fixed seed makes
# the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.3,
    random_state=42,
)

In [23]:
# Prepend a column of ones to each design matrix so the first coefficient
# of the fitted model acts as the intercept (bias) term.
X_train_b = np.column_stack((np.ones(X_train.shape[0]), X_train))
X_test_b = np.column_stack((np.ones(X_test.shape[0]), X_test))

In [25]:
# Fit ordinary least squares on the training split.
# Mathematically beta = (X^T X)^(-1) X^T y, but explicitly inverting X^T X
# is numerically fragile and breaks on collinear columns; lstsq solves the
# same least-squares problem without forming the inverse.
beta_final, *_ = np.linalg.lstsq(
    X_train_b, np.asarray(Y_train, dtype=float), rcond=None
)

# Evaluate the fitted coefficients on the held-out test rows.
Y_pred_final = X_test_b @ beta_final
final_r2 = r2_score(Y_test, Y_pred_final)

print("\nFinal Model Performance on 30% Test Data:")
print("R2 Score =", final_r2)


Final Model Performance on 30% Test Data:
R2 Score = 0.9146818498916266
