In [3]:
def price_sq(x, bin_width=50):
  """Map a value (or array/Series of values) to its fixed-width bucket index.

  The bucket index is the floor division of ``x`` by ``bin_width``; with the
  default width of 50, values in [0, 50) map to 0, [50, 100) to 1, and so on.
  In this notebook it is applied to the price-per-square-foot column.

  Args:
    x: A number, NumPy array, or pandas Series; anything supporting ``//``.
    bin_width: Width of each bucket. Defaults to 50, the original constant.

  Returns:
    ``x // bin_width``, with the same container type as ``x``.

  Raises:
    ValueError: If ``bin_width`` is zero.
  """
  if bin_width == 0:
    raise ValueError("bin_width must be non-zero")
  return x // bin_width

In [5]:
import numpy as np
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset path below resolves.
drive.mount('/content/drive')
# -------------------------------------------------------------
# Load cleaned_final.csv from the mounted Drive into a DataFrame
# -------------------------------------------------------------
df = pd.read_csv("/content/drive/MyDrive/ds/cleaned_final.csv")
# -------------------------------------------------------------
# Analytical Linear Regression (Normal Equation)
# -------------------------------------------------------------
class NormalEqLinearRegression():
    """Linear regression fitted in closed form via the normal equations.

    ``fit`` standardizes each feature with statistics learned from the
    training data, prepends a bias column of ones, and solves
    (XᵀX) W = Xᵀy for the weights. ``predict`` re-applies the stored
    standardization before multiplying by the weights.

    Attributes set by ``fit``:
        mean: per-feature mean of the training features.
        std:  per-feature standard deviation of the training features plus 1e-8.
        W:    weight vector; the first entry is the bias term.
    """

    def fit(self, X, y):
        """Learn the standardization statistics and the regression weights.

        Args:
            X: 2-D feature matrix with one row per sample (NumPy array or
               pandas DataFrame; it must provide ``mean``/``std`` with an
               ``axis`` argument and a ``shape``).
            y: Target values, one per row of ``X``.
        """
        # Normalize features. The epsilon keeps the division finite when a
        # column is constant (std == 0).
        # Note: pandas computes std with ddof=1 by default, NumPy with ddof=0,
        # so the stored std depends on the container type of X.
        self.mean = X.mean(axis=0)
        self.std  = X.std(axis=0) + 1e-8
        Xn = (X - self.mean) / self.std

        # Add bias term
        Xn = np.hstack([np.ones((X.shape[0], 1)), Xn])

        # Normal equations: (XᵀX) W = Xᵀ y
        XtX = Xn.T @ Xn
        Xty = Xn.T @ np.asarray(y)

        # A constant feature standardizes to an all-zero column and collinear
        # features make XᵀX singular; np.linalg.solve raises LinAlgError in
        # that case. lstsq solves the same system and returns the
        # minimum-norm solution when XᵀX is rank-deficient.
        self.W = np.linalg.lstsq(XtX, Xty, rcond=None)[0]

    def predict(self, X):
        """Return predictions for ``X`` using the fitted mean, std and weights.

        ``fit`` must have been called first; ``X`` must have the same feature
        columns as the training matrix.
        """
        Xn = (X - self.mean) / self.std
        Xn = np.hstack([np.ones((X.shape[0], 1)), Xn])
        return Xn @ self.W

    def r2(self, y, y_hat):
        """Coefficient of determination: 1 - SS_res / SS_tot.

        ``y`` must provide a ``mean()`` method (NumPy array or pandas Series).
        If ``y`` is constant, SS_tot is zero and the result is not finite.
        """
        ss_tot = np.sum((y - y.mean())**2)
        ss_res = np.sum((y - y_hat)**2)
        return 1 - ss_res/ss_tot

    def mae(self, y, y_hat):
        """Mean absolute error between targets ``y`` and predictions ``y_hat``."""
        return np.mean(np.abs(y - y_hat))


# -------------------------------------------------------------
# Train/test split (recommended)
# -------------------------------------------------------------
from sklearn.model_selection import train_test_split
# Derived columns: price per square foot and its bucket index from price_sq.
# NOTE(review): both columns are computed from `sold_price`, which is also the
# regression target below, so the `cat` feature carries target information
# into X. Confirm this is intended before trusting the reported scores.
price_per_sqft = df['sold_price'] / df['sqrt_ft']
df['price/sqrt_ft'] = price_per_sqft
df['cat'] = price_sq(df['price/sqrt_ft'])

feature_columns = ['cat', 'bathrooms', 'sqrt_ft']
X = df[feature_columns]
y = df['sold_price']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The model is trained on log1p(price); expm1 below inverts the transform.
y_train_scaled, y_test_scaled = np.log1p(y_train), np.log1p(y_test)

# -------------------------------------------------------------
# Fit the closed-form model on the training split
# -------------------------------------------------------------
model = NormalEqLinearRegression()
model.fit(X_train, y_train_scaled)

# -------------------------------------------------------------
# Predict in log space, then map back to the original price scale
# -------------------------------------------------------------
y_hat_train_scaled = model.predict(X_train)
y_hat_train = np.expm1(y_hat_train_scaled)

y_hat_test_scaled = model.predict(X_test)
y_hat_test = np.expm1(y_hat_test_scaled)

# -------------------------------------------------------------
# Report metrics: R2 is measured in log space, MAE on the original price scale
# -------------------------------------------------------------
train_r2 = model.r2(y_train_scaled, y_hat_train_scaled)
print("Train R2:", train_r2)
train_mae = model.mae(y_train, y_hat_train)
print("Train MAE:", train_mae)

test_r2 = model.r2(y_test_scaled, y_hat_test_scaled)
print("Test R2:", test_r2)
test_mae = model.mae(y_test, y_hat_test)
print("Test MAE:", test_mae)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train R2: 0.8437783780902237
Train MAE: 75795.87738788972
Test R2: 0.7315327730881078
Test MAE: 98403.22328888066
