# In [22]:
import os
import numpy as np
import pandas as pd
from google.colab import drive
# Mount Google Drive into the Colab filesystem; the CSV read below lives under this mount point.
drive.mount('/content/drive')

# Location of the input CSV on the mounted drive.
path = "/content/drive/MyDrive/ds/cleaned_final.csv"
# Seed handed to np.random.seed before the train/test shuffle.
rseed = 42
# Fraction of rows assigned to the training split; the remainder becomes the test split.
TFRAC = 0.8


# Gradient-descent settings passed to MVLinearRegression.fit.
GD_LR0 = 1e-3  # initial learning rate (fit decays it as epochs progress)
GD_EPOCHS = 15000  # number of gradient steps
GD_VERBOSE = False  # when True, fit prints the loss at intervals


# Load the full dataset into a DataFrame.
df = pd.read_csv(path)
# ---------------------------
# 3. Basic cleaning (keep original behavior)
# ---------------------------
# Fill missing floor_covering values with the most frequent value.
# mode() is empty when the column has no non-null entries, so guard before
# indexing it instead of failing with an IndexError.
if 'floor_covering' in df.columns:
    _floor_mode = df['floor_covering'].mode()
    if not _floor_mode.empty:
        df['floor_covering'] = df['floor_covering'].fillna(_floor_mode.iloc[0])

# ---------------------------
# 4. Controlled feature engineering
# ---------------------------
# Log transforms to linearize skewed numeric features (missing values count as 0).
df['log_sqft'] = np.log1p(df['sqrt_ft'].fillna(0))
df['log_acres'] = np.log1p(df['lot_acres'].fillna(0))
# Single quadratic term (only one) to capture curvature.
df['sqft_sq'] = df['sqrt_ft'].fillna(0) ** 2

# era: bucket year_built into six right-closed intervals coded 0..5.
# labels=False makes pd.cut return the integer bin codes directly. Unlike
# casting a labelled Categorical with .astype(int), this does not raise when
# year_built is missing or falls outside the bins: such rows get NaN.
if 'year_built' in df.columns:
    df['era'] = pd.cut(
        df['year_built'],
        bins=[1800, 1945, 1970, 1990, 2000, 2010, 2030],
        labels=False,
    )
else:
    # No construction year available: put every row in the same bucket.
    df['era'] = 0


# Numeric model inputs (raw columns plus the engineered ones).
numeric_features = [
    "bathrooms",
    "garage",
    "log_sqft",
    "log_acres",
    "HOA",
    "sqft_sq",
]

# The raw 'era' bucket is later replaced by era_median (target encoding).
categorical_features = ["era"]

target_col = "sold_price"

required = numeric_features + categorical_features + [target_col]

# Fail fast with a message that names every absent column, rather than the
# less specific KeyError raised by the DataFrame selection below.
_missing_cols = [col for col in required if col not in df.columns]
if _missing_cols:
    raise KeyError(f"Missing required column(s) in input data: {_missing_cols}")

# Create X, y (X is a DataFrame for convenience); both are cast to float and
# re-indexed 0..n-1 so positional and label indexing agree.
X_all = df[numeric_features + categorical_features].astype(float).reset_index(drop=True)
y_all = df[target_col].astype(float).reset_index(drop=True)

# ---------------------------
# 7. Randomized train/test split using NumPy only
# ---------------------------
# Seed the global NumPy RNG so the row permutation (and therefore the split)
# is reproducible from run to run.
np.random.seed(rseed)
indices = np.random.permutation(len(X_all))

# The first TFRAC share of the shuffled positions is train, the rest is test.
train_size = int(TFRAC * len(indices))
train_idx, test_idx = indices[:train_size], indices[train_size:]

# Select rows by position and renumber each subset from 0.
X_train, X_test = (
    X_all.iloc[rows].reset_index(drop=True) for rows in (train_idx, test_idx)
)
y_train, y_test = (
    y_all.iloc[rows].reset_index(drop=True) for rows in (train_idx, test_idx)
)

print("Train/Test sizes:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# ---------------------------
# 8. Era target encoding (use training medians only)
# ---------------------------
# Median training target per era bucket, plus the overall training median
# used as a fallback for buckets that have no entry in the map.
era_median_map = y_train.groupby(X_train['era']).median().to_dict()
global_median = y_train.median()

# For each split: append era_median (looked up from the TRAIN map), remove
# the raw era column, then append the bathrooms * log_sqft interaction.
X_train = (
    X_train
    .assign(era_median=X_train['era'].map(era_median_map).fillna(global_median))
    .drop(columns=['era'])
    .assign(bath_x_logsqft=lambda frame: frame['bathrooms'].fillna(0) * frame['log_sqft'].fillna(0))
)
X_test = (
    X_test
    .assign(era_median=X_test['era'].map(era_median_map).fillna(global_median))
    .drop(columns=['era'])
    .assign(bath_x_logsqft=lambda frame: frame['bathrooms'].fillna(0) * frame['log_sqft'].fillna(0))
)

# ---------------------------
# 9. Feature scaling (fit on train, apply to test)
# ---------------------------
# Column-wise mean and population std (ddof=0) from the training split only;
# a zero std is swapped for 1 so constant columns do not divide by zero.
X_mean = X_train.mean()
X_std = X_train.std(ddof=0).replace(0, 1.0)

X_train_scaled, X_test_scaled = (
    (frame - X_mean) / X_std for frame in (X_train, X_test)
)

# ---------------------------
# 10. Target transform: log1p then standardize (train stats only)
# ---------------------------
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

y_log_mean = y_train_log.mean()
y_log_std = y_train_log.std(ddof=0)
if y_log_std == 0:
    # Constant training target: fall back to unit scale.
    y_log_std = 1.0

# Both splits are standardized with the training mean/std.
y_train_stdlog = (y_train_log - y_log_mean) / y_log_std
y_test_stdlog = (y_test_log - y_log_mean) / y_log_std
# ---------------------------
# 11. Utility metrics (operate on original target scale)
# ---------------------------
def MAE(actual, pred):
    """Return the mean absolute error between two array-likes.

    Both inputs are flattened to 1-D before the element-wise comparison.
    """
    residual = np.asarray(actual).ravel() - np.asarray(pred).ravel()
    return np.abs(residual).mean()

def R2(actual, pred):
    """Return the coefficient of determination of ``pred`` against ``actual``.

    Both inputs are flattened to 1-D first. When ``actual`` has zero total
    variation around its mean the score is undefined and NaN is returned.
    """
    y_true = np.asarray(actual).ravel()
    y_hat = np.asarray(pred).ravel()

    total = np.sum((y_true - y_true.mean()) ** 2)
    if total == 0:
        return np.nan

    residual = np.sum((y_true - y_hat) ** 2)
    return 1 - residual / total

# ---------------------------
# 12. Gradient Descent Linear Regression (NumPy)
#    - operates on standardized log-target (y_train_stdlog)
# ---------------------------
class MVLinearRegression:
    """Linear regression with a bias term, fitted by full-batch gradient descent.

    The quantity minimized is the mean squared error between ``Xb @ w`` and
    the target, where ``Xb`` is the feature matrix with a leading column of
    ones. The step size decays as ``lr0 / (1 + epoch / 5000)``.
    """

    def __init__(self):
        # Weight column of shape (n_features + 1, 1); row 0 is the bias.
        # Remains None until fit() has been called.
        self.w = None

    @staticmethod
    def _with_bias(X_df):
        """Return the features as a 2-D float array with a leading ones column."""
        X = np.asarray(X_df, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"expected a 2-D feature matrix, got array with shape {X.shape}"
            )
        return np.hstack([np.ones((X.shape[0], 1)), X])

    def fit(self, X_df, y_stdlog, lr0=GD_LR0, epochs=GD_EPOCHS, verbose=False):
        """Fit the weights by gradient descent, starting from zeros.

        Args:
            X_df: 2-D features (DataFrame or any array-like), one row per sample.
            y_stdlog: target values (Series or any array-like), one per sample.
            lr0: initial learning rate.
            epochs: number of gradient steps (truncated to int).
            verbose: when True, print the loss roughly five times over the run.

        Returns:
            The fitted model itself, so calls can be chained.

        Raises:
            ValueError: if X_df is not 2-D or its row count differs from the
                number of target values.
        """
        Xb = self._with_bias(X_df)
        yv = np.asarray(y_stdlog, dtype=float).reshape(-1, 1)

        n, d = Xb.shape
        if yv.shape[0] != n:
            # Without this check a length-1 target would broadcast silently.
            raise ValueError(
                f"X has {n} rows but y has {yv.shape[0]} values"
            )

        self.w = np.zeros((d, 1), dtype=float)

        n_epochs = int(epochs)
        log_every = max(1, int(epochs // 5))

        for epoch in range(n_epochs):
            y_pred = Xb @ self.w
            # Gradient of mean((Xb @ w - y)^2) with respect to w.
            grad = (2.0 / n) * (Xb.T @ (y_pred - yv))
            # simple learning rate schedule
            lr = lr0 / (1.0 + epoch / 5000.0)
            self.w -= lr * grad

            if verbose and epoch % log_every == 0:
                # y_pred predates this epoch's update, so this is the loss
                # at the start of the epoch.
                loss = np.mean((y_pred - yv) ** 2)
                print(f"[GD] epoch {epoch}/{int(epochs)} loss={loss:.6e}")

        return self

    def predict(self, X_df):
        """Return predictions for X_df as a flat 1-D array.

        Raises:
            RuntimeError: if called before fit().
            ValueError: if X_df is not 2-D.
        """
        if self.w is None:
            raise RuntimeError("MVLinearRegression.predict() called before fit()")
        return (self._with_bias(X_df) @ self.w).flatten()

# ---------------------------
# 13. Stable Ridge (NumPy closed form)
#    - solves for w in standardized log-target space
# ---------------------------
def fit_ridge(X_df, y_stdlog, lam=1.0, jitter=1e-6):
    """Solve ridge regression in closed form and return the weight column.

    A column of ones is prepended to the features, so row 0 of the result is
    the bias. The ``lam`` penalty is applied to every weight except the bias;
    ``jitter`` is added to the whole diagonal (bias included) to keep the
    linear system well conditioned.

    Args:
        X_df: 2-D features (DataFrame or any array-like), one row per sample.
        y_stdlog: target values (Series or any array-like), one per sample.
        lam: ridge penalty strength.
        jitter: small value added to every diagonal entry of the system matrix.

    Returns:
        ndarray of shape (n_features + 1, 1).

    Raises:
        ValueError: if X_df is not 2-D or its row count differs from the
            number of target values.
        numpy.linalg.LinAlgError: if the system matrix is singular.
    """
    X = np.asarray(X_df, dtype=float)
    if X.ndim != 2:
        raise ValueError(
            f"expected a 2-D feature matrix, got array with shape {X.shape}"
        )
    yv = np.asarray(y_stdlog, dtype=float).reshape(-1, 1)
    if yv.shape[0] != X.shape[0]:
        raise ValueError(f"X has {X.shape[0]} rows but y has {yv.shape[0]} values")

    Xb = np.hstack([np.ones((X.shape[0], 1)), X])

    d = Xb.shape[1]
    penalty = np.eye(d)
    penalty[0, 0] = 0.0  # do not regularize bias

    A = Xb.T @ Xb + lam * penalty + jitter * np.eye(d)
    b = Xb.T @ yv

    return np.linalg.solve(A, b)



# ---------------------------
# 14. Train models
# ---------------------------
# Gradient-descent model, trained on scaled features and the standardized log target.
gd_model = MVLinearRegression()
gd_model.fit(
    X_train_scaled,
    y_train_stdlog,
    lr0=GD_LR0,
    epochs=GD_EPOCHS,
    verbose=GD_VERBOSE,
)

# Closed-form ridge weights; the metrics printed below use the GD model only.
w_ridge = fit_ridge(X_train_scaled, y_train_stdlog, lam=1.0)

# Test-set predictions: standardized log -> log -> original price scale.
y_pred_stdlog_gd = gd_model.predict(X_test_scaled)
y_pred_log_gd = y_log_mean + y_log_std * y_pred_stdlog_gd
y_pred_gd = np.expm1(y_pred_log_gd)

# Same inverse transform for the training set (for reporting).
y_train_pred_stdlog_gd = gd_model.predict(X_train_scaled)
y_train_pred_log_gd = y_log_mean + y_log_std * y_train_pred_stdlog_gd
y_train_pred_gd = np.expm1(y_train_pred_log_gd)


# Report metrics on the original target scale.
print("Train MAE:", MAE(y_train, y_train_pred_gd))
print("Train R2 :", R2(y_train, y_train_pred_gd))
print("Test MAE:", MAE(y_test, y_pred_gd))
print("Test R2 :", R2(y_test, y_pred_gd))






# Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Loaded CSV with shape: (5000, 16)
# Train/Test sizes: (4000, 7) (1000, 7) (4000,) (1000,)
# Train MAE: 149869.5670333072
# Train R2 : 0.3533255734575861
# Test MAE: 147719.6955867197
# Test R2 : 0.32174986073238154
