In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import time

# Load the California housing dataset
california = fetch_california_housing()
X = pd.DataFrame(california.data, columns=california.feature_names)
y = pd.Series(california.target)

# Step 2: Preprocessing
# Split the dataset into training and testing sets FIRST. Every statistic used
# for preprocessing below (imputation means, scaling mean/std) must be learned
# from the training rows only; fitting them on the full dataset would leak
# information about the test rows into training and bias the evaluation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handling missing values (not applicable here, but let's include a step).
# The fill values come from the training split and are reused for the test
# split and for the full frame.
train_means = X_train_raw.mean()
X_train_raw = X_train_raw.fillna(train_means)
X_test_raw = X_test_raw.fillna(train_means)
X.fillna(train_means, inplace=True)

# Normalize the features: fit on the training split, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full dataset in the training-derived scale, kept so that any code that
# refers to `X_scaled` continues to work.
X_scaled = scaler.transform(X)

# Step 3: Building Linear Regression from Scratch
class LinearRegressionScratch:
    """Linear regression fitted with batch gradient descent on the MSE loss.

    The model is ``y_hat = theta[0] + X @ theta[1:]``; ``theta[0]`` is the
    bias (intercept) and the remaining entries are the feature weights.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to the gradient at every iteration.
    n_iterations : int, default 1000
        Number of full-batch gradient descent steps performed by ``fit``.
    random_state : int or None, default None
        Seed for the random initialization of ``theta``. With ``None`` the
        global NumPy random state is used (the historical behaviour), so
        results differ between runs unless the caller seeds NumPy.

    Attributes
    ----------
    theta : numpy.ndarray or None
        Learned parameters of shape ``(n_features + 1,)``; ``None`` until
        ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000, random_state=None):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.random_state = random_state
        self.theta = None

    @staticmethod
    def _add_bias(X):
        """Return ``X`` as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Estimate ``theta`` from training data.

        Parameters
        ----------
        X : array-like of shape (m, n_features)
            Training features.
        y : array-like of shape (m,) or (m, 1)
            Training targets.

        Returns
        -------
        LinearRegressionScratch
            The fitted model itself, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` disagree on the sample count.
        """
        X_b = self._add_bias(X)
        # Flatten the target: a column vector of shape (m, 1) would otherwise
        # broadcast against the (m,) predictions into an (m, m) matrix and
        # break the parameter update.
        y = np.asarray(y, dtype=float).ravel()

        m = X_b.shape[0]
        if m == 0:
            raise ValueError("fit() requires at least one training example")
        if y.shape[0] != m:
            raise ValueError(
                f"X and y have inconsistent sample counts: {m} != {y.shape[0]}"
            )

        # Random initialization; seeded only when the caller asked for it.
        n_params = X_b.shape[1]
        if self.random_state is None:
            self.theta = np.random.randn(n_params)
        else:
            self.theta = np.random.RandomState(self.random_state).randn(n_params)

        # Gradient descent on MSE: grad = (2/m) * X_b^T (X_b theta - y)
        for _ in range(self.n_iterations):
            residuals = X_b.dot(self.theta) - y
            gradients = (2 / m) * X_b.T.dot(residuals)
            self.theta -= self.learning_rate * gradients

        return self

    def predict(self, X):
        """Return predictions for ``X`` using the fitted ``theta``.

        Parameters
        ----------
        X : array-like of shape (m, n_features)
            Samples to predict.

        Returns
        -------
        numpy.ndarray
            Predicted targets of shape ``(m,)``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.theta is None:
            raise RuntimeError("predict() called before fit()")
        return self._add_bias(X).dot(self.theta)

# Train Linear Regression from Scratch
# Only fit() is timed: the value is reported as "Training Time", so prediction
# must not be included. perf_counter() is used because it is a monotonic,
# high-resolution clock intended for measuring short intervals, unlike
# time.time(), which follows the (adjustable) wall clock.
model_scratch = LinearRegressionScratch(learning_rate=0.01, n_iterations=1000)
start_time = time.perf_counter()
model_scratch.fit(X_train, y_train)
end_time = time.perf_counter()
y_pred_scratch = model_scratch.predict(X_test)

# Calculate metrics for the scratch model
mse_scratch = mean_squared_error(y_test, y_pred_scratch)
r2_scratch = r2_score(y_test, y_pred_scratch)

print(f"Scratch Model - MSE: {mse_scratch:.2f}, R²: {r2_scratch:.2f}")
print(f"Training Time for Scratch Model: {end_time - start_time:.4f} seconds")

# Step 4: Building the Same Model Using Standard Libraries
# Timed exactly like the scratch model (fit only) so the two durations are
# directly comparable.
model_sklearn = LinearRegression()
start_time = time.perf_counter()
model_sklearn.fit(X_train, y_train)
end_time = time.perf_counter()
y_pred_sklearn = model_sklearn.predict(X_test)

# Calculate metrics for the sklearn model
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
r2_sklearn = r2_score(y_test, y_pred_sklearn)

print(f"Sklearn Model - MSE: {mse_sklearn:.2f}, R²: {r2_sklearn:.2f}")
print(f"Training Time for Sklearn Model: {end_time - start_time:.4f} seconds")

# Step 5: Comparison of Results
# Collect the side-by-side summary lines, then write them out in one call.
comparison_report = [
    "\nComparison of Scratch vs Sklearn Model:",
    f"Mean Squared Error: Scratch = {mse_scratch:.2f}, Sklearn = {mse_sklearn:.2f}",
    f"R² Score: Scratch = {r2_scratch:.2f}, Sklearn = {r2_sklearn:.2f}",
]
print("\n".join(comparison_report))


Scratch Model - MSE: 0.55, R²: 0.58
Training Time for Scratch Model: 1.4206 seconds
Sklearn Model - MSE: 0.56, R²: 0.58
Training Time for Sklearn Model: 0.0525 seconds

Comparison of Scratch vs Sklearn Model:
Mean Squared Error: Scratch = 0.55, Sklearn = 0.56
R² Score: Scratch = 0.58, Sklearn = 0.58
