In [10]:
!pip install numpy pandas matplotlib scikit-learn torch wandb



In [12]:
from sklearn.datasets import fetch_california_housing, load_iris
import pandas as pd

# Load datasets
california = fetch_california_housing()
iris = load_iris()

# Convert to DataFrames (for easier handling)
california_df = pd.DataFrame(california.data, columns=california.feature_names)
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

print("California Housing Dataset:\n", california_df.head())
print("\nIris Dataset (Classification):\n", iris_df.head())


California Housing Dataset:
    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

Iris Dataset (Classification):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4          

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# Prepare data
X = california.data
y = california.target

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (critical for Gradient Descent)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Add bias term (intercept)
X_train = np.c_[np.ones(X_train.shape[0]), X_train]
X_test = np.c_[np.ones(X_test.shape[0]), X_test]

In [16]:
def batch_gradient_descent(X, y, learning_rate=0.01, epochs=1000):
    theta = np.zeros(X.shape[1])  # Initialize weights
    m = len(y)                   # Number of samples

    for epoch in range(epochs):
        gradient = (1/m) * X.T.dot(X.dot(theta) - y)
        theta -= learning_rate * gradient
        if epoch % 100 == 0:
            loss = np.mean((X.dot(theta) - y) ** 2)
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
    return theta

theta_batch = batch_gradient_descent(X_train, y_train)

Epoch 0, Loss: 5.5300
Epoch 100, Loss: 1.2906
Epoch 200, Loss: 0.7102
Epoch 300, Loss: 0.6187
Epoch 400, Loss: 0.5952
Epoch 500, Loss: 0.5827
Epoch 600, Loss: 0.5732
Epoch 700, Loss: 0.5652
Epoch 800, Loss: 0.5584
Epoch 900, Loss: 0.5526


In [17]:
def stochastic_gradient_descent(X, y, learning_rate=0.01, epochs=10):
    theta = np.zeros(X.shape[1])
    m = len(y)

    for epoch in range(epochs):
        for i in range(m):
            random_idx = np.random.randint(m)
            xi = X[random_idx:random_idx+1]
            yi = y[random_idx:random_idx+1]
            gradient = xi.T.dot(xi.dot(theta) - yi)
            theta -= learning_rate * gradient
        loss = np.mean((X.dot(theta) - y) ** 2)
        print(f"Epoch {epoch}, Loss: {loss:.4f}")
    return theta

theta_sgd = stochastic_gradient_descent(X_train, y_train)

Epoch 0, Loss: 83089740333437.4219
Epoch 1, Loss: 2709436668474954240.0000
Epoch 2, Loss: 215500783293900110521760038277611520.0000
Epoch 3, Loss: 5995712062525167513402413664527320642922090384236612681728.0000
Epoch 4, Loss: 2484570317900802399534651676835908119936723980243246134088171520.0000
Epoch 5, Loss: 930383876453167826657065601701807422264932130995711307430726734543295067717632.0000
Epoch 6, Loss: 30048321488197100396717198290278676547118724460425136997438895340525623095026107023360.0000
Epoch 7, Loss: 7871242120461777333456377121059425376044538412441013149239536872363890748914736792247331717120.0000
Epoch 8, Loss: 629935055275031152277882001235560250448982095580445708685398469455531333000268831005204092876749327313141760.0000
Epoch 9, Loss: 714130612631934034727119933884755492635571242658701894496049168094613467488996845844532227870031599862950396816451843391488.0000


In [18]:
def mini_batch_gradient_descent(X, y, learning_rate=0.01, epochs=100, batch_size=32):
    theta = np.zeros(X.shape[1])
    m = len(y)

    for epoch in range(epochs):
        shuffled_indices = np.random.permutation(m)
        X_shuffled = X[shuffled_indices]
        y_shuffled = y[shuffled_indices]

        for i in range(0, m, batch_size):
            xi = X_shuffled[i:i+batch_size]
            yi = y_shuffled[i:i+batch_size]
            gradient = (1/batch_size) * xi.T.dot(xi.dot(theta) - yi)
            theta -= learning_rate * gradient
        loss = np.mean((X.dot(theta) - y) ** 2)
        print(f"Epoch {epoch}, Loss: {loss:.4f}")
    return theta

theta_minibatch = mini_batch_gradient_descent(X_train, y_train)

Epoch 0, Loss: 0.5829
Epoch 1, Loss: 0.5464
Epoch 2, Loss: 0.5327
Epoch 3, Loss: 0.5262
Epoch 4, Loss: 0.5240
Epoch 5, Loss: 0.5200
Epoch 6, Loss: 0.5250
Epoch 7, Loss: 0.5228
Epoch 8, Loss: 0.5408
Epoch 9, Loss: 0.5187
Epoch 10, Loss: 0.5281
Epoch 11, Loss: 0.5196
Epoch 12, Loss: 0.5211
Epoch 13, Loss: 0.5198
Epoch 14, Loss: 0.5182
Epoch 15, Loss: 0.5189
Epoch 16, Loss: 0.5186
Epoch 17, Loss: 0.5278
Epoch 18, Loss: 0.5190
Epoch 19, Loss: 0.5210
Epoch 20, Loss: 0.5192
Epoch 21, Loss: 0.5197
Epoch 22, Loss: 0.5200
Epoch 23, Loss: 0.5205
Epoch 24, Loss: 0.5181
Epoch 25, Loss: 0.5211
Epoch 26, Loss: 0.5198
Epoch 27, Loss: 0.5287
Epoch 28, Loss: 0.5185
Epoch 29, Loss: 0.5241
Epoch 30, Loss: 0.5272
Epoch 31, Loss: 0.5189
Epoch 32, Loss: 0.5183
Epoch 33, Loss: 0.5184
Epoch 34, Loss: 0.5228
Epoch 35, Loss: 0.5221
Epoch 36, Loss: 0.5216
Epoch 37, Loss: 0.5187
Epoch 38, Loss: 0.5193
Epoch 39, Loss: 0.5199
Epoch 40, Loss: 0.5291
Epoch 41, Loss: 0.5198
Epoch 42, Loss: 0.5256
Epoch 43, Loss: 0.520

In [19]:
def batch_gd_with_regularization(X, y, learning_rate=0.01, epochs=1000, lambda_=0.1, reg_type='l2'):
    theta = np.zeros(X.shape[1])
    m = len(y)

    for epoch in range(epochs):
        gradient = (1/m) * X.T.dot(X.dot(theta) - y)
        if reg_type == 'l2':
            gradient += lambda_ * theta  # L2 penalty
        theta -= learning_rate * gradient
        if epoch % 100 == 0:
            loss = np.mean((X.dot(theta) - y) ** 2)
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
    return theta

theta_l2 = batch_gd_with_regularization(X_train, y_train, reg_type='l2')

Epoch 0, Loss: 5.5300
Epoch 100, Loss: 1.3881
Epoch 200, Loss: 0.8039
Epoch 300, Loss: 0.6892
Epoch 400, Loss: 0.6550
Epoch 500, Loss: 0.6402
Epoch 600, Loss: 0.6318
Epoch 700, Loss: 0.6262
Epoch 800, Loss: 0.6220
Epoch 900, Loss: 0.6188


In [20]:
def batch_gd_with_early_stopping(X, y, learning_rate=0.01, epochs=1000, patience=5):
    theta = np.zeros(X.shape[1])
    m = len(y)
    best_loss = np.inf
    patience_counter = 0

    for epoch in range(epochs):
        gradient = (1/m) * X.T.dot(X.dot(theta) - y)
        theta -= learning_rate * gradient
        loss = np.mean((X.dot(theta) - y) ** 2)

        if loss < best_loss:
            best_loss = loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
    return theta

theta_early_stop = batch_gd_with_early_stopping(X_train, y_train)

Epoch 0, Loss: 5.5300
Epoch 100, Loss: 1.2906
Epoch 200, Loss: 0.7102
Epoch 300, Loss: 0.6187
Epoch 400, Loss: 0.5952
Epoch 500, Loss: 0.5827
Epoch 600, Loss: 0.5732
Epoch 700, Loss: 0.5652
Epoch 800, Loss: 0.5584
Epoch 900, Loss: 0.5526


In [22]:
def evaluate_model(theta, X_test, y_test):
    predictions = X_test.dot(theta)
    mse = np.mean((predictions - y_test) ** 2)
    print(f"Mean Squared Error (MSE): {mse:.4f}")

print("Batch GD Performance:")
evaluate_model(theta_batch, X_test, y_test)

print("\nSGD Performance:")
evaluate_model(theta_sgd, X_test, y_test)

print("\nMini-Batch GD Performance:")
evaluate_model(theta_minibatch, X_test, y_test)

Batch GD Performance:
Mean Squared Error (MSE): 0.5672

SGD Performance:
Mean Squared Error (MSE): 16165022897498629495739415829484114686291783232081008015506160928527625595162409208392602391268243922001037672060521807872.0000

Mini-Batch GD Performance:
Mean Squared Error (MSE): 0.5769


In [23]:
import joblib

# Save the model and scaler
joblib.dump(theta_batch, 'batch_gd_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Verify files are saved
!ls

batch_gd_model.pkl  ml_env  sample_data  scaler.pkl
