In [4]:
import numpy as np
import torch

In [8]:
# Fix the global NumPy seed so every run draws the same synthetic dataset (reproducible).
np.random.seed(45)
num_samples = 40

# Inputs: num_samples points drawn uniformly from [-1, 1)
x1 = np.random.uniform(low=-1, high=1, size=num_samples)

# Noise-free target, i.e. the underlying line f(x) = 3x + 4
f_x = 3*x1 + 4

# Observations: the underlying line corrupted by standard-normal noise
eps = np.random.randn(num_samples)
y = f_x + eps

# Ques 1
Use ```torch.autograd``` to find the true gradient on the above dataset using linear regression (in the form $\theta_1x + \theta_0$) for any given values of $(\theta_0,\theta_1)$.

# Using `torch.autograd` :
* When operations are performed on tensors that have `requires_grad=True` (here `theta_0` and `theta_1`), PyTorch builds a computation graph. Each resulting tensor records the operation that created it and links back to the tensors it was computed from.
*  When `loss.backward()` is called, PyTorch traverses this computation graph from the output (`loss`) to the input (`theta_0, theta_1`) and computes the gradient of the loss with respect to each parameter that has `requires_grad=True`, using Chain Rule.
* After calling `.backward()`, the gradients for `theta_0` and `theta_1` are stored in their `.grad` attributes.

In [40]:
import torch

# Setting random seed for reproducibility
torch.manual_seed(45)

# Converting the data to tensors.
# x and y are fixed observations, not quantities we differentiate with respect to,
# so they keep the default requires_grad=False. Marking them as requiring grad
# would make autograd compute and store d(loss)/dx and d(loss)/dy as well,
# which is wasted work and clutters the computation graph.
x1_tensor = torch.tensor(x1, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Initializing parameters theta_0 and theta_1 randomly.
# These are the only tensors tracked by autograd (requires_grad=True), so
# after a backward pass the gradients end up in theta_0.grad / theta_1.grad.
theta_0 = torch.rand(1, dtype=torch.float32, requires_grad=True)  # Uniformly distributed between [0, 1)
theta_1 = torch.randn(1, dtype=torch.float32, requires_grad=True) # Normally distributed mean=0, std=1

# Linear hypothesis: theta_1 * x + theta_0
def linear_model(x):
    """Evaluate the line theta_1 * x + theta_0 at x.

    The slope `theta_1` and intercept `theta_0` are read from the enclosing
    (notebook-level) namespace at call time; `x` is broadcast against them.
    """
    slope_term = theta_1 * x
    return slope_term + theta_0

# Loss function: Mean Squared Error
def mse_loss(y_pred, y_true):
    """Return the mean of the element-wise squared differences y_pred - y_true.

    Both arguments are tensors; the result is a zero-dimensional tensor that
    stays attached to the autograd graph of its inputs.
    """
    residual = y_pred - y_true
    squared_error = residual ** 2
    return torch.mean(squared_error)

# Forward pass over the whole dataset at the current (theta_0, theta_1)
y_pred = linear_model(x1_tensor)

# Full-batch mean squared error
loss = mse_loss(y_pred, y_tensor)

# Backward pass: autograd fills theta_0.grad and theta_1.grad with d(loss)/d(theta)
loss.backward()

# Report the true (full-batch) gradient and the loss value
print(
    f"Gradient of theta_0: {theta_0.grad.item():.4f}",
    f"Gradient of theta_1: {theta_1.grad.item():.4f}",
    f"Loss: {loss.item():.4f}",
    sep="\n",
)

Gradient of theta_0: -7.0330
Gradient of theta_1: -1.2066
Loss: 15.5923


# Ques 2
Using the same $(\theta_0,\theta_1)$ as above, calculate the stochastic gradient for all points in the dataset. Then, find the average of all those gradients and show that the stochastic gradient is a good estimate of the true gradient.

In [41]:
torch.manual_seed(45)
# Initializing parameters theta_0 and theta_1 same as above: re-seeding and drawing
# in the same order (rand, then randn) reproduces the values used for the true gradient.
theta_0_sgd = torch.rand(1, dtype=torch.float32, requires_grad=True)  # Uniformly distributed between [0, 1)
theta_1_sgd = torch.randn(1, dtype=torch.float32, requires_grad=True) # Normally distributed mean=0, std=1

# Storing the gradients for each point
theta_0_grads = []
theta_1_grads = []

# Looping over each point in the dataset
for i in range(len(x1_tensor)):
    # Single data point (stochastic step)
    x_i = x1_tensor[i]
    y_i = y_tensor[i]

    # Forward pass (single point prediction)
    y_pred_i = theta_1_sgd * x_i + theta_0_sgd

    # Computing the loss for this single point
    loss_i = (y_pred_i - y_i) ** 2

    # torch.autograd.grad returns the gradients of loss_i w.r.t. the listed
    # parameters as fresh tensors instead of accumulating them into `.grad`.
    # Each iteration is therefore independent: no manual zeroing is needed and
    # no stale gradient from a previous sample can leak into the next one.
    grad_theta_0_i, grad_theta_1_i = torch.autograd.grad(
        loss_i, (theta_0_sgd, theta_1_sgd)
    )

    # Storing the gradients for theta_0 and theta_1
    theta_0_grads.append(grad_theta_0_i.item())
    theta_1_grads.append(grad_theta_1_i.item())

# Calculating the average gradients
theta_0_avg_grad = np.mean(np.array(theta_0_grads))
theta_1_avg_grad = np.mean(np.array(theta_1_grads))

# Printing the average stochastic gradients
print(f"Average stochastic gradient of theta_0: {theta_0_avg_grad:.4f}")
print(f"Average stochastic gradient of theta_1: {theta_1_avg_grad:.4f}")

# Comparing with true gradient from the previous batch gradient computation
print(f"True gradient of theta_0: {theta_0.grad.item():.4f}")
print(f"True gradient of theta_1: {theta_1.grad.item():.4f}")

# Numerical check of the claim: the mean of the per-sample gradients equals the
# full-batch gradient up to float32 rounding. Placed after the prints so the
# values are still shown if the check ever fails (e.g. cells run out of order).
np.testing.assert_allclose(
    [theta_0_avg_grad, theta_1_avg_grad],
    [theta_0.grad.item(), theta_1.grad.item()],
    rtol=1e-4,
    atol=1e-5,
)

Average stochastic gradient of theta_0: -7.0330
Average stochastic gradient of theta_1: -1.2066
True gradient of theta_0: -7.0330
True gradient of theta_1: -1.2066


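* As the printed values show, the average of the per-sample (stochastic) gradients matches the true full-batch gradient for both `theta_0` and `theta_1` at the randomly initialized parameters. This is expected: the MSE loss is the mean of the per-sample squared errors, so by linearity its gradient is the mean of the per-sample gradients. Each individual stochastic gradient is therefore an unbiased (though noisy) estimate of the true gradient, which indicates that the stochastic gradient provides a good estimate of the true gradient.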
* Therefore, stochastic gradient descent (SGD) serves as an effective method for estimating the gradient and optimizing parameters, especially when computational efficiency is needed for large datasets.

# Using `numpy`

In [45]:
import numpy as np

# Dataset size
n = len(x1)

# Re-seed NumPy's global generator for reproducibility
np.random.seed(45)

# Draw the parameters: theta_0 ~ U[0, 1), theta_1 ~ N(0, 1), both stored as float32
theta_0_sgd = np.random.rand(1).astype(np.float32)
theta_1_sgd = np.random.randn(1).astype(np.float32)

# Predictions of the linear model on every sample
y_pred = theta_1_sgd * x1 + theta_0_sgd

# Closed-form MSE gradient (chain rule applied by hand):
#   dL/dtheta_0 = -(2/n) * sum(y - y_pred)
#   dL/dtheta_1 = -(2/n) * sum((y - y_pred) * x)
residual = y - y_pred
grad_theta_0 = -2/n * np.sum(residual)
grad_theta_1 = -2/n * np.sum(residual * x1)

# Report the manually derived gradients
print(
    f"Gradient of theta_0 (manual calculation): {grad_theta_0:.4f}",
    f"Gradient of theta_1 (manual calculation): {grad_theta_1:.4f}",
    sep="\n",
)

Gradient of theta_0 (manual calculation): -5.1731
Gradient of theta_1 (manual calculation): -2.2966


In [48]:
# Re-seed NumPy's global generator so the parameters below are drawn identically
# to the manual full-batch computation
np.random.seed(45)

# Draw the parameters: theta_0 ~ U[0, 1), theta_1 ~ N(0, 1), both stored as float32
theta_0_sgd = np.random.rand(1).astype(np.float32)
theta_1_sgd = np.random.randn(1).astype(np.float32)

# One gradient entry per data point
theta_0_grads = []
theta_1_grads = []

# Visit the samples one at a time
for x_i, y_i in zip(x1, y):
    # Model prediction for this single point
    y_pred_i = theta_1_sgd * x_i + theta_0_sgd

    # Gradient of the single-sample squared error (y_i - y_pred_i)**2:
    #   d/dtheta_0 = -2 * (y_i - y_pred_i)
    #   d/dtheta_1 = -2 * (y_i - y_pred_i) * x_i
    residual_i = y_i - y_pred_i
    grad_theta_0_i = -2 * residual_i
    grad_theta_1_i = grad_theta_0_i * x_i

    theta_0_grads.append(grad_theta_0_i)
    theta_1_grads.append(grad_theta_1_i)

# Average the per-sample gradients over the dataset
grad_theta_0_sgd_avg = np.mean(theta_0_grads)
grad_theta_1_sgd_avg = np.mean(theta_1_grads)

# Show the averaged stochastic gradients next to the full-batch ("true")
# gradients computed manually in the previous cell
print(
    f"Average stochastic gradient of theta_0: {grad_theta_0_sgd_avg:.4f}",
    f"Average stochastic gradient of theta_1: {grad_theta_1_sgd_avg:.4f}",
    f"True gradient of theta_0: {grad_theta_0:.4f}",
    f"True gradient of theta_1: {grad_theta_1:.4f}",
    sep="\n",
)

Average stochastic gradient of theta_0: -5.1731
Average stochastic gradient of theta_1: -2.2966
True gradient of theta_0: -5.1731
True gradient of theta_1: -2.2966


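* The results obtained using `numpy` and the chain-rule formulas show the same behaviour as those derived with `torch.autograd`: in both cases the average of the per-sample gradients equals the full-batch gradient. Note that the numerical values themselves differ between the two sections ($-5.1731, -2.2966$ with `numpy` versus $-7.0330, -1.2066$ with PyTorch) because $(\theta_0, \theta_1)$ are drawn from NumPy's random generator here and from PyTorch's generator above, so the gradients are evaluated at different parameter values. Evaluated at the same $(\theta_0, \theta_1)$, the manual chain-rule computation and PyTorch's automatic differentiation compute the same quantity.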