[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/khetansarvesh/Tabular-Cross-Sectional-Modelling/blob/main/modelling/regression/ANN.ipynb)

In [None]:
import torch
import torch.nn as nn

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# **Using Pytorch Library**

## Data Processing

In [None]:
# Seed PyTorch's global RNG so the random sample data below is identical on every
# Restart & Run All (anything else drawn from the same RNG afterwards becomes repeatable too).
torch.manual_seed(42)

# Sample dataset: 10 rows, each with 5 independent features (x1, x2, x3, x4, x5)
x = torch.randn(10, 5)
x

tensor([[ 4.0891e-01, -4.4023e-01, -8.5300e-01,  9.7805e-01,  5.6652e-02],
        [ 5.5181e-01, -1.3340e+00,  1.2891e+00,  9.7943e-01,  1.3466e-01],
        [-1.6634e+00, -1.0699e+00, -6.9859e-01, -1.6662e+00,  1.3111e+00],
        [-8.4231e-02,  2.6872e-01,  6.7192e-01, -1.3386e+00,  5.7466e-01],
        [ 8.7959e-01, -2.2094e+00,  9.3218e-01,  2.1339e-01, -5.6719e-01],
        [ 8.3101e-01, -2.9186e-01,  6.3879e-01, -7.6856e-01,  1.0273e+00],
        [ 8.6117e-01, -1.3429e+00,  6.0611e-02,  9.8263e-02, -2.2480e-01],
        [-5.7655e-01, -1.7413e-01, -4.1327e-01, -7.0351e-01, -1.1554e+00],
        [ 3.0365e-02, -2.2806e+00, -6.8178e-01, -9.2001e-01, -9.4490e-01],
        [-6.3160e-01, -1.0600e+00, -2.4375e+00,  2.1277e-03, -3.5008e-01]])

In [None]:
# Create the y data: one random target value per row of x, stored as a (10, 1) column
y = torch.randn(10, 1)
y

tensor([[ 0.1520],
        [-2.3906],
        [ 0.7636],
        [-0.4156],
        [-0.3451],
        [ 0.6496],
        [-1.1991],
        [ 0.5058],
        [-1.3132],
        [-0.1949]])

## Modelling

In [None]:
# Feed-forward neural network with one hidden layer, used for regression
class SarveshANN(nn.Module):
  """Single-hidden-layer feed-forward network.

  The layers are held in ``self.model`` (an ``nn.Sequential``) in this order:
  Linear(input_size -> hidden_size), ReLU, Linear(hidden_size -> output_size), Identity.
  The trailing Identity leaves the output unchanged; it marks the slot where a
  different output activation (e.g. nn.Sigmoid() or nn.LeakyReLU()) could be swapped in.

  Args:
    input_size: number of input features (default 5, matching the 5-feature sample data).
    hidden_size: number of units in the hidden layer (default 10).
    output_size: number of output units (default 1).
  """

  def __init__(self, input_size = 5, hidden_size = 10, output_size = 1):
    super().__init__()
    # Layers are created in forward order so parameter initialisation order is unchanged.
    hidden_layer = nn.Linear(input_size, hidden_size)
    output_layer = nn.Linear(hidden_size, output_size)
    self.model = nn.Sequential(
        hidden_layer,
        nn.ReLU(),
        output_layer,
        nn.Identity(),
    )

  def forward(self, x):
    """Run ``x`` through the sequential stack and return the result."""
    return self.model(x)

#Here is an alternative way to define the same kind of model: instead of nn.Sequential, declare the individual layers in the __init__ method and connect them in the forward function. (Note that this variant ends with nn.Sigmoid, whereas SarveshANN above ends with nn.Identity.)
#class MultilayerPerceptron(nn.Module):

#  def __init__(self, input_size = 5, hidden_size = 10, output_size = 1):
#    super(MultilayerPerceptron, self).__init__()
#    self.linear = nn.Linear(input_size, hidden_size)
#    self.relu = nn.ReLU()
#    self.linear2 = nn.Linear(hidden_size, output_size)
#    self.sigmoid = nn.Sigmoid()

#  def forward(self, x):
#    linear = self.linear(x)
#    relu = self.relu(linear)
#    linear2 = self.linear2(relu)
#    output = self.sigmoid(linear2)
#    return output

In [None]:
# Instantiate the network with its default sizes (5 inputs, 10 hidden units, 1 output)
# and print the layer structure.
model = SarveshANN()
print(model)

SarveshANN(
  (model): Sequential(
    (0): Linear(in_features=5, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=5, bias=True)
    (3): ReLU()
    (4): Linear(in_features=5, out_features=1, bias=True)
    (5): Identity()
  )
)


In [None]:
# (name, tensor) pairs for every parameter of the model; model.parameters() returns the
# same tensors without their names. No training step has run yet, so these are the
# initial random values.
list(model.named_parameters())

[('model.0.weight', Parameter containing:
  tensor([[-0.2559,  0.2649, -0.0537, -0.0286,  0.3346],
          [-0.0139,  0.4457,  0.3879,  0.4450, -0.3163],
          [ 0.2954,  0.3767,  0.4416,  0.1291, -0.0380],
          [-0.1830, -0.0354,  0.2354,  0.3366,  0.3842],
          [-0.3997, -0.2525, -0.3182,  0.2405,  0.3490],
          [-0.3522,  0.0324,  0.1795, -0.0540, -0.0234],
          [ 0.3768, -0.0812,  0.3992,  0.3240,  0.4382],
          [ 0.2240,  0.3083,  0.1131,  0.2188,  0.1478],
          [ 0.1752,  0.2188, -0.0049, -0.3777,  0.4376],
          [-0.2581,  0.0958,  0.3011, -0.3008,  0.2002]], requires_grad=True)),
 ('model.0.bias', Parameter containing:
  tensor([-0.2308,  0.1004, -0.3939,  0.1638, -0.0436, -0.1687, -0.0675,  0.2156,
           0.4307, -0.4149], requires_grad=True)),
 ('model.2.weight', Parameter containing:
  tensor([[-0.2661, -0.0350,  0.3118,  0.0334,  0.0921,  0.0074,  0.2894, -0.0901,
            0.0221,  0.2854],
          [-0.2084, -0.1682, -0.2330,

## Training

In [None]:
# Define the optimizer: Adam over all of the model's parameters.
# torch.optim is reachable through the top-level `import torch`, so no extra
# mid-notebook import is needed here.
adam = torch.optim.Adam(model.parameters(), lr=1e-1)

# Define loss using a predefined loss function (mean squared error)
loss_function = nn.MSELoss()

In [None]:
# Full-batch training: each of the 10 epochs does one forward and one backward pass over all of x.
for epoch in range(10):
    # Clear the gradients left over from the previous step.
    adam.zero_grad()

    # Forward propagation and loss against the targets.
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    print(f"Epoch {epoch}: traing loss: {loss}")

    # Backward propagation fills in the gradient of the loss for every parameter.
    loss.backward()

    # Let the optimizer apply one update to the weights.
    adam.step()


Epoch 0: traing loss: 0.46382957696914673
Epoch 1: traing loss: 0.41810521483421326
Epoch 2: traing loss: 0.33339884877204895
Epoch 3: traing loss: 0.2539175748825073
Epoch 4: traing loss: 0.2533775269985199
Epoch 5: traing loss: 0.18649843335151672
Epoch 6: traing loss: 0.14389298856258392
Epoch 7: traing loss: 0.10955234616994858
Epoch 8: traing loss: 0.07010544836521149
Epoch 9: traing loss: 0.05134958028793335


In [None]:
# Parameter tensors after the 10 training epochs above (compare with the values listed before training).
list(model.parameters())

[Parameter containing:
 tensor([[ 0.0919,  0.4043,  0.0661, -0.2225,  0.8393],
         [-0.1179,  1.2639,  0.4499, -0.0146, -0.3028],
         [ 0.0106,  0.4343,  0.2046, -0.2028,  0.1867],
         [ 0.0155, -0.7481,  0.4118, -0.2486, -0.0780],
         [-0.4846, -0.0452, -0.1402,  0.1874,  0.0923],
         [-0.0870, -0.1877, -0.2299, -0.1982,  0.5085],
         [-0.1209,  0.0378,  0.5342,  0.3766,  0.1330],
         [ 0.1514,  0.9357,  0.3862, -0.3541,  0.0459],
         [ 0.4894, -0.0513,  0.0158, -0.6740,  0.0521],
         [-0.3115, -0.4785,  0.4461, -0.8579,  0.1322]], requires_grad=True),
 Parameter containing:
 tensor([-0.0245,  0.6163, -0.4797,  0.7681, -0.2806, -0.5909, -0.3918,  0.6035,
          0.5538, -0.0950], requires_grad=True),
 Parameter containing:
 tensor([[-0.8311, -0.2224,  0.0213, -0.0971, -0.0278,  0.3614,  0.2324, -0.2281,
          -0.5414,  0.0140],
         [-0.2084, -0.1682, -0.2330, -0.1889, -0.2143, -0.1303, -0.1252, -0.1246,
          -0.3036,  0.0225

## Inference

In [None]:
# See how our model performs on the training data.
# Inference does not need gradients, so run the forward pass under torch.no_grad():
# the result is a plain tensor with no autograd graph attached.
with torch.no_grad():
    y_pred = model(x)
y_pred

tensor([[ 0.9180],
        [-0.5824],
        [-0.5866],
        [ 0.2653],
        [ 0.9249],
        [ 0.0908],
        [ 1.2653],
        [-1.4679],
        [-0.4246],
        [ 0.1427]], grad_fn=<AddmmBackward>)

# **Without Using Pytorch Library**

## Data Preprocessing

In [None]:
  # Build a synthetic classification dataset: 1000 samples, 2 features (both informative),
  # with a fixed random_state so the same data is generated every time.
  X, y = make_classification(
      n_samples=1000,
      n_features=2,
      n_redundant=0,
      n_informative=2,
      n_clusters_per_class=1,
      random_state=42,
  )

  # Turn the labels into a column vector (one row per sample)
  y = y.reshape(-1, 1)

  # Hold out 20% of the rows as a test set (fixed random_state for a repeatable split)
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.2, random_state=42
  )

## Modelling

In [None]:
class SingleHiddenLayerNN:
    """One-hidden-layer neural network written with NumPy only.

    Both the hidden and the output layer use a sigmoid activation and the loss is
    binary cross-entropy. Training is plain full-batch gradient descent.

    Labels may be passed either as a column of shape (m, 1) or as a flat vector of
    shape (m,); flat vectors are converted to a column internally so they cannot
    broadcast against the (m, 1) predictions into an (m, m) matrix.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01, seed=None):
        """Create the network and randomly initialise its weights.

        Args:
            input_size: number of input features.
            hidden_size: number of hidden units.
            output_size: number of output units.
            learning_rate: step size used by update_weights().
            seed: optional seed for the weight initialisation. With the default
                (None) the weights are drawn from NumPy's global RNG, so
                np.random.seed(...) in the caller still controls them.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # A private generator is used only when a seed is given; otherwise fall back
        # to the global np.random functions.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Initialize weights with small random values and biases with zeros
        # W1: weights from input to hidden layer (input_size x hidden_size)
        # b1: biases for hidden layer (1 x hidden_size)
        # W2: weights from hidden to output layer (hidden_size x output_size)
        # b2: biases for output layer (1 x output_size)
        self.W1 = rng.randn(self.input_size, self.hidden_size) * 0.1
        self.b1 = np.zeros((1, self.hidden_size))
        self.W2 = rng.randn(self.hidden_size, self.output_size) * 0.1
        self.b2 = np.zeros((1, self.output_size))

        # Store gradients for analysis (filled in by backward_pass)
        self.gradients = {}

    @staticmethod
    def _as_column(y):
        """Return y as an array, reshaping a flat (m,) vector into an (m, 1) column."""
        y = np.asarray(y)
        return y.reshape(-1, 1) if y.ndim == 1 else y

    def sigmoid(self, z):
        """Sigmoid activation function"""
        z = np.clip(z, -500, 500) # Clip z to prevent overflow
        return 1 / (1 + np.exp(-z))

    def sigmoid_derivative(self, a):
        """Derivative of sigmoid, expressed in terms of the sigmoid output a"""
        return a * (1 - a)

    def forward_pass(self, X):
        """Compute the network output for X and cache the intermediate values.

        z1, a1, z2 and a2 are stored on the instance because backward_pass reads a1.
        """
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def compute_loss(self, y_true, y_pred):
        """Compute binary cross-entropy loss, averaged over the rows of y_true"""
        y_true = self._as_column(y_true)
        m = y_true.shape[0]
        # Clip predictions to prevent log(0)
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)) / m
        return loss

    def backward_pass(self, X, y_true, y_pred):
        """Compute gradients of the loss for all weights and biases.

        Must be called after forward_pass(X), because it uses the cached hidden
        activations self.a1. The gradients are stored in self.gradients and also
        returned as (dW1, db1, dW2, db2).
        """
        y_true = self._as_column(y_true)
        m = X.shape[0]  # Number of training examples

        # Output layer gradients
        # dL/dz2 = a2 - y (for sigmoid + binary cross-entropy)
        dz2 = y_pred - y_true

        # Gradients for W2 and b2
        # dL/dW2 = (1/m) * a1^T @ dz2
        # dL/db2 = (1/m) * sum(dz2, axis=0)
        dW2 = np.dot(self.a1.T, dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # Hidden layer gradients
        # dL/da1 = dz2 @ W2^T
        da1 = np.dot(dz2, self.W2.T)

        # dL/dz1 = dL/da1 * sigmoid'(z1) = da1 * a1 * (1 - a1)
        dz1 = da1 * self.sigmoid_derivative(self.a1)

        # Gradients for W1 and b1
        # dL/dW1 = (1/m) * X^T @ dz1
        # dL/db1 = (1/m) * sum(dz1, axis=0)
        dW1 = np.dot(X.T, dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        # Store gradients
        self.gradients = {
            'dW2': dW2, 'db2': db2,
            'dW1': dW1, 'db1': db1
        }

        return dW1, db1, dW2, db2

    def update_weights(self):
        """Update weights using gradient descent (uses the gradients from the last backward_pass)"""
        self.W1 -= self.learning_rate * self.gradients['dW1']
        self.b1 -= self.learning_rate * self.gradients['db1']
        self.W2 -= self.learning_rate * self.gradients['dW2']
        self.b2 -= self.learning_rate * self.gradients['db2']

    def train(self, X, y, epochs=1000, verbose=True):
        """Train the neural network with full-batch gradient descent.

        Args:
            X: input rows, one sample per row.
            y: labels, shape (m, 1) or (m,).
            epochs: number of gradient-descent steps.
            verbose: when True, print loss and accuracy every 100 epochs.

        Returns:
            List with the loss of every epoch (measured before that epoch's update).
        """
        y = self._as_column(y)
        losses = []

        for epoch in range(epochs):
            # Forward pass
            y_pred = self.forward_pass(X)

            # Compute loss
            loss = self.compute_loss(y, y_pred)
            losses.append(loss)

            # Backward pass
            self.backward_pass(X, y, y_pred)

            # Update weights
            self.update_weights()

            # Print progress
            if verbose and epoch % 100 == 0:
                accuracy = self.compute_accuracy(y, y_pred)
                print(f"Epoch {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

        return losses

    def predict(self, X):
        """Make predictions (the sigmoid outputs of the network)"""
        return self.forward_pass(X)

    def predict_binary(self, X):
        """Make binary predictions (0 or 1) by thresholding the outputs at 0.5"""
        probabilities = self.predict(X)
        return (probabilities > 0.5).astype(int)

    def compute_accuracy(self, y_true, y_pred):
        """Compute classification accuracy of thresholded predictions against y_true"""
        y_true = self._as_column(y_true)
        binary_pred = (y_pred > 0.5).astype(int)
        return np.mean(binary_pred == y_true)

## Training

In [None]:
nn = SingleHiddenLayerNN(input_size=2, hidden_size=4, output_size=1, learning_rate=0.1)

# Train the model
losses = nn.train(X_train, y_train, epochs=1000, verbose=True)

## Inference

In [None]:
def plot_decision_boundary(model, X, y):
    """Plot the decision boundary of the model"""
    h = 0.1
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                        np.arange(y_min, y_max, h))

    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(mesh_points)
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, levels=50, alpha=0.8, cmap=plt.cm.RdYlBu)
    scatter = plt.scatter(X[:, 0], X[:, 1], c=y.ravel(), cmap=plt.cm.RdYlBu, edgecolors='black')
    plt.colorbar(scatter)

In [None]:
# Test the model
train_pred = nn.predict(X_train)
test_pred = nn.predict(X_test)

train_accuracy = nn.compute_accuracy(y_train, train_pred)
test_accuracy = nn.compute_accuracy(y_test, test_pred)

print(f"\nFinal Results:")
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Plot training loss
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)

# Plot decision boundary
plt.subplot(1, 2, 2)
plot_decision_boundary(nn, X_test, y_test)
plt.title('Decision Boundary')

plt.tight_layout()
plt.show()