# Neural Network with Logistic Regression, from shallow to deep

<a name='1'></a>

Let's build a shallow neural network (NN), i.e., one without a hidden layer.

We will build it from scratch and use scikit-learn only to deal with the data split and normalization.

In the end, we will build the same NN using PyTorch and Tensorflow.


**Shallow Neural Network representation**

<img src="Images_Notebooks/ShallowNeuralNet.png" style="width:650px;height:400px;">


**Key steps**:
    - Initialize the weights (same dimension as the input data, i.e., the number of features)\
    - Apply the linear transformation: $Z = W\cdot X + b$, where W = weights, X = input data and b = bias \
    - Pass Z through a sigmoid activation function $A = g(Z) = \sigma(Z) = \frac{1}{1 + e^{-Z}}$\
    - Compute the Cost Function $C = -\frac{1}{m} \sum_{i=1}^{m} \left[Y^{(i)} \log(A^{(i)}) + (1-Y^{(i)})\log(1-A^{(i)})\right]$, where Y is the real label of the data and m is the number of samples\
    - Compute the gradients, then update the weights and bias with Gradient Descent (GD).\
    - Repeat the process as many times as wanted (# of iterations).

##### RESOURCES:

1) Coursera: Neural Networks and Deep Learning\
2) https://www.youtube.com/watch?v=w8yWXqWQYmU&t=664s&ab_channel=SamsonZhang \
3) https://medium.com/@jacobbumgarner/breaking-it-down-logistic-regression-e5c3f1450bd#cee3

In [27]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
class LogisticRegression:
    """
    Description:
        Binary logistic regression, i.e. a neural network without hidden
        layers: a linear transform followed by a sigmoid, trained with
        (mini-batch) gradient descent on the cross-entropy cost.
    Args:
        n_input_features (int): # of features in the dataset
    Attributes:
        weights (np.ndarray): Column vector of shape (n_input_features, 1)
        bias (np.ndarray): Array of shape (1, 1)
        fit (bool): Whether the model has been fit or not to training data. Default: False

    """

    def __init__(self, n_input_features: int):
        """
        Description:
            Initialize weights (W) and bias (b)
        """
        # Small random values instead of zeros (source: Andrew Ng's course).
        self.weights = np.random.randn(n_input_features, 1) * 0.01
        self.bias = np.zeros((1, 1))

        self.fit = False  # indicates the training state of the classifier

    def linear_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Description:
            Linear component
            Z = X W + b

        Args:
            X (np.ndarray): Input data, one sample per row

        Returns:
            np.ndarray: transformed data Z, one row per sample
        """
        # Rows of X (samples) times the weight column vector, plus the bias.
        return np.matmul(X, self.weights) + self.bias

    def sigmoid(self, Z: np.ndarray) -> np.ndarray:
        """
        Description:
            Sigmoid function
            sigma(z) = 1 / (1 + exp(-z))

        Args:
            Z (np.ndarray): Linear transformed data

        Returns:
            np.ndarray: Data evaluated in a sigmoid function
        """
        Z = np.asarray(Z)
        # exp(-|z|) is always <= 1, so neither branch below can overflow,
        # unlike a direct exp(-z) for large negative z.
        exp_neg_abs = np.exp(-np.abs(Z))
        return np.where(
            Z >= 0,
            1. / (1. + exp_neg_abs),
            exp_neg_abs / (1. + exp_neg_abs),
        )

    def cost_cross_entropy(self, A: np.ndarray, Y: np.ndarray) -> float:
        """
        Description:
            Cross-Entropy Cost Function
            C(Y, A) = -(1/m) * sum_i [Y log(A) + (1 - Y) log(1 - A)]

        Args:
            A (np.ndarray): label "probability"
            Y (np.ndarray): true label of the data

        Returns:
            float: mean cross-entropy over the m samples
        """
        m = Y.shape[0]
        epsilon = 1e-6

        # Keep probabilities away from exactly 0 and 1 so that BOTH log terms
        # stay finite when the sigmoid saturates.
        A_safe = np.clip(A, epsilon, 1. - epsilon)

        cost = (-1. / m) * np.sum(
            Y * np.log(A_safe) + (1. - Y) * np.log(1. - A_safe)
        )

        # Return a plain scalar (turn [[1]] into 1).
        return float(np.squeeze(cost))

    def gradient_descent(self, X: np.ndarray, A: np.ndarray, Y: np.ndarray,
                        learning_rate = 0.01) -> None:
        """
        Description:
            Compute the gradients and perform one Gradient Descent update
            dZ = (A - Y)
            dW = (1/m) X^T . dZ
            dB = (1/m) sum(dZ)
                and update
            W = W - dW * learning_rate
            b = b - dB * learning_rate

        Args:
            X (np.ndarray): Input batch, one sample per row
            A (np.ndarray): Sigmoid outputs for the batch, shape (m, 1)
            Y (np.ndarray): True labels for the batch, shape (m, 1)
            learning_rate (float, optional; default = 0.01): step size
        """
        m = A.shape[0]

        dZ = A - Y
        dW = np.matmul(X.T, dZ) / m                   # (n_features, 1)
        dB = np.sum(dZ, axis=0, keepdims=True) / m    # (1, 1)

        # Update in place
        self.weights -= dW * learning_rate
        self.bias -= dB * learning_rate

    def _predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the sigmoid output per sample as a 1-D array (no fitted check)."""
        return self.sigmoid(self.linear_transform(X)).T[0]

    def train(self, X: np.ndarray, Y: np.ndarray,
             epochs: int = 100, learning_rate: float = 0.01, batch_size: int = 10,
             verbose: bool = False) -> tuple:

        """
        Description:
            Fit the logistic regression model to training data
            Use minibatch GD: every epoch performs one update on a random
            batch (or on the full data when batch_size is falsy).

        Args:
            X (np.ndarray): Training dataset
            Y (np.ndarray): Training targets
            epochs (int, optional; default = 100): Number of iterations
            learning_rate (float, optional; default = 0.01): Learning rate step size
            batch_size (int, optional; default = 10): Size of batch for GD.
                None/0 uses the full dataset; values larger than the number
                of samples are capped at the number of samples.
            verbose (bool, optional; default = False): Print the cost while training

        Raises:
            AttributeError: Raises error if the model is already fitted
            ValueError: Raises error if the number of features doesn't match the instantiated feature count.

        Returns:
            tuple: (cost history (epochs,), accuracy history in % (epochs,),
                    weight history (n_features, epochs), bias history (epochs,)).
                    Weights/bias are recorded *before* each epoch's update.

        """
        # Raise flags
        if self.fit:
            raise AttributeError("Error: Model already fitted")

        n_features = self.weights.shape[0]
        if not X.shape[-1] == n_features:
            # The model is deliberately NOT marked as fitted here, so the
            # caller can retry with correctly shaped data.
            raise ValueError("Shape of X is different from Weights")

        Y = np.asarray(Y)
        if Y.ndim == 1:
            Y = np.expand_dims(Y, axis=1)

        n_samples = X.shape[0]

        # Fit the model
        cost_hist = []
        accuracies = []
        weight_hist = []
        bias_hist = []

        for epoch in range(epochs):
            weight_hist.append(self.weights[:, 0].copy())
            bias_hist.append(self.bias.copy())

            if batch_size:
                # Sampling without replacement cannot draw more rows than exist.
                batch_indices = np.random.choice(
                    n_samples, size=min(batch_size, n_samples), replace=False
                )
                X_batch, Y_batch = X[batch_indices], Y[batch_indices]
            else:
                X_batch, Y_batch = X, Y

            # Linear Transformation
            Z = self.linear_transform(X_batch)

            # Sigmoid activation
            A = self.sigmoid(Z)

            # Cost function (Cross-Entropy)
            cost = self.cost_cross_entropy(A, Y_batch)

            # Perform GD
            self.gradient_descent(X_batch, A, Y_batch, learning_rate=learning_rate)

            if verbose:
                print(f'Epoch: {epoch}, Cost: {cost: 0.3f}                  ', end='\r')

            cost_hist.append(cost)
            # Accuracy on the full training set after this epoch's update.
            accuracies.append(self.accuracy(self._predict_proba(X), Y[:, 0]))

        if verbose and cost_hist:
            print(f'Final cost: {cost_hist[-1]:0.2f}                  ')

        # Only a run that completed marks the classifier as trained.
        self.fit = True

        return (
            np.array(cost_hist),
            np.array(accuracies),
            np.array(weight_hist).reshape(-1, n_features).T,
            np.array(bias_hist).reshape(-1),
        )

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Description:
            Predict the probability of the positive label

        Args:
            X (np.ndarray): Data for predictions

        Raises:
            AttributeError: Raises error if the model has not been trained

        Returns:
            np.ndarray: Sigmoid output for each sample (1-D); threshold at 0.5
                        to obtain hard labels
        """
        if not self.fit:
            raise AttributeError("Error: This classifier is not trained")

        return self._predict_proba(X)

    def accuracy(self, predictions: np.ndarray, labels: np.ndarray) -> float:
        """
        Description:
            Accuracy of prediction

        Args:
            predictions (np.ndarray): predict (function) output
            labels (np.ndarray): True labels

        Returns:
            float: prediction accuracy, as a percentage in [0, 100]
        """
        # Flatten both sides so column-shaped (m, 1) inputs are compared
        # element by element instead of broadcasting to (m, m).
        predicted_labels = np.ravel(predictions) >= 0.5
        true_labels = np.ravel(labels)

        return np.mean(predicted_labels == true_labels) * 100  # convert to percentage

Load data and standardize features

In [28]:
# Load Data
df = pd.read_csv("Dataset/heart.csv")
# Swap the two target codes (0 <-> 1). The original note says the file encodes
# 0 = yes and 1 = no -- TODO confirm against the dataset description.
df["target"] = df["target"].replace({0: 1, 1: 0})
targets = df.pop("target")  # remove the label column from the feature frame


# Split data into Training and Test
x_train, x_test, y_train, y_test = train_test_split(
    df, targets, test_size = 0.25, random_state = 42
)

# Feature scaling: standardize the continuous columns, pass the rest through.
features_to_standardize = ["age", "trestbps", "chol", "thalach", "oldpeak"]

column_transformer = ColumnTransformer(
    [("scaler", StandardScaler(), features_to_standardize)], remainder="passthrough"
)

# Learn the scaling statistics from the training split only...
x_train = column_transformer.fit_transform(x_train)
# ...and re-use them on the test split. Re-fitting on the test data would
# scale it with different statistics than the model was trained on and would
# leak test-set information into preprocessing.
x_test = column_transformer.transform(x_test)

Training and testing the model

In [None]:
# One weight per feature column of the (already scaled) training matrix.
model = LogisticRegression(x_train.shape[-1])

# batch_size=None -> every one of the 5000 epochs uses the full training set.
costs, accuracies, weights, bias = model.train(
    x_train,
    y_train,
    epochs=5000,
    learning_rate=0.01,
    batch_size=None,
    verbose=False,
)

# Score the held-out split: predict() yields probabilities and accuracy()
# thresholds them at 0.5.
predictions = model.predict(x_test)
accuracy = model.accuracy(predictions, y_test)

print(f"Model test accuracy: {accuracy:0.2f}%")

## Same thing using Tensorflow 

In [None]:
import tensorflow as tf

In [None]:
# Create dataset: turn every split into a TensorFlow tensor in one pass.
x_train2, y_train2, x_test2, y_test2 = (
    tf.convert_to_tensor(split)
    for split in (x_train, y_train, x_test, y_test)
)

# from_tensors wraps each (features, labels) pair as ONE dataset element,
# so a pass over the dataset sees the whole split at once.
# (A `.batch(32)` / `.batch(1)` call was left disabled in the original.)
train_dataset = tf.data.Dataset.from_tensors((x_train2, y_train2))
test_dataset = tf.data.Dataset.from_tensors((x_test2, y_test2))

In [None]:
# A single sigmoid unit: the Keras equivalent of the logistic regression above.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(
        1,
        kernel_initializer = "HeNormal",
        bias_initializer = "zeros",
        activation = "sigmoid"
    )
])

# Use plain SGD with the same learning rate as the NumPy and PyTorch versions,
# so that this really is the "same thing" (compile() would otherwise fall back
# to its default optimizer).
model.compile(
    optimizer = tf.keras.optimizers.SGD(learning_rate = 0.01),
    loss = "bce",
    metrics = ['accuracy']
)

model.fit(train_dataset, epochs = 5000, verbose = 0)

# evaluate() returns [loss, accuracy] because one metric was compiled in.
results = model.evaluate(test_dataset)
test_loss, test_accuracy = results

# Keras reports accuracy as a fraction; convert it to a percentage.
print(f"Model test accuracy: {test_accuracy * 100:0.2f}%")

## Same thing using PyTorch

In [41]:
import torch
from torch import from_numpy
import torch.nn as nn

from tqdm import tqdm

from sklearn.metrics import accuracy_score

In [53]:
# Left disabled in the original, although the hyper-parameter cell further
# down refers to `n_input_features`:
# n_input_features = x_train.shape[-1]

# Features: the scaled NumPy matrices are wrapped directly as tensors.
x_train2 = from_numpy(x_train)
x_test2 = from_numpy(x_test)

# Labels: cast the pandas Series to float first, then hand the array to torch.
y_train2 = from_numpy(y_train.astype(float).to_numpy())
y_test2 = from_numpy(y_test.astype(float).to_numpy())

In [55]:
class ShallowNN(nn.Module):
    """Network without hidden layers: one linear layer followed by a sigmoid.

    Args:
        input_dim: number of input features.
        output_dim: number of output units.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return sigmoid(linear(x)), i.e. values strictly between 0 and 1."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

In [69]:
n_epochs = 5000
# `n_input_features` was never defined (its only definition is commented out
# earlier), so derive it from the training tensor: one input per feature column.
n_input_features = x_train2.shape[-1]
input_dim = n_input_features
output_dim = 1  # a single sigmoid unit for binary classification
learning_rate = 0.01

model = ShallowNN(input_dim, output_dim)

# Binary cross-entropy on probabilities (the model already applies the sigmoid).
criterion = torch.nn.BCELoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [72]:
# The dtype casts do not depend on the epoch, so do them once up front
# (the model's parameters are float32, the NumPy-derived tensors are not).
x = x_train2.float()
labels = y_train2.float()

# Full-batch gradient descent: one optimizer step per epoch on all samples.
for epoch in tqdm(range(int(n_epochs)), desc = 'Training Epochs'):
    optimizer.zero_grad()  # clear gradients accumulated by the previous step

    outputs = model(x)
    # Drop only the trailing unit axis, (m, 1) -> (m,), to match `labels`;
    # a bare squeeze would also drop the batch axis when m == 1.
    loss = criterion(outputs.squeeze(-1), labels)

    loss.backward()

    optimizer.step()

Training Epochs: 100%|████████████████████| 5000/5000 [00:01<00:00, 3285.66it/s]
