# Creating a Dataset

In [None]:
import torch 
import pandas as pd 
import numpy as np 

from torch.utils.data import Dataset 

class WaterDataset(Dataset):
    """Map-style dataset backed by a CSV file loaded fully into memory.

    The CSV is read with pandas and stored as a NumPy array. For each row,
    every column except the last is returned as the feature vector and the
    last column is returned as the label.
    """

    def __init__(self, csv_path):
        """Read ``csv_path`` and keep its values as a 2-D NumPy array."""
        self.data = pd.read_csv(csv_path).to_numpy()

    def __len__(self):
        """Return the number of rows (samples) in the loaded table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row(s) selected by ``idx``."""
        label_column = -1
        return self.data[idx, :label_column], self.data[idx, label_column]


# Creating a DataLoader

In [None]:
# Load the water-potability table from disk as the training dataset.
dataset_train = WaterDataset(csv_path="../data/water_potability.csv")

In [None]:
from torch.utils.data import DataLoader 

# Serve the training set in shuffled mini-batches of two samples.
dataloader_train = DataLoader(dataset=dataset_train, shuffle=True, batch_size=2)

# Pull a single batch to inspect what the loader yields.
features, labels = next(iter(dataloader_train))
print(f"features: {features}, \nLabels: {labels}")

# Defining a Net

In [None]:
import torch.nn as nn

class Net(nn.Module):
    """Fully connected binary classifier: 9 inputs -> 16 -> 8 -> 1 output.

    The two hidden layers use ReLU; the output layer uses a sigmoid, so the
    network returns one value in [0, 1] per sample.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=9, out_features=16)
        self.fc2 = nn.Linear(in_features=16, out_features=8)
        self.fc3 = nn.Linear(in_features=8, out_features=1)

    def forward(self, x):
        """Run ``x`` through the three linear layers and their activations."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))


# Module-level model instance used by the training and evaluation cells.
net = Net()

# Training Loop 

In [None]:
import torch.nn as nn 
import torch.optim as optim 

# Binary cross-entropy on the sigmoid outputs, optimised with plain SGD.
criterion = nn.BCELoss()
optimizer = optim.SGD(params=net.parameters(), lr=0.01)

for epoch in range(1000):
    for features, labels in dataloader_train:
        # The model's layers use float32 parameters, so cast the inputs.
        features = features.float()
        outputs = net(features)
        # Outputs are upcast to float64 and labels reshaped to a column so
        # both arguments of the loss agree.
        # NOTE(review): presumably labels arrive as float64 from the NumPy
        # array in the dataset - confirm.
        loss = criterion(outputs.double(), labels.reshape(-1, 1))
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Optimizing Weights 

The most widely used optimizer is **Adam** (Adaptive Moment Estimation), which combines RMSProp-style adaptive learning rates with gradient momentum.

In [None]:
# Rebind `optimizer` to Adam over the same model parameters.
optimizer = optim.Adam(params=net.parameters(), lr=0.01)


# Model Evaluation

In [None]:
# Placeholder: no train/test split has been made yet, so the "test" loader is
# simply an alias of the training loader. Any metric computed with it is
# measured on data the model was trained on and will be optimistic.
# TODO: create a real hold-out split (e.g. with torch.utils.data.random_split).
dataloader_test = dataloader_train

In [None]:
from torchmetrics import Accuracy 

# Accumulates binary accuracy across all evaluated batches.
acc = Accuracy(task="binary")

# Put the model in evaluation mode before scoring.
net.eval()

with torch.no_grad():  # no gradients are needed for evaluation
    for features, labels in dataloader_test:
        outputs = net(features.float())
        # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
        preds = outputs.ge(0.5).float()
        # Labels are reshaped to a column to match the (batch, 1) predictions.
        acc(preds, labels.reshape(-1, 1))

# Aggregate the metric over every batch seen above.
accuracy = acc.compute()

print(f"Accuracy: {accuracy}")

        

# Unstable Gradients
Neural networks often suffer from gradient instability during training. Sometimes, the gradients become smaller during the backward pass, known as **vanishing gradients**. As a result, earlier layers receive minimal parameter updates, hindering the model's ability to learn. In contrast, gradients may grow increasingly large, leading to massive parameter updates and divergent training, known as **exploding gradients**.

To address these issues, a three-step approach is recommended: proper weights initialization, appropriate activation functions, and batch normalization.

## Weights Initialization
When a torch layer is created, its parameters in the weight attribute are initialized to random values. Research suggests initialization should maintain the variance of inputs and outputs and ensure the variance of gradients is consistent before and after passing through the layer. For ReLU and similar activations, He initialization (also known as Kaiming initialization) is typically used.

### He / Kaiming Initialization
To apply this initialization, call `kaiming_uniform_` from `torch.nn.init` on the layer's weight attribute, ensuring the desired variance properties. In the final layer using sigmoid activation, specify the nonlinearity as sigmoid during initialization.

## Activation Functions
The ReLU (Rectified Linear Unit) is the most commonly used activation function. While efficient, it suffers from the dying neuron problem: neurons output zero for any negative input, effectively dying. The ELU (Exponential Linear Unit) activation improves upon ReLU by allowing non-zero gradients for negative values, reducing the likelihood of vanishing gradients and dying neurons.

## Batch Normalization
Even with proper weights and activations, unstable gradients can still arise during training. Batch normalization addresses this by normalizing a layer's outputs, ensuring the output distribution is roughly normal. It then applies learned scale and shift parameters, allowing the model to learn optimal input distributions for each layer. This stabilizes gradient behavior and accelerates loss convergence.

To implement batch normalization in PyTorch, define the `BatchNorm1d` layer in the model's `__init__` method, matching the preceding layer's output size. Then, pass the linear layer's output to the batch normalization layer before applying the activation function. (Note: the example cell below demonstrates only He initialization and ELU activations; it does not add `BatchNorm1d` layers.)

In [None]:
class Net(nn.Module):
    """Binary classifier (9 -> 16 -> 8 -> 1) with He initialization and ELU.

    The three linear layers have their weights re-initialized with
    Kaiming/He uniform initialization. The hidden layers use ELU activations
    and the output layer uses a sigmoid, so ``forward`` returns one value in
    [0, 1] per sample.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

        # Apply He initialization. The functions are reached through
        # `nn.init` because no standalone `init` name is imported in the
        # visible cells; a bare `init.` would raise NameError.
        nn.init.kaiming_uniform_(self.fc1.weight)
        nn.init.kaiming_uniform_(self.fc2.weight)
        # The last layer feeds a sigmoid, so the matching gain is requested.
        nn.init.kaiming_uniform_(
            self.fc3.weight,
            nonlinearity='sigmoid'
        )

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` with 9 features."""
        # ELU replaces ReLU in the hidden layers.
        x = nn.functional.elu(self.fc1(x))
        x = nn.functional.elu(self.fc2(x))
        x = nn.functional.sigmoid(self.fc3(x))
        return x

# Handling Images in Pytorch

In [None]:
from torchvision.datasets import ImageFolder 
from torchvision import transforms 

# Preprocessing applied to every image: convert to a tensor first, then
# resize to 128x128. The order of the two steps is kept as-is.
train_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize(size=(128, 128)),
    ]
)

# One sub-folder per class; each image is passed through `train_transforms`.
dataset_train = ImageFolder(
    root="../data/clouds/clouds_train",
    transform=train_transforms,
)

In [None]:
# Serve the image dataset one shuffled sample at a time.
dataloader_train = DataLoader(dataset=dataset_train, batch_size=1, shuffle=True)

# Fetch a single batch and show the shape of the image tensor.
image, label = next(iter(dataloader_train))
print(image.shape)


In [None]:
# squeeze() removes every size-1 dimension (here, the batch dimension coming
# from the batch_size=1 loader).
image = image.squeeze()
# Move the first dimension to the end so the tensor is channels-last.
image = image.permute(1, 2, 0)
print(image.shape)

In [None]:
import matplotlib.pyplot as plt 
# Render the image and display the figure.
# NOTE(review): assumes `image` was already rearranged to channels-last
# (H, W, C) by the preceding cell - confirm before running out of order.
plt.imshow(image)
plt.show()

# Data Augmentation

> When augmenting data, always keep both the data and the task in mind: some transformations can turn a sample into a valid example of a *different* class!

In [None]:

# Augmentation pipeline: random horizontal flip and random rotation
# (RandomRotation(45)) are applied first, then the image is converted to a
# tensor and resized to 128x128.
train_transforms = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(45),
        transforms.ToTensor(),
        transforms.Resize(size=(128, 128)),
    ]
)

# Rebuild the training dataset so it uses the augmenting transforms.
dataset_train = ImageFolder(
    root="../data/clouds/clouds_train",
    transform=train_transforms,
)

# Convolutional neural networks 

In [None]:
class Net(nn.Module):
    """Small convolutional image classifier.

    Two ``Conv2d -> ELU -> MaxPool2d`` stages extract features from a
    3-channel image; the flattened features go through a single linear layer
    that outputs ``num_classes`` raw scores (logits, no softmax applied).

    Args:
        num_classes: Number of output classes.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),
            # The classifier below is sized for a 64-channel 16x16 map, which
            # two 2x2 pools only produce for 64x64 inputs. The image
            # transforms in this file resize to 128x128 (-> 32x32 map), which
            # would make the linear layer fail with a shape mismatch.
            # Adaptive pooling pins the map to 16x16 for any input size and
            # is an exact identity when the map is already 16x16. It has no
            # parameters, so state_dict keys are unchanged.
            nn.AdaptiveAvgPool2d((16, 16)),
            nn.Flatten(),
        )
        self.classifier = nn.Linear(64 * 16 * 16, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        x = self.feature_extractor(x)
        x = self.classifier(x)
        return x



# Training convolutional network

In [None]:
# Multiclass setup: the network outputs one logit per class and
# CrossEntropyLoss compares those logits with integer class labels.
# NOTE(review): num_classes=7 presumably matches the number of class folders
# in the image dataset - confirm.
net = Net(num_classes=7)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)

for epoch in range(3):
    # Sum of per-batch losses for the current epoch.
    running_loss = 0.0
    for images, labels in dataloader_train:
        outputs = net(images)
        loss = criterion(outputs, labels)
        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss = running_loss + loss.item()

    # Mean loss per batch over the epoch.
    epoch_loss = running_loss / len(dataloader_train)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

# Evaluate image classifiers 

In multiclass classification, precision and recall are computed separately for each class and can then be averaged into a single score.



In [None]:
from torchmetrics import Recall 

# The same 7-class recall metric under each averaging strategy.
# average=None keeps one recall value per class (no averaging).
recall_per_class = Recall(
    task="multiclass", num_classes=7, average=None
)
recall_micro = Recall(
    task="multiclass", num_classes=7, average="micro"
)
recall_macro = Recall(
    task="multiclass", num_classes=7, average="macro"
)
recall_weighted = Recall(
    task="multiclass", num_classes=7, average="weighted"
)




# Multi-Class Classification Metrics and Averaging Methods

In multi-class classification, where we have more than two classes, calculating evaluation metrics such as **precision**, **recall**, and **F1-score** becomes more complex than in binary classification. Various averaging methods are used to combine per-class metrics into a single overall score. Here’s an overview of the common averaging techniques:

## Micro Averaging
- **Definition**: This method aggregates contributions from all classes to compute the average metric globally.
- **How it works**: It calculates the total true positives (TP), false positives (FP), and false negatives (FN) across all classes and then computes the metrics.
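- **Best suited for**: Cases where every *sample* should count equally, i.e. when overall performance matters more than per-class performance. Note that on imbalanced datasets the frequent classes dominate the micro-averaged score.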
  
  **Formula**:  
  $\text{Precision}_\text{micro} = \frac{\sum_{i} TP_i}{\sum_{i} (TP_i + FP_i)}$  

  $\text{Recall}_\text{micro} = \frac{\sum_{i} TP_i}{\sum_{i} (TP_i + FN_i)}$  

  $\text{F1}_\text{micro} = 2 \cdot \frac{\text{Precision}_\text{micro} \cdot \text{Recall}_\text{micro}}{\text{Precision}_\text{micro} + \text{Recall}_\text{micro}}$

## Macro Averaging
- **Definition**: This method computes the metric independently for each class and then takes the average (unweighted mean).
- **How it works**: Each class is treated equally, regardless of its size.
- **Best suited for**: When all classes are equally important, even if they have very different sample sizes.
  
  **Formula**:  
  $\text{Precision}_\text{macro} = \frac{1}{N} \sum_{i} \text{Precision}_i$  

  $\text{Recall}_\text{macro} = \frac{1}{N} \sum_{i} \text{Recall}_i$  

  $\text{F1}_\text{macro} = \frac{1}{N} \sum_{i} \text{F1}_i$
  
  where $N$ is the number of classes.

## Weighted Averaging
- **Definition**: This method computes the metric for each class independently, but the average is weighted by the number of instances of each class.
- **How it works**: Classes with more samples have a larger impact on the overall score.
- **Best suited for**: Imbalanced datasets where errors on larger classes should count for more, because each class contributes in proportion to its number of samples.

  **Formula**:  
  $\text{Precision}_\text{weighted} = \sum_{i} \frac{n_i}{n} \cdot \text{Precision}_i$  

  $\text{Recall}_\text{weighted} = \sum_{i} \frac{n_i}{n} \cdot \text{Recall}_i$  

  $\text{F1}_\text{weighted} = \sum_{i} \frac{n_i}{n} \cdot \text{F1}_i$
  
  where $n_i$ is the number of samples in class $i$, and $n$ is the total number of samples across all classes.

## Per-Class Metrics (No Averaging)
- **Definition**: In some cases, you may want to evaluate each class independently, without averaging metrics across classes. This can be useful in highly imbalanced datasets or when certain classes are more important.

In [None]:
from torchmetrics import Precision, Recall 

# Macro-averaged precision and recall over the 7 classes.
metric_precision = Precision(
    task="multiclass", num_classes=7, average="macro"
)
metric_recall = Recall(
    task="multiclass", num_classes=7, average="macro"
)

# NOTE(review): `dataloader_test` was last bound in the "Model Evaluation"
# cell as an alias of the tabular water-potability loader. It must be rebound
# to a loader of test images before this cell can evaluate the CNN.
net.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    for images, labels in dataloader_test:
        outputs = net(images.float())
        # The index of the largest logit is the predicted class.
        _, preds = torch.max(outputs, 1)
        # Fixed: the original passed the undefined name `pred` here, which
        # raised NameError on the first batch.
        metric_precision(preds, labels)
        metric_recall(preds, labels)

# Aggregate both metrics over every batch seen above.
precision = metric_precision.compute()
recall = metric_recall.compute()

In [None]:
# TODO: Fix these last cells.