In [None]:
!pip install "cleanvision[huggingface]"
!pip install pyod

In [None]:
# Import PyTorch
import torch
from torch import nn

# import pandas as pd
import numpy as np
from datasets import load_dataset, concatenate_datasets
from cleanvision import Imagelab
from PIL import Image
import requests
import torch
import cv2
import imagehash


# Import torchvision
import torchvision
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader


# Import matplotlib for visualization
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Dataset

In [None]:
# Fetch FashionMNIST through torchvision (files are downloaded into ./data).
# ToTensor() converts each PIL image to a float tensor and, per the torchvision
# docs, already rescales uint8 pixel values into the [0.0, 1.0] range.
_fashion_mnist_kwargs = dict(root="data", download=True, transform=ToTensor())

# Training split.
training_data = datasets.FashionMNIST(train=True, **_fashion_mnist_kwargs)

# Held-out test split.
test_data = datasets.FashionMNIST(train=False, **_fashion_mnist_kwargs)

In [None]:
# Show a 4x4 grid of randomly chosen training images titled with their class name.
# The torchvision training split is bound to `training_data`; the name
# `train_data` does not exist yet at this point on a fresh kernel.
torch.manual_seed(42)  # fixed seed -> the same images are drawn on every run
fig = plt.figure(figsize=(9, 9))
rows, cols = 4, 4
for i in range(1, rows * cols + 1):
    random_idx = torch.randint(0, len(training_data), size=[1]).item()
    img, label = training_data[random_idx]
    ax = fig.add_subplot(rows, cols, i)
    # squeeze() removes size-1 dimensions so imshow receives a 2-D image.
    ax.imshow(img.squeeze(), cmap="gray")
    ax.set_title(training_data.classes[label])
    ax.axis("off")

In [None]:

# Batch the torchvision training split (`training_data`); shuffle=True makes the
# loader draw the samples in a new random order on every pass.
train_dataloader = DataLoader(training_data, batch_size=32, shuffle=True)

# Pull one batch to sanity-check the tensor shapes.
train_features_batch, train_labels_batch = next(iter(train_dataloader))

print(f"Features Batch Size {train_features_batch.shape}")
print(f"Labels Batch Size {train_labels_batch.shape}")
print(f"Length of train dataloader: {len(train_dataloader)} batches of {train_dataloader.batch_size}")
     

In [None]:

# Batch the test split; shuffle=False keeps the evaluation order fixed.
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)

# Pull one batch to sanity-check the tensor shapes.
test_features_batch, test_labels_batch = next(iter(test_dataloader))

print(f"Features Batch Size {test_features_batch.shape}")
print(f"Labels Batch Size {test_labels_batch.shape}")
# This reports the *test* loader (the original message said "train").
print(f"Length of test dataloader: {len(test_dataloader)} batches of {test_dataloader.batch_size}")

In [None]:

def data_cleaner(ims):
    """Return the indices of images that CleanVision flags for removal.

    An image is flagged when it is reported as low-information, as dark, or
    when it is a redundant member of a near-duplicate set (the first image of
    every set is kept, the remaining ones are flagged).

    Args:
        ims: Hugging Face dataset whose images live in the "image" column.

    Returns:
        Sorted list of unique dataset indices that should be dropped.
    """
    imagelab = Imagelab(hf_dataset=ims, image_key="image")

    # The `is_*_issue` columns and info["near_duplicates"] are only populated
    # once the checks have actually been run, so run the three we rely on.
    imagelab.find_issues(
        issue_types={"low_information": {}, "dark": {}, "near_duplicates": {}}
    )
    issue_df = imagelab.issues

    flagged = set()

    # Low-information and dark images: every flagged row is removed.
    for issue_column in ("is_low_information_issue", "is_dark_issue"):
        is_flagged = issue_df[issue_column].to_numpy(dtype=bool)
        flagged.update(issue_df.index[is_flagged].tolist())

    # Near duplicates: keep the first image of each set, drop the rest.
    for duplicate_set in imagelab.info["near_duplicates"]["sets"]:
        flagged.update(list(duplicate_set)[1:])

    return sorted(flagged)

In [None]:
# Load the Hugging Face copy of FashionMNIST.
dataset_dict = load_dataset("zalando-datasets/fashion_mnist")

# NOTE: from here on `train_data` / `test_data` refer to the Hugging Face splits.
train_data, test_data = dataset_dict["train"], dataset_dict["test"]

# Materialise the image and label columns of each split as NumPy arrays.
train_images, train_labels = np.array(train_data["image"]), np.array(train_data["label"])
test_images, test_labels = np.array(test_data["image"]), np.array(test_data["label"])

In [None]:
# Indices that data_cleaner() flags for removal from the training split.
to_remove = data_cleaner(train_data)

# The filter below looks indices up under this name; a set keeps each
# membership check O(1) instead of scanning a list once per image.
img_index_to_remove = set(to_remove)

# Keep every (image, label) pair whose position was not flagged.
reduced_train_images = []
reduced_train_labels = []
for idx, (im, label) in enumerate(zip(train_images, train_labels)):
    if idx not in img_index_to_remove:
        reduced_train_images.append(im)
        reduced_train_labels.append(label)

# Modelling and Training FFNN

In [None]:
import torch
from torch import nn

class FFNN_Model(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The input is flattened, passed through Linear -> ReLU -> Linear, and the
    final linear layer's output is returned as raw, unnormalised class logits.

    Args:
        input_shape: Number of features per sample after flattening.
        hidden_units: Width of the hidden layer.
        output_shape: Number of output classes (one logit per class).
    """

    def __init__(self, input_shape: int, hidden_units: int, output_shape: int):
        super().__init__()
        self.layer_stack = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=input_shape, out_features=hidden_units),
            nn.ReLU(),
            # No activation after the last layer: nn.CrossEntropyLoss expects
            # raw logits, and a trailing ReLU would clamp every negative logit
            # to zero (and zero its gradient).
            nn.Linear(in_features=hidden_units, out_features=output_shape),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the class logits for a batch `x`."""
        return self.layer_stack(x)

In [None]:
# Seed before construction so the initial weights are reproducible.
torch.manual_seed(42)

# 784 input features, 10 hidden units, 10 output classes.
model = FFNN_Model(input_shape=784, hidden_units=10, output_shape=10)
model = model.to(device)
     

In [None]:
# Inspect the first parameter tensor to see which device the model lives on.
first_param = next(model.parameters())
first_param.device

In [None]:
# Cross-entropy loss for multi-class classification.
loss_fn = nn.CrossEntropyLoss()

# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [None]:
def accuracy_fn(y_true, y_pred):
    """Return the percentage (0-100) of positions where `y_pred` equals `y_true`."""
    n_correct = torch.eq(y_true, y_pred).sum().item()
    n_total = len(y_pred)
    return (n_correct / n_total) * 100
     

In [None]:
# Train for 3 epochs, evaluating on the test loader after every epoch.
torch.manual_seed(42)
for epoch in tqdm(range(3)):  # running for 3 epochs

    print(f"Epoch: {epoch}\n-------")

    # ---------------- TRAINING ----------------
    train_loss, train_acc = 0.0, 0.0
    model.train()
    for batch, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device)
        y_pred = model(X)  # forward pass -> logits

        loss = loss_fn(y_pred, y)  # loss for this batch
        # argmax turns the logits into predicted class labels
        acc = accuracy_fn(y_true=y, y_pred=y_pred.argmax(dim=1))

        # Accumulate plain Python floats: summing the loss tensors themselves
        # would keep every batch's autograd history attached to the total.
        train_loss += loss.item()
        train_acc += acc

        optimizer.zero_grad()  # clear gradients from the previous step
        loss.backward()        # backpropagate
        optimizer.step()       # update the parameters

        # Print out how many samples have been seen
        if batch % 400 == 0:
            print(f"Looked at {batch * len(X)}/{len(train_dataloader.dataset)} samples")

    # Average the per-batch metrics over the epoch.
    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader)
    print(f"Train loss: {train_loss:.5f} | Train accuracy: {train_acc:.2f}%")

    # ---------------- TESTING ----------------
    test_loss, test_acc = 0.0, 0.0
    model.eval()
    with torch.inference_mode():
        for X, y in test_dataloader:
            X, y = X.to(device), y.to(device)
            test_pred = model(X)  # forward pass -> logits

            loss = loss_fn(test_pred, y)
            acc = accuracy_fn(y_true=y, y_pred=test_pred.argmax(dim=1))

            test_loss += loss.item()
            test_acc += acc

    # Average the per-batch metrics over the test loader.
    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataloader)

    print(f"Test loss: {test_loss:.5f}  | Test acc: {test_acc:.2f}%\n")

# Modelling and Training ResNet

# Modelling and Training Vision Transformers