# Sprint 3


## Creating a custom dataset class

This class loads and processes our images. When we create the dataset we pass in a custom transform that resizes each image, normalizes its pixel values and converts it to a tensor.


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

from skimage import io
import numpy as np
from os import listdir
from os.path import isfile, join

from fastai.vision.all import PILImage
import matplotlib.pyplot as plt


# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class FoodDataset(Dataset):
    """Dataset of food images read from a single directory.

    Every regular file directly inside ``root_dir`` is treated as one image
    (sub-directories are ignored). Samples are unlabeled: indexing returns
    only the (optionally transformed) image.
    """

    def __init__(self, root_dir, transform=None):
        """
        Args:
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        # listdir() returns entries in arbitrary order; sort so that an index
        # always maps to the same file across runs and machines.
        self.files = sorted(
            f for f in listdir(root_dir) if isfile(join(root_dir, f))
        )

    def __len__(self):
        """Return the number of image files found in ``root_dir``."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and apply the transform, if any.

        Args:
            idx (int or Tensor): Position in the sorted file list. A tensor
                index is converted with ``tolist()`` first.

        Returns:
            The output of ``transform(image)`` when a transform was given,
            otherwise the image object returned by ``PILImage.create``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir, self.files[idx])
        image = PILImage.create(img_name)

        # Without a transform, hand back the raw image instead of failing on
        # a never-assigned result variable.
        if self.transform is None:
            return image
        return self.transform(image)


# Target size (in pixels) every image is resized to before flattening.
IMG_HEIGHT = 128
IMG_WIDTH = 128


def transform(img):
    """Resize an image, flatten it and scale its values by 1/255.

    Args:
        img: Image object exposing a PIL-style ``resize((width, height))``
            method and convertible with ``np.array`` (presumably the
            ``PILImage`` produced by the dataset -- verify against caller).

    Returns:
        torch.Tensor: 1-D tensor holding the flattened, resized pixel data
        divided by 255 (float64 for the usual uint8 image data).
    """
    # PIL's resize expects (width, height); np.array() of the result is laid
    # out as (height, width, channels), which is what later reshapes assume.
    img_resized = img.resize((IMG_WIDTH, IMG_HEIGHT))
    img_np = np.array(img_resized).flatten()
    img_np = img_np / 255
    return torch.from_numpy(img_np)


# Build the dataset from the mini TripAdvisor image folder; `transform` is applied to each image on access.
dataset = FoodDataset(
    root_dir='tripadvisor_dataset/tripadvisor_mini',
    transform=transform,
)

## Set up the DataLoader

DataLoader is een iterable dat de complexiteit van minibatches en multiprocessing (om het ophalen van data te versnellen) abstraheert achter een eenvoudige API. We stellen de batch size in op 4 en zetten `num_workers` op 0, zodat de data in het hoofdproces wordt geladen.
TODO: uitleggen en aanpassen naar DCN algoritme



In [None]:
# Shuffled mini-batches of 4 samples; num_workers=0 loads the data in the main process.
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    num_workers=0,
)
# Pull a single batch to inspect.
train_features = next(iter(dataloader))

Example of a processed image


In [None]:
print(f"Feature batch shape: {train_features.size()}")

# Each sample was flattened by `transform`; restore the (height, width, channel)
# layout of the first sample so it can be plotted.
img = train_features[0].squeeze()
img = img.reshape(IMG_HEIGHT, IMG_WIDTH, 3)
plt.imshow(img, cmap="gray")
plt.show()


## Deep Clustering with Convolutional Autoencoders (DCEC)