In [None]:
# Now that you know how to read in images, get their
# dimensions and visualize them, let's move on to working on
# our PyTorch dataset and COMPLETE THE REST

In [None]:
# our FIRST step before we even get to building PyTorch objects
# is actually setting up a csv file that maps the file paths to
# their label and their category (i.e. train, val, test)

# one thing to consider here is the directory structure of chest_xray.
# within the chest_xray dir, there are three directories (train, val, test)
# and within those, there are two subdirectories (NORMAL, PNEUMONIA)

# scroll down to the __getitem__ part of our custom dataset and think
# about how you wanna handle your file paths before you construct your
# csv files!

# you might want to create 3 csv files (one for train, one for val, one
# for test) or you might have to just create 1 mega csv!
# totally up to you. Whatever you decide, you will have to adjust
# how you initialize your Dataset parameters (i.e. do you pass in the
# path to the csv file? or a subset of a pandas df?)

# i'm gonna let you work some Pandas magic on your own! ;)

In [None]:
# Now, that we got our csv file(s) and our images, we're finally
# ready to interact with PyTorch!

# At minimum, we'll implement three objects:
# - a custom Dataset object (PyTorch object)
# - your image transformations (augmentations!)
# - a DataLoader

# OPTIONAL: you can also implement 
# - a custom DataModule (PyTorch Lightning object)
#     * DataModule will use the DataLoader but the Module will
#     * nicely encapsulate other things like data augmentation!
#     * it's not a lot of work so I recommend it! It will make your life
#     * easier for checkpoint #2!

In [4]:
import os
from collections import Counter

import pandas as pd
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from torchvision.io import read_image

In [13]:
# in implementing a custom PyTorch dataset you must implement
# three methods: __init__, __len__, and __getitem__

# a Dataset is indexable, which allows you to index (duh)
# into your dataset, but also gives random access for 
# shuffling (which you can do yourself, or use a DataLoader, or
# use a DataModule)

# for more help: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files

class CustomImageDataset(Dataset):
    """
    Map-style dataset that pairs image files with the labels listed in a csv.

    The csv is read with its first row skipped and without column names, so
    columns are addressed by position: column 0 is the image path (relative
    to `img_dir_path`) and column 1 is the label.
    """

    def __init__(self, csv_file, img_dir_path, transform=None):
        """
        Inputs:
            - csv_file (str): file path to the csv file
            - img_dir_path (str): directory the image paths in the csv are
              relative to
            - transform: optional callable (e.g. a torchvision `Compose` that
              strings together several transform functions) applied to every
              image returned by `__getitem__`
        """
        # skiprows=1 drops the header row; header=None keeps positional
        # (0, 1, ...) column labels so the rest of the class can use iloc
        self.img_labels = pd.read_csv(csv_file, skiprows=1, header=None)
        self.img_dir = img_dir_path
        self.transform = transform

    def __len__(self):
        """
        Returns: (int) number of rows (samples) listed in the csv
        """
        return len(self.img_labels)

    def _image_path(self, idx):
        """
        Returns: (str) full path of the image stored in row `idx` of the csv
        """
        return os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])

    def __getitem__(self, idx):
        """
        Loads and returns your sample (the image and the label) at the
        specified index

        Parameter: idx (int): index of interest

        Returns: image, label
        """
        # read the image from disk as a tensor
        image = read_image(self._image_path(idx))

        # the label lives in the second csv column
        label = self.img_labels.iloc[idx, 1]

        # transforms (e.g. augmentation for training data) are applied last
        if self.transform is not None:
            image = self.transform(image)

        return image, label

    def count_sizes(self):
        """
        Tallies how many images share each spatial size.

        NOTE(review): the original method was an empty stub; this behaviour is
        inferred from its name -- confirm it matches what you intended.

        Every image listed in the csv is read from disk WITHOUT applying
        `self.transform`, so the counts describe the raw files. This touches
        every file once, so it can be slow on large datasets.

        Returns: (collections.Counter) maps (height, width) -> number of images
        """
        sizes = Counter()
        for idx in range(len(self)):
            image = read_image(self._image_path(idx))
            # read_image yields a (channels, height, width) tensor, so the
            # last two dimensions are the spatial size
            height, width = image.shape[-2:]
            sizes[(int(height), int(width))] += 1
        return sizes

In [33]:
# Unpack the shape of the first training image (`train_data[0]` is an
# (image, label) pair, so `[0][0]` is the image).
# NOTE(review): presumably (channels, height, width) -- confirm against read_image.
# NOTE(review): `train_data` is only created in a later cell of this notebook,
# so this cell fails on a fresh kernel unless that cell has been run first.
layer, height, width = train_data[0][0].shape


1

Once we have implemented the class, we create one object per dataset:

In [18]:
# One dataset object per split. No transform is passed, so images come back
# exactly as they are read from disk.
train_data = CustomImageDataset(
    csv_file='../data/data_train.csv',
    img_dir_path='../data',
)
val_data = CustomImageDataset(
    csv_file='../data/data_val.csv',
    img_dir_path='../data',
)
test_data = CustomImageDataset(
    csv_file='../data/data_test.csv',
    img_dir_path='../data',
)

The labels for each one of the datasets are the following:

In [25]:
# Train dataset: number of samples per label (csv column 1 holds the label)
(
    train_data.img_labels
    .iloc[:, 1]
    .value_counts()
)

1    3875
0    1341
Name: 1, dtype: int64

In [26]:
# Validation dataset: number of samples per label (csv column 1 holds the label)
(
    val_data.img_labels
    .iloc[:, 1]
    .value_counts()
)

0    8
1    8
Name: 1, dtype: int64

In [27]:
# Test dataset: number of samples per label (csv column 1 holds the label)
(
    test_data.img_labels
    .iloc[:, 1]
    .value_counts()
)

1    390
0    234
Name: 1, dtype: int64

In [17]:
# Number of samples in the training split. The original cell called
# `len(obj)`, but `obj` is never defined in this notebook; the training
# dataset is the one whose label counts add up to the displayed value.
len(train_data)

5216

In [None]:
# implementing your transformations

# for additional help: https://pytorch.org/vision/stable/generated/torchvision.transforms.Compose.html#torchvision.transforms.Compose

# for a list of transformations illustrated: 
# https://pytorch.org/vision/stable/auto_examples/plot_transforms.html#sphx-glr-auto-examples-plot-transforms-py

# NOTE: per PyTorch, 
"""
Most transformations accept both PIL images and tensor images,
although some transformations are PIL-only and some are tensor-only.
"""

# therefore, you will have to experiment with what works with your image!
# or read the documentation!


import torchvision.transforms as T

# Things to think about when choosing your transformations:
#
# - Input type: if you are using cv2, you will need to FIRST convert the
#   image to a Tensor. If you are using PIL to read in your image, you will
#   need to convert it to a Tensor eventually!
#
# - Resizing: you will definitely want to RESIZE your image! If you don't
#   want to decide on a size yet, you can return to resizing for
#   checkpoint #2 when we implement our custom model.
#
# - Normalizing: if you normalize, consider two things:
#     1) with a pre-trained model, use the mean and SD of the data that model
#        was trained on (this does not apply to us yet or ever, depending on
#        what you do for the final part of the project)
#     2) with a custom model, normalize based on the mean and SD of your
#        TRAINING data -- and normalize your validation and test data with
#        those same training statistics, too
#
# - Two pipelines: this means you might have to prepare two transformations:
#     * an augmentation transform for your training data, which normalizes
#       (if you're doing that), distorts and resizes the image
#     * one that just normalizes (if you're doing that) and resizes your
#       validation and testing data
#
# LOTS TO THINK ABOUT <3
#
# The two transforms below are just an example. I randomly selected some.
# Choose your own more carefully! :)
transforms = T.Compose([
    T.RandomAdjustSharpness(sharpness_factor=2),
    T.RandomPosterize(bits=4, p=0.5),
])

In [None]:
# building your DataLoaders

# here we can more directly see how the dataset interacts with the dataloader
# the dataloader really is just an iterator. it's not very "smart"
# it's not going to give you validation data just because you named it val_dataloader
# so the distinction will need to happen when you create val_data, which means that
# you need to either build in a mechanism internally in the CustomImageDataset to
# parse for `valid` category inside the csv (if you are just providing the file path
# to a mega csv) or you can pass it an already subsetted csv or df!

# The augmentations in `transforms` are random, so only the training split
# receives them. Validation and test images are left untransformed here;
# once you have a deterministic (resize / normalize only) transform, pass it
# as the third argument for those two splits.
training_data = CustomImageDataset('../data/data_train.csv', '../data', transforms)
val_data = CustomImageDataset('../data/data_val.csv', '../data')
test_data = CustomImageDataset('../data/data_test.csv', '../data')

# Shuffle only the training data: evaluation should see the samples in a
# fixed order so that results are reproducible from run to run.
# NOTE(review): the default collate function stacks the images of a batch, so
# all images in a batch must share one shape -- add a resize before iterating.
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)

In [None]:
# you don't have to implement a DataModule for your first checkpoint
# but you might find it helpful to revisit this for your final submission!

# DataModule comes from PyTorch Lightning, which is a library that 
# provides a streamlined, high-level interface for PyTorch. PyTorch Lightning
# abstracts away a lot of the nitty gritty details that you have to 
# handle (such as moving data between your CPUs and GPUs, implementing
# iteration, etc) and makes your machine learning project simpler

# DataModule encapsulates the five steps involved in data processing in PyTorch:
#   - Download / tokenize / process.
#   - Clean and (maybe) save to disk.
#   - Load inside Dataset.
#   - Apply transforms (rotate, tokenize, etc…).
#   - Wrap inside a DataLoader.


In [None]:
# again, there are several methods you have to implement:
# __init__, setup, train_dataloader, val_dataloader
# and test_dataloader

# prepare_data is a method you will commonly see included in
# a data module. It is only necessary if you are downloading the
# data using your DM

In [None]:
# WARNING: a few months back, PyTorch Lightning became Lightning
# so you will see all of these:
# import lightning as L
# import lightning.pytorch as pl
# import pytorch_lightning as pl

# depending on the version of lightning or pytorch_lightning you
# are using, all three are valid for now! but the company wants
# people to move over to `import lightning as L`

In [None]:
import lightning as L 

class OurDataModule(L.LightningDataModule):
    """
    Bundles the train / val / test datasets and their DataLoaders.

    Expects `data_dir` to contain `data_train.csv`, `data_val.csv` and
    `data_test.csv`, with the image paths inside each csv relative to
    `data_dir` (the same layout used when the datasets are built by hand
    earlier in this notebook).
    """

    def __init__(self, data_dir, batch_size, transform, eval_transform=None):
        """
        Inputs:
            - data_dir (str): directory holding the csv files and the images
            - batch_size (int): batch size used by all three DataLoaders
            - transform: transform applied to TRAINING images (augmentation)
            - eval_transform: optional transform applied to validation and
              test images; defaults to None (images are left as read)
        """
        # run the parent initializer so Lightning can set up its own state
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.transform = transform
        self.eval_transform = eval_transform

        # these attributes will be handled by `setup`
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def _build_dataset(self, split, transform):
        """Creates the CustomImageDataset for one split ('train', 'val' or 'test')."""
        csv_file = os.path.join(self.data_dir, f"data_{split}.csv")
        return CustomImageDataset(
            csv_file=csv_file,
            img_dir_path=self.data_dir,
            transform=transform,
        )

    def setup(self, stage=None):
        """
        Builds the datasets needed for the requested stage.

        Parameter: stage (str or None): 'fit', 'validate', 'test' or
        'predict', as passed in by the Lightning Trainer; None builds all
        three datasets.
        """
        if stage in (None, "fit"):
            self.train_dataset = self._build_dataset("train", self.transform)
        if stage in (None, "fit", "validate"):
            self.val_dataset = self._build_dataset("val", self.eval_transform)
        if stage in (None, "test"):
            self.test_dataset = self._build_dataset("test", self.eval_transform)

    def train_dataloader(self):
        """Returns the training DataLoader (reshuffled every epoch)."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Returns the validation DataLoader (fixed order)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Returns the test DataLoader (fixed order)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)