<a href="https://colab.research.google.com/github/j6k4m8/CIS522-DL-Tribiotics/blob/main/Dataset_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## CIS 522 Final Project - DL Interpretability in COVID X-ray


**Team**: Tribiotics\
**Team Members**: Trevor Chan, Jordan Matelsky, Jiazhen Rong

### Setup

In [1]:
import torch
import torch.nn as nn
from torch.utils import data
from torchvision.models import vgg19
from torchvision import transforms
from torchvision import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torchvision.io import read_image
from torch.utils.data import DataLoader
import random
import IPython
import torchvision
import cv2
from PIL import Image

In [2]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [3]:
import opendatasets as od

In [4]:
# Download the COVID-19 Radiography Database (lung X-rays) from Kaggle.
# The call prompts for a Kaggle username and API key; for how to obtain them see
# https://www.analyticsvidhya.com/blog/2021/04/how-to-download-kaggle-datasets-using-jupyter-notebook/
od.download("https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database/download")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: jordanmatelsky
Your Kaggle Key: ··········
Downloading covid19-radiography-database.zip to ./covid19-radiography-database


100%|██████████| 778M/778M [00:15<00:00, 54.2MB/s]





In [5]:
# Build an annotation table (image path, class label, mask path) for the whole
# dataset and write reproducible 80/10/10 train/validation/test split files.
img_dir="/content/covid19-radiography-database/COVID-19_Radiography_Dataset/"


def _list_sorted(subdir):
    """Return the full paths of the files in ``img_dir + subdir``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order, so the listing is sorted
    to make the result deterministic and to keep the image list and the mask
    list of a class aligned row by row.
    """
    folder = img_dir + subdir
    return [folder + name for name in sorted(os.listdir(folder))]


# image folders
covid_images = _list_sorted('COVID/images/')
normal_images = _list_sorted('Normal/images/')
pneumonia_images = _list_sorted('Viral Pneumonia/images/')
# masking folders
# NOTE(review): row-wise pairing assumes every mask has the same file name as
# its image, so both sorted listings are in the same order — TODO confirm.
covid_mask = _list_sorted('COVID/masks/')
normal_mask = _list_sorted('Normal/masks/')
pneumonia_mask = _list_sorted('Viral Pneumonia/masks/')

# create label: 0 = COVID, 1 = Normal, 2 = Viral Pneumonia
meta_df = pd.DataFrame(data={'data_path':covid_images+normal_images + pneumonia_images,
                             'label':[0]*len(covid_images) + [1]*len(normal_images) + [2]*len(pneumonia_images),
                              'masking': covid_mask+normal_mask + pneumonia_mask})
meta_df.to_csv(img_dir+"all_file_label.txt",sep='\t')

# split into 80% train, 10% validation and 10% test files
random.seed(522)
# The split is drawn with NumPy, which random.seed() does not affect, so a
# dedicated seeded generator is used to make the split reproducible.
split_rng = np.random.default_rng(522)
n_total = meta_df.shape[0]
n_train = int(n_total*0.8)
n_val = int(n_total*0.1)
# One permutation sliced three ways gives disjoint index sets by construction.
shuffled_idx = split_rng.permutation(n_total)
train_idx = shuffled_idx[:n_train]
val_idx = shuffled_idx[n_train:n_train+n_val]
test_idx = shuffled_idx[n_train+n_val:]  # everything left over (~10%)
meta_df.iloc[train_idx,:].to_csv(img_dir+"train_split.txt",sep='\t')
meta_df.iloc[val_idx,:].to_csv(img_dir+"val_split.txt",sep='\t')
meta_df.iloc[test_idx,:].to_csv(img_dir+"test_split.txt",sep='\t')

### Data Pre-processing

In [6]:
# customized data loader to save memory
class CustomImageDataset(Dataset):
    """Image dataset that loads files lazily from a tab-separated annotation file.

    The annotation file is read with its first column as the index; the
    remaining columns are accessed by position: 0 = image path,
    1 = integer label (0 covid, 1 normal, 2 pneumonia), 2 = mask path.

    Args:
        annotations_file: Path to the tab-separated annotation file.
        transform: Callable applied to the image when ``train`` is False
            (validation / test). Also used as a fallback when ``train`` is
            True but no ``train_transform`` was given.
        train_transform: Callable applied to the image when ``train`` is True.
        target_transform: Optional callable applied to the label.
        mask_transform: Callable applied to the image before masking so that
            it has the same spatial size as the mask.
        train: Selects ``train_transform`` (True) or ``transform`` (False).
        masking: If True, keep only the pixels inside the mask.
        antimasking: If True (and ``masking`` is False), keep only the pixels
            outside the mask.
    """

    def __init__(self, annotations_file, transform=None, train_transform=None,target_transform=None,mask_transform=None,train=True,masking=False, antimasking=False):
        self.annotations_file = pd.read_csv(annotations_file,sep='\t',index_col=0)
        self.transform = transform
        self.train_transform = train_transform
        self.train = train
        # Stored under the same name __getitem__ reads; the previous spelling
        # ("target_tranform") made any supplied target_transform raise
        # AttributeError on first access.
        self.target_transform = target_transform
        self.masking = masking
        self.antimasking = antimasking
        self.mask_transform = mask_transform

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return len(self.annotations_file)

    def _apply_mask(self, image, mask_path):
        """Return ``image`` with the mask (or its complement) applied."""
        mask = read_image(mask_path)/255  # scale mask values to [0, 1]
        if self.mask_transform is not None:
            # resize image to be same size as mask
            image = self.mask_transform(image)
        # masking takes precedence over antimasking, as before
        keep = mask if self.masking else 1 - mask
        masked = image * keep
        if not image.is_floating_point():
            # Multiplying by a float mask promotes the image to float while
            # its values stay on the 0-255 scale. Cast back to the original
            # integer dtype so later transforms (e.g. ToPILImage) do not
            # treat it as a [0, 1] float image and rescale it.
            masked = masked.round().to(image.dtype)
        return masked

    def __getitem__(self, idx):
        """Load, optionally mask, and transform sample ``idx``.

        Returns:
            ``(image, label)`` after the configured transforms.
        """
        img_path = self.annotations_file.iloc[idx, 0] # image
        image = read_image(img_path)

        ## Change 1xHxW images to 3xHxW to keep consistency between datasets.
        if image.shape[0] == 1:
            image = torch.tile(image,(3,1,1))

        # apply masking on image before any other transformation; the mask
        # file is only read when it is actually needed
        if self.masking or self.antimasking:
            mask_path = self.annotations_file.iloc[idx, 2] # masking
            image = self._apply_mask(image, mask_path)

        label = self.annotations_file.iloc[idx, 1] # image label, 0 for covid, 1 for normal, 2 for pneumonia

        if self.train and self.train_transform is not None:
            # training case: additional (augmenting) transformation
            image = self.train_transform(image)
        elif self.transform is not None:
            # validation / test case, or training without a train_transform
            image = self.transform(image)
        if self.target_transform is not None:
            label = self.target_transform(label)

        return image, label

##### Without masking, with masking, and with anti-masking

In [7]:
# Annotation files: the full table plus the three split files.
annotation_file = img_dir + "all_file_label.txt"
train_file, val_file, test_file = (
    img_dir + split_name
    for split_name in ("train_split.txt", "val_split.txt", "test_split.txt")
)

# Evaluation-time pipeline (validation / test): resize only, no augmentation.
# Resize values can be changed here for models with a different input size.
# No Normalize step is applied.
all_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Bring the image to the same size as its mask before the two are multiplied.
mask_transform = transforms.Compose([
    transforms.Resize((256, 256)),
])

# Training pipeline: same as above plus a small random rotation.
# Further augmentations for training images can be added here.
train_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomRotation(degrees=5),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])


def _build_splits(masking, antimasking):
    """Create the (train, val, test) datasets for one masking configuration.

    Only the training dataset receives ``train_transforms``; the validation
    and test datasets are built with ``train=False`` and use ``all_transforms``.
    """
    shared = dict(
        transform=all_transforms,
        masking=masking,
        antimasking=antimasking,
        mask_transform=mask_transform,
    )
    train_set = CustomImageDataset(
        annotations_file=train_file,
        train_transform=train_transforms,
        train=True,
        **shared,
    )
    val_set = CustomImageDataset(annotations_file=val_file, train=False, **shared)
    test_set = CustomImageDataset(annotations_file=test_file, train=False, **shared)
    return train_set, val_set, test_set


# Unmodified images.
full_training_data, full_val_data, full_test_data = _build_splits(
    masking=False, antimasking=False
)

# Images with the mask applied.
mask_training_data, mask_val_data, mask_test_data = _build_splits(
    masking=True, antimasking=False
)

# Images with the inverse of the mask applied.
anti_training_data, anti_val_data, anti_test_data = _build_splits(
    masking=False, antimasking=True
)

In [8]:
def _shuffled_loader(dataset):
    """Wrap ``dataset`` in a shuffling DataLoader with batches of 64."""
    return DataLoader(dataset, batch_size=64, shuffle=True)


# One (train, val, test) loader triple per dataset variant.
full_train_dataloader, full_val_dataloader, full_test_dataloader = map(
    _shuffled_loader, (full_training_data, full_val_data, full_test_data)
)

mask_train_dataloader, mask_val_dataloader, mask_test_dataloader = map(
    _shuffled_loader, (mask_training_data, mask_val_data, mask_test_data)
)

anti_train_dataloader, anti_val_dataloader, anti_test_dataloader = map(
    _shuffled_loader, (anti_training_data, anti_val_data, anti_test_data)
)

In [23]:
# Class names indexed by integer label.
classes = ('COVID', 'Normal', 'Pneumonia')

def render_classes():
    """Plot one training image per class, in label order 0, 1, 2.

    Iterates over ``full_train_dataloader`` and shows the first sample found
    for label 0, then the next sample found for label 1, then for label 2.
    Only the first channel of each image is drawn, in greyscale. Returns as
    soon as the third image has been shown.
    """
    wanted = 0  # label of the next class still to be shown
    for batch_images, batch_labels in full_train_dataloader:
        for sample, target in zip(batch_images, batch_labels):
            if wanted != target:
                continue
            plt.title(classes[wanted])
            plt.imshow(sample[0,:,:], cmap='Greys_r')
            plt.show()
            wanted += 1
            if wanted == 3:
                return

render_classes()

In [29]:
# Class names indexed by integer label: 0 = COVID, 1 = Normal, 2 = Pneumonia.
classes = ('COVID', 'Normal', 'Pneumonia')


In [28]:
# Number of annotation rows per label (0 = COVID, 1 = Normal, 2 = Viral Pneumonia).
meta_df.groupby("label").count()

Unnamed: 0_level_0,data_path,masking
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3616,3616
1,10192,10192
2,1345,1345
