### Download from https://github.com/ieee8023/covid-chestxray-dataset

Create a folder named `covid` inside the `dataset` directory, then place the `images` folder and the `metadata.csv` file in it.

In [None]:
# Import libraries

import os, sys
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from skimage.transform import resize
from skimage import io

# Project root is the parent of the notebook's working directory
ROOT_DIR = os.path.abspath(os.pardir)

# Dataset locations, all derived from the project root
DATASET_NAME = "covid"
DATA_PATH = os.path.join(ROOT_DIR, "dataset")
COVID_DATASET_PATH = os.path.join(ROOT_DIR, "dataset", DATASET_NAME)

# Make the project root importable before loading the local helpers module
sys.path.append(ROOT_DIR)
import helpers

#### Read labels

In [None]:
# Load the metadata and keep only rows that are complete for the columns of interest
metadata_file = os.path.join(COVID_DATASET_PATH, 'metadata.csv')
dt = pd.read_csv(metadata_file)[["finding", "view", "modality", "filename"]].dropna()

# Drop all CTs and keep only the samples labelled COVID-19
dt = dt[(dt.modality != "CT") & (dt.finding == "COVID-19")]

dt.head()

In [None]:
# Labels, kept two-dimensional with one "finding" column per sample
labels = dt.loc[:, ["finding"]].values

In [None]:
# File names of the selected images; preview the first ten
image_path = dt.filename.values
image_path[:10]

In [None]:
# Create the train/test output folders for the cov19 class
cxr_root = os.path.join(DATA_PATH, "cxr")
train_covid_images = os.path.join(cxr_root, "train", "cov19")
test_covid_images = os.path.join(cxr_root, "test", "cov19")

for split_dir in (train_covid_images, test_covid_images):
    helpers.create_directory(split_dir)

In [None]:
# The slicing index should be the minimum sample count across all the dataset classes

# Resolve every file name once, then cut the list at the train/test boundary
covid_image_dir = os.path.join(COVID_DATASET_PATH, "images")
all_imgs = [os.path.join(covid_image_dir, name) for name in image_path]
tr_imgs = all_imgs[:180]
test_imgs = all_imgs[180:]

len(tr_imgs), len(test_imgs)

In [None]:
import shutil

# Copy each image list into its destination folder (train first, then test)
for sources, destination in ((tr_imgs, train_covid_images), (test_imgs, test_covid_images)):
    for file in sources:
        shutil.copy(file, destination)

In [None]:
# COVID-19 images: 226 total -> 180 train + 46 test

### Data loader

In [None]:
# Classes for data loading and preprocessing
class COVIDChestXRayDataset:
    """Survival-labelled chest X-ray samples described by a metadata CSV.

    Rows missing a ``survival``, ``modality`` or ``filename`` value are
    dropped and CT rows are removed. The remaining samples are split 80/20
    with a fixed random seed, so a "train" and a "test" instance built from
    the same CSV expose complementary subsets.

    Indexing returns ``(img, gt)`` where ``img`` is a ``float32`` array of
    shape ``(256, 256, 1)`` and ``gt`` is the two-class one-hot label
    (index 1 when survival is ``"Y"``, index 0 otherwise).

    Args:
        datadir: Directory containing the image files named in the CSV.
        csv_path: Path to the metadata CSV file.
        flag: ``"train"`` selects the training subset; any other value
            selects the test subset.
    """

    def __init__(
            self,
            datadir,
            csv_path,
            flag,
    ):
        # Keep only usable rows: label, modality and file name all present,
        # and the scan is not a CT.
        csv = pd.read_csv(csv_path)
        csv = csv[["survival", "modality", "filename"]].dropna()
        csv = csv[csv.modality != "CT"]

        # Image names and their full paths
        self.image_names = csv["filename"].values
        all_labels = csv[["survival"]].values
        all_paths = [os.path.join(datadir, image_id) for image_id in self.image_names]

        # Deterministic split: the fixed random_state makes every instance
        # produce the same partition.
        train_vols, test_vols, train_labels, test_labels = train_test_split(
            all_paths, all_labels, test_size=0.20, random_state=42)
        print(len(train_vols), len(test_vols))

        self.train_vols = train_vols
        self.test_vols = test_vols
        self.train_labels = train_labels
        self.test_labels = test_labels

        if flag == "train":
            self.image_paths = self.train_vols
            self.labels = self.train_labels
        else:
            self.image_paths = self.test_vols
            self.labels = self.test_labels
        # `ids` aliases the selected paths; __len__ is computed from it.
        self.ids = self.image_paths

    @staticmethod
    def _ensure_single_channel(img):
        """Return *img* with shape (H, W, 1), keeping only the first channel.

        Raises:
            ValueError: If *img* has fewer than two dimensions.
        """
        # Taken from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814
        if img.ndim < 2:
            raise ValueError("error, dimension lower than 2 for image")
        if img.ndim > 2:
            img = img[:, :, 0]
        # Add color channel
        return img[:, :, None]

    def __getitem__(self, i):
        """Load, resize and label the sample at position *i*."""
        img = io.imread(self.image_paths[i])
        # resize() converts the image to floating point with skimage's
        # img_as_float conventions (integer images land in [0, 1]), so no
        # further division by 255 is applied; doing so would squash the
        # pixel values into [0, 1/255].
        img = resize(img, (256, 256)).astype(np.float32)
        img = self._ensure_single_channel(img)

        # Labels are stored as one-element rows; unwrap before comparing.
        survival = np.ravel(self.labels[i])[0]
        gt = 1 if survival == "Y" else 0  # 1 = Survival: Yes, 0 = Survival: No
        gt = keras.utils.to_categorical(gt, 2)

        return img, gt

    def __len__(self):
        """Return the number of samples in the selected subset."""
        return len(self.ids)

In [None]:
class Dataloder(keras.utils.Sequence):
    """Load data from dataset and form batches

    Only full batches are produced: ``len(dataset) // batch_size`` batches
    per epoch, so a trailing partial batch is dropped.

    Args:
        dataset: instance of Dataset class for image loading and preprocessing.
        batch_size: Integer number of images in batch.
        shuffle: Boolean, if `True` shuffle image indexes each epoch.
    """

    def __init__(self, dataset, batch_size=1, shuffle=False):
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(dataset))

        # Apply the initial shuffle (no-op when shuffle is False)
        self.on_epoch_end()

    def __getitem__(self, i):
        """Return batch number *i* as a list of stacked arrays.

        Each sample yielded by the dataset is a tuple; the batch holds one
        array per tuple position, stacked along a new leading axis.

        Raises:
            IndexError: If the batch extends past the available samples.
        """
        # collect batch data
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        # Go through self.indexes so the (possibly shuffled) order is
        # honoured; indexing the dataset with the raw position would make
        # `shuffle` have no effect.
        data = [self.dataset[self.indexes[j]] for j in range(start, stop)]

        # Transpose list of lists
        batch = [np.stack(samples, axis=0) for samples in zip(*data)]

        return batch

    def __len__(self):
        """Denotes the number of batches per epoch"""
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        """Callback function to shuffle indexes each epoch"""
        if self.shuffle:
            self.indexes = np.random.permutation(self.indexes)

In [None]:
# Metadata CSV located in the COVID dataset folder
CSV_PATH = os.path.join(COVID_DATASET_PATH, "metadata.csv")
CSV_PATH

In [None]:
# Images are read from the "images" sub-folder of the COVID dataset folder,
# the same location the image-copy cell reads from.
IMAGES_PATH = os.path.join(COVID_DATASET_PATH, "images")

# image path, csv path
train_dataset = COVIDChestXRayDataset(IMAGES_PATH, CSV_PATH, flag="train")
test_dataset = COVIDChestXRayDataset(IMAGES_PATH, CSV_PATH, flag="test")

In [None]:
# Fetch the first training sample and show the shapes of the image and its label
image, gt = train_dataset[0] 
image.shape, gt.shape

In [None]:
# Show the one-hot encoded label of the sample fetched above
gt

In [None]:
# Drop the singleton channel axis so the sample can be drawn as a 2-D grayscale image
image = np.squeeze(image)
plt.imshow(image, cmap='gray')

In [None]:
# Wrap both datasets in batch loaders (one sample per batch, shuffling requested)
train_dataloader = Dataloder(train_dataset, batch_size=1, shuffle=True)
test_dataloader = Dataloder(test_dataset, batch_size=1, shuffle=True)

In [None]:
# Number of batches per epoch in each loader
len(train_dataloader), len(test_dataloader)

In [None]:
# Sanity check: iterate the training loader and print each batch's array shapes
for batch_idx, (features, targets) in enumerate(train_dataloader):
    print(batch_idx, features.shape, targets.shape)

In [None]:
# Sanity check: iterate the test loader and print each batch's array shapes
for batch_idx, (features, targets) in enumerate(test_dataloader):
    print(batch_idx, features.shape, targets.shape)

In [None]:
# Define callbacks for learning rate scheduling, logging and best checkpoints saving
checkpoint_path = '{}/{}.h5'.format(LOG_PATH, EXPERIMENT_NAME)
training_log_path = '{}/training.csv'.format(LOG_PATH)

# Keep only the weights with the lowest validation loss
save_best_checkpoint = keras.callbacks.ModelCheckpoint(
    checkpoint_path, monitor='val_loss', save_best_only=True, mode='min')

# new_lr = lr * factor once val_loss has stalled for `patience` epochs
reduce_lr_on_plateau = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, verbose=1, patience=5, mode='min')

# Stop after 15 epochs without improvement and roll back to the best weights
stop_early = keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, verbose=1, patience=15, mode='min',
    restore_best_weights=True)

# Per-epoch metrics written to CSV
log_to_csv = keras.callbacks.CSVLogger(training_log_path)

callbacks = [save_best_checkpoint, reduce_lr_on_plateau, stop_early, log_to_csv]

In [None]:

import time

# NOTE(review): `model`, `EPOCHS` and `valid_dataloader` are not defined in the
# visible part of this notebook - confirm they exist before running this cell.
start_time = time.time()

# Train with one step per training batch and validate on every validation batch
history = model.fit_generator(
    train_dataloader,
    steps_per_epoch=len(train_dataloader),
    epochs=EPOCHS,
    callbacks=callbacks,
    validation_data=valid_dataloader,
    validation_steps=len(valid_dataloader),  # val samples = batch size * no of steps
)

end_time = time.time()
# True division keeps fractional hours; floor division reported 0.0 for any
# run shorter than one hour.
print("--- Time taken to train : %.2f hours ---" % ((end_time - start_time) / 3600))