# COVID-19 X-ray

In [30]:
import pandas as pd
import os
import cv2
import random
import pickle

from tqdm import tqdm

In [44]:
# Side length, in pixels, that every X-ray is resized to (square images).
IMAGE_SIZE = 224
# Number of target classes: 4 selects the four-way task, any other value
# falls through to the binary COVID-19 / non-COVID-19 task.
CLASSES = 4

# FOLD is the replication factor applied to the COVID-19 training images
# when the training set is gathered further down.
FOLD = 8 if CLASSES == 4 else 24

# The position of a name in `categories` is its integer label.
categories = (
    ["COVID-19", "Viral", "Bacterial", "Normal"]  # label: 0, 1, 2, 3
    if CLASSES == 4
    else ["COVID-19", "non-COVID-19"]  # label: 0, 1
)

In [45]:
# path to covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset
covid_image_path = './Dataset/covid-chestxray-dataset-master/images/'
covid_csv_path = './Dataset/covid-chestxray-dataset-master/metadata.csv'

# path to CellData
xray_image_path = './Dataset/CellData/chest_xray/'

# select COVID-19 x-rays from ieee dataset: the finding must mention COVID-19,
# the modality must be X-ray, and rows whose view is "L" are dropped
covid_csv = pd.read_csv(covid_csv_path)
image_idx = (
    # na=False: a row with a missing "finding" is explicitly not a match,
    # instead of putting NaN into the boolean mask
    covid_csv["finding"].str.contains("COVID-19", na=False)
    & (covid_csv["modality"] == "X-ray")
    & (covid_csv["view"] != "L")
)
covid_csv = covid_csv[image_idx]

# read COVID-19 images and resize each to IMAGE_SIZE x IMAGE_SIZE
covid_image = []
for filename in covid_csv['filename']:
    imagefile = cv2.imread(covid_image_path + filename)
    if imagefile is None:
        # cv2.imread signals a missing/undecodable file by returning None;
        # fail here with the offending path rather than inside cv2.resize
        raise IOError('Could not read image: ' + covid_image_path + filename)
    imagefile = cv2.resize(imagefile, (IMAGE_SIZE, IMAGE_SIZE))
    covid_image.append(imagefile)

In [46]:
# Train/test split for the COVID-19 images.
# The first `split_idx` images of the shuffled list go to train, the rest to test.
split_idx = 172

# Shuffle in place with a generator seeded with 0 so the split is repeatable.
random.Random(0).shuffle(covid_image)  # random seed: 0

covid19_image = {
    'train': covid_image[:split_idx],
    'test': covid_image[split_idx:],
}

In [47]:
# read x-ray images from the CellData train/test folders into per-class lists,
# each image resized to IMAGE_SIZE x IMAGE_SIZE

viral_image = {'train': [], 'test': []}
bacterial_image = {'train': [], 'test': []}
normal_image = {'train': [], 'test': []}

for label in ["train", "test"]:
    for category in ["NORMAL", "PNEUMONIA"]:
        dirs = xray_image_path + label + '/' + category
        # os.listdir returns entries in arbitrary order; sorting makes the
        # list order (and therefore the seeded shuffle applied later)
        # independent of the file system. The .DS_Store metadata file is
        # not an image and is dropped up front.
        files = sorted(name for name in os.listdir(dirs) if name != '.DS_Store')
        for name in tqdm(files):
            filesname = dirs + '/' + name
            imagefile = cv2.imread(filesname)
            if imagefile is None:
                # cv2.imread returns None for a missing/undecodable file;
                # report the path instead of failing inside cv2.resize
                raise IOError('Could not read image: ' + filesname)
            imagefile = cv2.resize(imagefile, (IMAGE_SIZE, IMAGE_SIZE))
            if category == "NORMAL":
                normal_image[label].append(imagefile)
            elif "BACTERIA" in name:
                # test the file name only, so a directory named e.g.
                # ".../BACTERIA/..." cannot change the class of every image
                bacterial_image[label].append(imagefile)
            else:
                # remaining PNEUMONIA files are treated as viral
                viral_image[label].append(imagefile)

100%|██████████| 1349/1349 [00:25<00:00, 53.65it/s]
100%|██████████| 3884/3884 [00:23<00:00, 164.56it/s]
100%|██████████| 235/235 [00:03<00:00, 59.27it/s]
100%|██████████| 390/390 [00:02<00:00, 193.06it/s]


In [48]:
# Gather trainset and testset
# Oversample the COVID-19 training images FOLD times. A new list is built
# (rather than `covid19_image['train'] *= FOLD`) so that re-running this
# cell does not multiply the already-oversampled list again. List
# repetition repeats references; the image arrays themselves are not copied.
covid19_train = covid19_image['train'] * FOLD

train_image = (
    covid19_train
    + viral_image['train']
    + bacterial_image['train']
    + normal_image['train']
)

test_image = (
    covid19_image['test']
    + viral_image['test']
    + bacterial_image['test']
    + normal_image['test']
)

# Labels are built in the same concatenation order as the images:
# 0 = COVID-19, 1 = viral, 2 = bacterial, 3 = normal.
train_label = (
    [0] * len(covid19_train)
    + [1] * len(viral_image['train'])
    + [2] * len(bacterial_image['train'])
    + [3] * len(normal_image['train'])
)

test_label = (
    [0] * len(covid19_image['test'])
    + [1] * len(viral_image['test'])
    + [2] * len(bacterial_image['test'])
    + [3] * len(normal_image['test'])
)

In [49]:
# Binary task: label 0 (COVID-19) is kept, every other label becomes 1.
if CLASSES == 2:
    train_label = [int(lbl != 0) for lbl in train_label]
    test_label = [int(lbl != 0) for lbl in test_label]

In [50]:
# Container for the final dataset; each key starts with its own empty list.
data = {
    key: []
    for key in ('train image', 'train label', 'test image', 'test label')
}

In [51]:
# Number of samples per category (category name -> count) in each split;
# a category's index in `categories` is the label value being counted.
train_counts = {name: train_label.count(idx) for idx, name in enumerate(categories)}
test_counts = {name: test_label.count(idx) for idx, name in enumerate(categories)}

print(train_counts)
print(test_counts)

{'COVID-19': 1376, 'Viral': 1345, 'Bacterial': 2538, 'Normal': 1349}
{'COVID-19': 60, 'Viral': 148, 'Bacterial': 242, 'Normal': 234}


In [52]:
# Pair every image with its label so both stay aligned when shuffled.
train_image_label_pair = [
    (image, label) for image, label in zip(train_image, train_label)
]
test_image_label_pair = [
    (image, label) for image, label in zip(test_image, test_label)
]

In [53]:
# Shuffle both pair lists in place; each gets its own generator seeded with 0.
for pair_list in (train_image_label_pair, test_image_label_pair):
    random.Random(0).shuffle(pair_list)  # random seed: 0

In [54]:
# Split the shuffled (image, label) pairs back into two parallel tuples.
train_image, train_label = (
    tuple(column) for column in zip(*train_image_label_pair)
)
test_image, test_label = (
    tuple(column) for column in zip(*test_image_label_pair)
)

In [55]:
# Store the shuffled splits in `data`, converting the tuples to lists.
data.update({
    'train image': list(train_image),
    'train label': list(train_label),
    'test image': list(test_image),
    'test label': list(test_label),
})

In [56]:
# Persist the dataset dict with pickle. The file name encodes the image size
# and the class count, e.g. ./Dataset/data224Class4.pkl.
with open('./Dataset/data{}Class{}.pkl'.format(IMAGE_SIZE, CLASSES), 'wb') as f:
    # The with-statement closes the file on exit, so no explicit f.close().
    pickle.dump(data, f)