# COVIDx-CT Dataset Constructor
This notebook constructs the COVIDx-CT dataset from scratch using the raw data. See [docs/dataset.md](docs/dataset.md) for more details on manual steps which must be completed beforehand.

In [1]:
import os
import cv2
import glob
import numpy as np

from dataset_construction.utils import CLASS_MAP

## Setting Paths
Paths to the source data and output location should be set here. Note that the window width and window level for processing scans in Hounsfield units are defined in [data_utils.py](data_utils.py) as `HU_WINDOW_WIDTH = 1500` and `HU_WINDOW_LEVEL = -600`.

In [2]:
# CNCB AI Diagnosis paths: exclusion list file and raw data directory
CNCB_EXCLUDE_FILE = 'dataset_construction/metadata/cncb_exclude_list.txt'
CNCB_DIR = 'data/CNCB_AI_Diagnosis'

# Radiopaedia/coronacases segmentation data paths: CT scan directory and
# infection mask (segmentation) directory
RADIOPAEDIA_CORONACASES_CT_DIR = 'data/Coronacases_Radiopaedia/COVID-19-CT-Seg_20cases'
RADIOPAEDIA_CORONACASES_SEG_DIR = 'data/Coronacases_Radiopaedia/Infection_Mask'

# LIDC-IDRI paths (only a metadata CSV is configured here; no data directory)
LIDC_META_CSV = 'dataset_construction/metadata/lidc_idri_metadata.csv'

# COVID-19-20 paths
COVID_19_20_DIR = 'data/COVID-19-20_v2/Train'

# COVID-CTset paths: patient metadata CSV and image directory.
# NOTE: the '&' characters and the 'Detailes' spelling are part of the path
# strings as written; presumably they mirror the dataset's own layout -- confirm
# before "correcting" them.
COVID_CTSET_META_CSV = 'data/COVID-CTset/Labels&Detailes/Patient_details.csv'
COVID_CTSET_DIR = 'data/COVID-CTset/Train&Validation'

# Output directory path (also used by the verification cell to locate files)
OUTPUT_DIR = 'data/COVIDx-CT'  # directory to save the images in

# Make output directory; exist_ok=True means re-running this cell does not
# fail when the directory already exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Data Preparation
These cells process the data from each source.

In [3]:
# Running accumulators shared by the processing cells below:
# `filenames` collects the returned file names, `classes` the matching labels.
# Re-run this cell before re-running any processing cell, otherwise entries
# are appended a second time.
filenames = []
classes = []

In [4]:
# Process CNCB AI Diagnosis data and append the returned file names and class
# labels to the running lists (appends in place; re-running duplicates entries)
from dataset_construction import cncb

cncb_files, cncb_labels = cncb.process_cncb_data(
    CNCB_EXCLUDE_FILE, CNCB_DIR, OUTPUT_DIR)
filenames += cncb_files
classes += cncb_labels

100%|███████████████████████████████████████████████████████████████████████| 103981/103981 [00:04<00:00, 23360.60it/s]


In [5]:
# Process radiopaedia and coronacases data and append the returned file names
# and class labels to the running lists (appends in place; re-running
# duplicates entries)
from dataset_construction import radiopaedia_coronacases as rc

rc_files, rc_labels = rc.process_radiopaedia_and_coronacases_seg_data(
    RADIOPAEDIA_CORONACASES_CT_DIR,
    RADIOPAEDIA_CORONACASES_SEG_DIR,
    OUTPUT_DIR,
)
filenames += rc_files
classes += rc_labels

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:45<00:00,  2.27s/it]


In [6]:
# Process LIDC-IDRI data and append the returned file names and class labels
# to the running lists (appends in place; re-running duplicates entries)
from dataset_construction import lidc_idri

lidc_files, lidc_labels = lidc_idri.process_lidc_idri_data(
    LIDC_META_CSV, OUTPUT_DIR)
filenames += lidc_files
classes += lidc_labels

100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [02:00<00:00,  3.09s/it]


In [7]:
# Process COVID-19-20 challenge data and append the returned file names and
# class labels to the running lists (appends in place; re-running duplicates
# entries)
from dataset_construction import covid_19_20

c1920_files, c1920_labels = covid_19_20.process_covid_19_20_data(
    COVID_19_20_DIR, OUTPUT_DIR)
filenames += c1920_files
classes += c1920_labels

100%|████████████████████████████████████████████████████████████████████████████████| 199/199 [03:37<00:00,  1.09s/it]


In [8]:
# Process COVID-CTset data and append the returned file names and class labels
# to the running lists (appends in place; re-running duplicates entries)
from dataset_construction import covid_ctset

ctset_files, ctset_labels = covid_ctset.process_covid_ctset_data(
    COVID_CTSET_META_CSV, COVID_CTSET_DIR, OUTPUT_DIR)
filenames += ctset_files
classes += ctset_labels

100%|████████████████████████████████████████████████████████████████████████████████| 377/377 [00:11<00:00, 33.54it/s]


In [9]:
# Print image counts: tally how often each label value occurs in `classes`,
# then report the tally for every entry of CLASS_MAP. The count is selected
# with a boolean mask, so it prints as a one-element array (or an empty array
# when a class has no images).
label_values, label_counts = np.unique(classes, return_counts=True)
print('Image Counts')
for class_name in CLASS_MAP:
    class_id = CLASS_MAP[class_name]
    selected = label_values == class_id
    print('{}: {}'.format(class_name, label_counts[selected]))

Image Counts
Normal: [59533]
Pneumonia: [36839]
COVID-19: [29819]


## Verification
Optionally check that every file listed in the split files for the chosen dataset version exists in the output directory.

In [12]:
# Verify that every file named in the split files of the selected dataset
# version exists under OUTPUT_DIR.
dataset_version = 'v2'  # may be changed to check for different versions
split_files = glob.glob(os.path.join('splits/' + dataset_version, '*_COVIDx-CT_v*.txt'))
if not split_files:
    raise ValueError('Split files for COVIDx-CT {} not found'.format(dataset_version))

count = 0    # files listed in the splits that exist on disk
total = 0    # files listed in the splits
# glob order is arbitrary; sort so the "Missing" report is reproducible
for split_file in sorted(split_files):
    with open(split_file, 'r') as f:
        for line in f:  # stream the file instead of loading every line up front
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)

            # The first whitespace-separated field is the image file name
            fname = fields[0]
            total += 1
            if os.path.exists(os.path.join(OUTPUT_DIR, fname)):
                count += 1
            else:
                print('Missing', fname)

missing = total - count
incomplete = missing > 0
if incomplete:
    # Report the number of absent files, not the number that were found
    print('{}/{} files are missing, dataset is incomplete!'.format(missing, total))
else:
    print('{}/{} files created, dataset successfully constructed!'.format(count, total))

126191/126191 files created, dataset successfully constructed!
