In [2]:
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile
import pydicom as dicom
import cv2
import mdai
import json
from collections import Counter
from pathlib import Path

In [3]:
# set data directory here
savepath = 'data_sev'
# both split folders must exist before any image is copied into them
Path(savepath, 'test').mkdir(parents=True, exist_ok=True)
Path(savepath, 'train').mkdir(parents=True, exist_ok=True)

# seed both numpy's and Python's RNG so all runs are the same
seed = 0
np.random.seed(seed)
random.seed(seed)
MAXVAL = 255  # Range [0 255]

# COVIDxSev requires the path to the ricord annotations to also be downloaded
ricord_annotations = 'create_ricord_dataset/1c_mdai_rsna_project_MwBeK3Nr_annotations_labelgroup_all_2021-01-08-164102.json'

# path to ricord covid-19 images created by create_ricord_dataset/create_ricord_dataset.ipynb
# run create_ricord_dataset.ipynb before this notebook
ricord_imgpath = 'create_ricord_dataset/ricord_images'
ricord_txt = 'create_ricord_dataset/ricord_data_set.txt'
ricord_studyids = 'create_ricord_dataset/ricord_patientid_to_studyid_mapping.json'



# parameters for COVIDx dataset
# per-split sample lists and per-split counters keyed by severity label
train = []
test = []
test_count = dict.fromkeys(('level1', 'level2', 'NA'), 0)
train_count = dict.fromkeys(('level1', 'level2', 'NA'), 0)



# to avoid duplicates
patient_imgpath = {}

In [4]:
# airspace-grading label name -> severity class used for the split files
# ('NA' marks studies that carry no usable grading)
mapping = {
    'Mild Opacities  (1-2 lung zones)': 'level1',
    'Moderate Opacities (3-4 lung zones)': 'level2',
    'Severe Opacities (>4 lung zones)': 'level2',
    'Invalid Study': 'NA',
}

classification = [
    "Typical Appearance",
    "Indeterminate Appearance",
    "Atypical Appearance",
    "Negative for Pneumonia",
]
# the grading label names are exactly the keys of `mapping`, in the same order
airspace_Disease_Grading = list(mapping)

        
        
def get_label_study(annotations_df, studyid):
    """Return the majority severity class among one study's annotations.

    Args:
        annotations_df: mapping whose "annotations" entry is a DataFrame with
            "StudyInstanceUID" and "labelName" columns.
        studyid: value compared against the "StudyInstanceUID" column.

    Returns:
        The most frequent value of ``mapping[labelName]`` over the study's
        rows whose label name is a key of the module-level ``mapping``;
        'NA' when the study has no such row.
    """
    annotation_table = annotations_df["annotations"]
    in_study = annotation_table["StudyInstanceUID"] == studyid
    study_label_names = annotation_table.loc[in_study, "labelName"]

    # keep only grading labels, already translated to their severity class
    severity_votes = [mapping[name] for name in study_label_names if name in mapping]
    if not severity_votes:
        return 'NA'
    return Counter(severity_votes).most_common()[0][0]


In [5]:
# Label every RICORD image with a severity class.
# filename_label: severity class -> list of [patientid, imagename, severity, source]
# count:          severity class -> number of images
filename_label = {'level1': [],'level2': [], 'NA': []}
count = {'level1': 0,'level2': 0, 'NA':0}
covid_ds = {'ricord': []}

# get ricord file names: first whitespace-separated token of each line
# (blank lines are skipped instead of raising IndexError)
with open(ricord_txt) as f:
    ricord_file_names = [line.split()[0] for line in f if line.strip()]

# get study ids for every patientid
with open(ricord_studyids, 'r') as f:
    studyids = json.load(f)

# load ricord annotations
annotations = mdai.common_utils.json_to_dataframe(ricord_annotations)

for imagename in ricord_file_names:
    name_parts = imagename.split('-')
    patientid = name_parts[3] + '-' + name_parts[4]
    study_uuid = name_parts[-2]

    # get complete study id from ricord_studyids json file to match to labels stored in ricord annotations
    for studyid in studyids[patientid]:
        if studyid[-5:] == study_uuid:
            severity_level = get_label_study(annotations, studyid)
            break
    else:
        # No study id ends with this image's suffix. Without this branch the
        # name would be undefined (first image) or still hold the previous
        # image's label, silently mislabelling this one.
        severity_level = 'NA'
    count[severity_level] += 1
    entry = [patientid, imagename, severity_level, 'ricord']
    filename_label[severity_level].append(entry)

    covid_ds['ricord'].append(patientid)

print('Data distribution from covid datasets:')
print(count)

FileNotFoundError: [Errno 2] No such file or directory: 'create_ricord_dataset/ricord_data_set.txt'

In [76]:
# Write images into train and test directories accordingly

# Start from empty splits: this cell appends to these containers, so without
# the reset every re-run would duplicate the entries and double the counts.
train = []
test = []
train_count = dict.fromkeys(filename_label, 0)
test_count = dict.fromkeys(filename_label, 0)

# get test patients from label file (first token of each non-blank line);
# a set keeps the per-image membership check constant-time
with open('labels/test_COVIDxSev.txt', 'r') as f:
    test_patients = {line.split()[0] for line in f if line.strip()}

for label, images in filename_label.items():
    # Skip all studies that do not have an airspace grading
    if label == 'NA':
        continue
    for image in images:
        patientid = image[0]
        # every image of a test patient goes to test, everything else to train
        if patientid in test_patients:
            split_name, split_entries, split_count = 'test', test, test_count
        else:
            split_name, split_entries, split_count = 'train', train, train_count
        copyfile(os.path.join(ricord_imgpath, image[1]), os.path.join(savepath, split_name, image[1]))
        split_entries.append(image)
        split_count[image[2]] += 1

print('test count: ', test_count)
print('train count: ', train_count)

test count:  {'level1': 52, 'level2': 98, 'NA': 0}
train count:  {'level1': 174, 'level2': 585, 'NA': 0}


In [77]:
# final stats: per-class counts and total number of samples in each split
print('Final stats')
for stat_caption, stat_value in (
    ('Train count: ', train_count),
    ('Test count: ', test_count),
    ('Total length of train: ', len(train)),
    ('Total length of test: ', len(test)),
):
    print(stat_caption, stat_value)

Final stats
Train count:  {'level1': 174, 'level2': 585, 'NA': 0}
Test count:  {'level1': 52, 'level2': 98, 'NA': 0}
Total length of train:  759
Total length of test:  150


In [78]:
# export to train and test files
# one sample per line: patientid, filename, label and source dataset, separated by a space
# where label is either "level1" for mild air space grading or "level2" for moderate and severe grading
for split_path, split_samples in (("train_split.txt", train), ("test_split.txt", test)):
    with open(split_path, 'w') as split_file:
        for sample in split_samples:
            fields = (str(sample[0]), sample[1], sample[2], sample[3])
            split_file.write(' '.join(fields) + '\n')