### Import libraries

In [4]:
import sys
# Install pydicom with the interpreter that runs this kernel (sys.executable), so the
# package ends up in the same environment the notebook imports from.
!{sys.executable} -m pip install pydicom

Collecting pydicom
  Using cached pydicom-2.1.2-py3-none-any.whl (1.9 MB)
Installing collected packages: pydicom
Successfully installed pydicom-2.1.2


In [17]:
# Install OpenCV (imported later as cv2) with the kernel's own interpreter.
# `sys` must already have been imported by an earlier cell.
!{sys.executable} -m pip install opencv-python

Collecting opencv-python
  Using cached opencv_python-4.5.1.48-cp38-cp38-win_amd64.whl (34.9 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.5.1.48


In [1]:
import pandas as pd
import os
import shutil
from glob import glob
import pydicom as dicom
import cv2

In [2]:
# Source directory of the 'covid-chestxray-dataset', relative to the working directory.
cohen_dir = 'covid-chestxray-dataset'
# Metadata CSV and image folder inside that directory.
cohen_csv_path = f'./{cohen_dir}/metadata.csv'
cohen_img_path = f'./{cohen_dir}/images'

In [3]:
# Source directory of the 'Figure1-COVID-chestxray-dataset', relative to the working directory.
fig1_dir = 'Figure1-COVID-chestxray-dataset'
# Metadata CSV and image folder inside that directory.
fig1_csv_path = f'./{fig1_dir}/metadata.csv'
fig1_img_path = f'./{fig1_dir}/images'

In [4]:
# Source directory of the 'Actualmed-COVID-chestxray-dataset', relative to the working directory.
actmed_dir = 'Actualmed-COVID-chestxray-dataset'
# Metadata CSV and image folder inside that directory.
actmed_csv_path = f'./{actmed_dir}/metadata.csv'
actmed_img_path = f'./{actmed_dir}/images'

In [5]:
# Source directory of the 'COVID-19-Radiography-Database', relative to the working directory.
sirm_dir = 'COVID-19-Radiography-Database'
# Excel metadata sheet and image folder for the COVID class inside that directory.
sirm_cov_csv_path = f'./{sirm_dir}/COVID.metadata.xlsx'
sirm_cov_img_path = f'./{sirm_dir}/COVID'

In [6]:
# RSNA inputs, expected directly in the working directory.
# NOTE(review): `rsna_csv1_path` is not referenced by any cell visible in this notebook.
rsna_csv1_path = './stage_2_detailed_class_info.csv'
# Labels CSV iterated by the RSNA extraction cell (one `patientId` per row is read).
rsna_csv2_path = './stage_2_train_labels.csv'
# Folder holding the `<patientId>.dcm` files read by that cell.
rsna_img_path = './stage_2_train_images'

### Initialize directories, files and metrics variables

In [33]:
# Reset the output dataset so the extraction cells below start from a clean state:
#   * make sure `dataset/` and `dataset/images/` exist,
#   * delete every file left in `dataset/images/` by a previous run,
#   * rewrite `data_file.csv` so it only contains the header row,
#   * zero the running counters shared by the extraction cells.
dataset_dir = 'dataset'

images_dir = f'{dataset_dir}/images'
data_file_name = 'data_file.csv'

deletions = 0

if not os.path.exists(dataset_dir):
    os.mkdir(dataset_dir)
    print('Successfully created the \'dataset\' directory.')

if not os.path.exists(images_dir):
    os.mkdir(images_dir)
    print('Successfully created the \'images\' directory.')
else:
    files = glob(f'{images_dir}/*')
    for f in files:
        try:
            os.remove(f)
            deletions += 1
        except OSError as e:
            # Best effort: report what could not be removed and carry on.
            print('Error: %s : %s' % (f, e.strerror))
    print(f'Successfully deleted {deletions} images from the \'images\' directory')

# Count the rows that are really in the previous data file (header excluded) instead of
# assuming the number of CSV entries equals the number of deleted images.
previous_entries = 0
if os.path.exists(f'./{dataset_dir}/{data_file_name}'):
    with open(f'./{dataset_dir}/{data_file_name}', 'r') as old_data_file:
        previous_entries = max(sum(1 for _ in old_data_file) - 1, 0)

# Opening in 'w' mode truncates the file; `with` closes it even if the write fails.
with open(f'./{dataset_dir}/{data_file_name}', 'w') as data_file:
    data_file.write('filename,diagnosis\n')
print(f'Successfully deleted {previous_entries} entries from \'{data_file_name}\'')

# Running totals updated by every extraction cell; `xray_index` is the next free
# number used in the `x-ray-<n>.<ext>` file names.
covid_positives = 0
covid_negatives = 0
xray_index = 0

Successfully deleted 185 images from the 'images' directory
Successfully deleted 185 entries from 'data_file.csv'


### Extract data from 'covid-chestxray-dataset'

In [34]:
# Copy every image listed in the Cohen metadata (except rows whose folder is 'volumes')
# into `images_dir` under a sequential `x-ray-<n>.<ext>` name, and append one
# `filename,diagnosis` row per copied image to the data file.
cohen_csv = pd.read_csv(cohen_csv_path, nrows=None)
print(cohen_csv.shape)

# `with` guarantees the data file is flushed and closed even if a copy fails mid-loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in cohen_csv.iterrows():
        if row['folder'] == 'volumes':
            continue

        # A missing finding is read by pandas as a float NaN, which has no `.split()`.
        # Skip such rows, like the Figure1 and Actualmed cells skip rows without a finding.
        if pd.isna(row['finding']):
            continue

        image = row['filename']
        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy2(f'{cohen_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        # Findings are '/'-separated; only the last component is compared.
        finding = row['finding'].split('/')[-1]

        # NOTE(review): every finding other than 'COVID-19' is recorded as 'negative' --
        # confirm that this is intended for all finding values present in the metadata.
        if finding == 'COVID-19':
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

(950, 30)
x-ray-0.jpeg : OK
x-ray-1.jpeg : OK
x-ray-2.jpeg : OK
x-ray-3.jpeg : OK
x-ray-4.jpeg : OK
x-ray-5.jpeg : OK
x-ray-6.png : OK
x-ray-7.jpg : OK
x-ray-8.jpg : OK
x-ray-9.jpeg : OK
x-ray-10.jpeg : OK
x-ray-11.jpeg : OK
x-ray-12.jpeg : OK
x-ray-13.jpeg : OK
x-ray-14.jpeg : OK
x-ray-15.jpeg : OK
x-ray-16.jpeg : OK
x-ray-17.jpeg : OK
x-ray-18.jpeg : OK
x-ray-19.jpeg : OK
x-ray-20.jpg : OK
x-ray-21.jpeg : OK
x-ray-22.jpeg : OK
x-ray-23.jpeg : OK
x-ray-24.jpeg : OK
x-ray-25.jpeg : OK
x-ray-26.jpeg : OK
x-ray-27.jpeg : OK
x-ray-28.jpeg : OK
x-ray-29.jpeg : OK
x-ray-30.jpeg : OK
x-ray-31.jpeg : OK
x-ray-32.jpeg : OK
x-ray-33.jpeg : OK
x-ray-34.jpg : OK
x-ray-35.jpg : OK
x-ray-36.jpg : OK
x-ray-37.jpg : OK
x-ray-38.jpg : OK
x-ray-39.jpeg : OK
x-ray-40.jpg : OK
x-ray-41.jpg : OK
x-ray-42.jpg : OK
x-ray-43.jpg : OK
x-ray-44.jpg : OK
x-ray-45.jpg : OK
x-ray-46.jpg : OK
x-ray-47.jpg : OK
x-ray-48.jpg : OK
x-ray-49.jpg : OK
x-ray-50.jpg : OK
x-ray-51.jpg : OK
x-ray-52.jpg : OK
x-ray-53.png : 

x-ray-477.jpg : OK
x-ray-478.jpg : OK
x-ray-479.jpg : OK
x-ray-480.jpg : OK
x-ray-481.jpg : OK
x-ray-482.jpg : OK
x-ray-483.jpg : OK
x-ray-484.jpg : OK
x-ray-485.jpg : OK
x-ray-486.png : OK
x-ray-487.png : OK
x-ray-488.png : OK
x-ray-489.png : OK
x-ray-490.png : OK
x-ray-491.jpg : OK
x-ray-492.jpg : OK
x-ray-493.jpg : OK
x-ray-494.jpg : OK
x-ray-495.png : OK
x-ray-496.png : OK
x-ray-497.png : OK
x-ray-498.png : OK
x-ray-499.jpg : OK
x-ray-500.jpg : OK
x-ray-501.jpg : OK
x-ray-502.jpg : OK
x-ray-503.jpg : OK
x-ray-504.jpg : OK
x-ray-505.jpg : OK
x-ray-506.jpg : OK
x-ray-507.jpg : OK
x-ray-508.jpg : OK
x-ray-509.jpg : OK
x-ray-510.jpg : OK
x-ray-511.jpg : OK
x-ray-512.jpg : OK
x-ray-513.jpg : OK
x-ray-514.jpg : OK
x-ray-515.jpg : OK
x-ray-516.jpg : OK
x-ray-517.png : OK
x-ray-518.png : OK
x-ray-519.png : OK
x-ray-520.png : OK
x-ray-521.png : OK
x-ray-522.png : OK
x-ray-523.png : OK
x-ray-524.png : OK
x-ray-525.png : OK
x-ray-526.png : OK
x-ray-527.png : OK
x-ray-528.png : OK
x-ray-529.pn

### Extract data from 'Figure1-COVID-chestxray-dataset'

In [35]:
# Copy every Figure1 image that has a finding into `images_dir` under a sequential
# `x-ray-<n>.<ext>` name, and append one `filename,diagnosis` row per copied image.
fig1_csv = pd.read_csv(fig1_csv_path, encoding='ISO-8859-1', nrows=None)
print(fig1_csv.shape)

# `with` guarantees the data file is flushed and closed even if a copy fails mid-loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in fig1_csv.iterrows():
        finding = row['finding']

        # Test for a missing finding directly. The previous
        # `fig1_csv['finding'].fillna('NaN', inplace=True)` is a chained in-place call that
        # does not modify the frame under pandas Copy-on-Write, which would let rows
        # without a finding fall through and be labelled 'negative'.
        if pd.isna(finding):
            continue

        patient_id = row['patientid']

        # Images are looked up as <patientid>.jpg first, then <patientid>.png.
        # If neither exists, shutil.copy2 below raises FileNotFoundError.
        image = f'{patient_id}.jpg'
        if not os.path.exists(f'{fig1_img_path}/{image}'):
            image = f'{patient_id}.png'
        src_path = f'{fig1_img_path}/{image}'

        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy2(src_path, f'./{images_dir}/{new_image_name}')

        if finding == 'COVID-19':
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

(55, 12)
x-ray-929.jpg : OK
x-ray-930.jpg : OK
x-ray-931.jpg : OK
x-ray-932.jpg : OK
x-ray-933.jpg : OK
x-ray-934.jpg : OK
x-ray-935.jpg : OK
x-ray-936.jpg : OK
x-ray-937.jpg : OK
x-ray-938.jpg : OK
x-ray-939.jpg : OK
x-ray-940.jpg : OK
x-ray-941.jpg : OK
x-ray-942.jpg : OK
x-ray-943.jpg : OK
x-ray-944.jpg : OK
x-ray-945.png : OK
x-ray-946.png : OK
x-ray-947.jpg : OK
x-ray-948.jpg : OK
x-ray-949.jpg : OK
x-ray-950.jpg : OK
x-ray-951.jpg : OK
x-ray-952.jpg : OK
x-ray-953.jpg : OK
x-ray-954.jpg : OK
x-ray-955.jpg : OK
x-ray-956.jpg : OK
x-ray-957.jpg : OK
x-ray-958.jpg : OK
x-ray-959.jpg : OK
x-ray-960.jpg : OK
x-ray-961.jpg : OK
x-ray-962.jpg : OK
x-ray-963.jpg : OK
x-ray-964.jpg : OK
x-ray-965.jpg : OK
x-ray-966.jpg : OK
x-ray-967.jpg : OK
x-ray-968.jpg : OK
x-ray-969.jpg : OK
x-ray-970.jpg : OK
x-ray-971.jpg : OK
x-ray-972.jpg : OK
x-ray-973.jpg : OK
x-ray-974.jpg : OK
x-ray-975.jpg : OK
x-ray-976.jpg : OK
x-ray-977.jpg : OK
x-ray-978.jpg : OK
x-ray-979.jpg : OK
x-ray-980.png : OK
x-r

### Extract data from 'Actualmed-COVID-chestxray-dataset'

In [36]:
# Copy every Actualmed image that has a finding into `images_dir` under a sequential
# `x-ray-<n>.<ext>` name, and append one `filename,diagnosis` row per copied image.
actmed_csv = pd.read_csv(actmed_csv_path, nrows=None)
print(actmed_csv.shape)

# `with` guarantees the data file is flushed and closed even if a copy fails mid-loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in actmed_csv.iterrows():
        finding = row['finding']

        # Test for a missing finding directly. The previous
        # `actmed_csv['finding'].fillna('NaN', inplace=True)` is a chained in-place call
        # that does not modify the frame under pandas Copy-on-Write, which would let rows
        # without a finding fall through and be labelled 'negative'.
        if pd.isna(finding):
            continue

        image = row['imagename']
        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy2(f'{actmed_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        if finding == 'COVID-19':
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

(238, 13)
x-ray-984.png : OK
x-ray-985.png : OK
x-ray-986.png : OK
x-ray-987.png : OK
x-ray-988.png : OK
x-ray-989.png : OK
x-ray-990.png : OK
x-ray-991.png : OK
x-ray-992.png : OK
x-ray-993.png : OK
x-ray-994.png : OK
x-ray-995.png : OK
x-ray-996.png : OK
x-ray-997.png : OK
x-ray-998.png : OK
x-ray-999.png : OK
x-ray-1000.png : OK
x-ray-1001.png : OK
x-ray-1002.png : OK
x-ray-1003.png : OK
x-ray-1004.png : OK
x-ray-1005.png : OK
x-ray-1006.png : OK
x-ray-1007.png : OK
x-ray-1008.png : OK
x-ray-1009.png : OK
x-ray-1010.png : OK
x-ray-1011.png : OK
x-ray-1012.png : OK
x-ray-1013.png : OK
x-ray-1014.png : OK
x-ray-1015.png : OK
x-ray-1016.png : OK
x-ray-1017.png : OK
x-ray-1018.png : OK
x-ray-1019.png : OK
x-ray-1020.png : OK
x-ray-1021.png : OK
x-ray-1022.png : OK
x-ray-1023.png : OK
x-ray-1024.png : OK
x-ray-1025.png : OK
x-ray-1026.png : OK
x-ray-1027.png : OK
x-ray-1028.png : OK
x-ray-1029.png : OK
x-ray-1030.png : OK
x-ray-1031.png : OK
x-ray-1032.png : OK
x-ray-1033.png : OK
x-ray-

#### Because of potential duplicates and other uncertainties, the datasets processed by the cells below are not used yet.

### Extract data from 'COVID-19-Radiography-Database'

In [None]:
# Copy the COVID images of the radiography database into `images_dir` under a sequential
# `x-ray-<n>.<ext>` name and record each of them as 'positive'. Rows whose URL already
# appears in the Cohen metadata are skipped so the same image is not added twice.
# Requires `cohen_csv` from the Cohen extraction cell.
sirm_cov_csv = pd.read_excel(sirm_cov_csv_path)
print(sirm_cov_csv.shape)

cohen_urls = set(cohen_csv['url'])

# `with` guarantees the data file is flushed and closed even if a copy fails mid-loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in sirm_cov_csv.iterrows():
        if row['URL'] in cohen_urls:
            continue

        image_type = row['FORMAT'].lower()
        # The file on disk is named 'COVID (<n>).<ext>', where <n> is the second
        # space-separated token of the 'FILE NAME' column.
        image = f"COVID ({row['FILE NAME'].split(' ')[1]}).{image_type}"

        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy2(f'{sirm_cov_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        sample = new_image_name + ',' + 'positive' + '\n'
        covid_positives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

In [None]:
# Convert the RSNA DICOM files to PNG in `images_dir` under a sequential
# `x-ray-<n>.png` name. Every image from this source is recorded as 'negative'.
rsna_csv2 = pd.read_csv(rsna_csv2_path, nrows=None)
print(rsna_csv2.shape)

# Export each patient's DICOM once, even if its id is listed on several rows.
# NOTE(review): the labels file presumably holds one row per annotated region, so ids
# can repeat -- confirm against the dataset description.
patient_ids = rsna_csv2['patientId'].drop_duplicates()

# `with` guarantees the data file is flushed and closed even if a conversion fails.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for patient_id in patient_ids:
        dcm_path = f'{rsna_img_path}/{patient_id}.dcm'

        ds = dicom.dcmread(dcm_path)
        pixel_array = ds.pixel_array

        new_image_name = f'x-ray-{xray_index}.png'
        dst_path = f'{images_dir}/{new_image_name}'

        # cv2.imwrite signals failure through its return value instead of raising;
        # stop here rather than log a CSV row for an image that was never written.
        if not cv2.imwrite(dst_path, pixel_array):
            raise IOError(f'Could not write {dst_path}')

        sample = new_image_name + ',' + 'negative' + '\n'
        covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)