### Import libraries

In [2]:
import sys
!{sys.executable} -m pip install pydicom

Collecting pydicom
  Downloading pydicom-2.1.2-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 18.9 MB/s eta 0:00:01
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.1.2


In [None]:
!{sys.executable} -m pip install python-dotenv

In [3]:
!{sys.executable} -m pip install opencv-python



In [4]:
import pandas as pd
import os
import shutil
from glob import glob
import pydicom as dicom
import cv2
import uuid
from PIL import Image
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, ContentSettings, __version__

In [None]:
# Paths into the local 'covid-chestxray-dataset' directory, relative to the
# working directory: its metadata CSV and its image folder.
cohen_dir = 'covid-chestxray-dataset'
cohen_csv_path = './' + cohen_dir + '/metadata.csv'
cohen_img_path = './' + cohen_dir + '/images'

In [5]:
# Paths into the local 'Figure1-COVID-chestxray-dataset' directory, relative to
# the working directory: its metadata CSV and its image folder.
fig1_dir = 'Figure1-COVID-chestxray-dataset'
fig1_csv_path = './' + fig1_dir + '/metadata.csv'
fig1_img_path = './' + fig1_dir + '/images'

In [9]:
# Paths into the local 'Actualmed-COVID-chestxray-dataset' directory, relative
# to the working directory: its metadata CSV and its image folder.
actmed_dir = 'Actualmed-COVID-chestxray-dataset'
actmed_csv_path = './' + actmed_dir + '/metadata.csv'
actmed_img_path = './' + actmed_dir + '/images'

In [None]:
# Paths into the local 'COVID-19-Radiography-Database' directory, relative to
# the working directory: the COVID metadata spreadsheet and the COVID image folder.
sirm_dir = 'COVID-19-Radiography-Database'
sirm_cov_csv_path = './' + sirm_dir + '/COVID.metadata.xlsx'
sirm_cov_img_path = './' + sirm_dir + '/COVID'

In [None]:
# Locations of the RSNA files, relative to the working directory:
# two CSV files and the folder holding the training images.
rsna_csv1_path, rsna_csv2_path = (
    './stage_2_detailed_class_info.csv',
    './stage_2_train_labels.csv',
)
rsna_img_path = './stage_2_train_images'

### Initialize directories, files and metrics variables

In [12]:
# Reset the local working dataset:
#   * make sure 'dataset/' and 'dataset/images/' exist,
#   * remove every file left in 'dataset/images/' by a previous run,
#   * recreate the label CSV with only its header line,
#   * zero the counters shared by the extraction cells below.
dataset_dir = 'dataset'

images_dir = f'{dataset_dir}/images'
data_file_name = 'data_file.csv'

deletions = 0

if not os.path.exists(dataset_dir):
    os.mkdir(dataset_dir)
    print('Successfully created \'dataset\' directory.')

if not os.path.exists(images_dir):
    os.mkdir(images_dir)
    print('Successfully created \'images\' directory.')
else:
    files = glob(f'{images_dir}/*')
    for f in files:
        try:
            os.remove(f)
            deletions += 1
        except OSError as e:
            # Keep going: one undeletable file should not abort the reset.
            print('Error: %s : %s' % (f, e.strerror))
    print(f'Successfully deleted {deletions} images from the \'images\' directory')

# Count the entries that are about to be discarded so the message below reports
# CSV rows rather than the number of deleted image files.
entries_deleted = 0
if os.path.exists(f'./{dataset_dir}/{data_file_name}'):
    with open(f'./{dataset_dir}/{data_file_name}', 'r') as data_file:
        non_empty_lines = sum(1 for line in data_file if line.strip())
    # The first non-empty line is the header, not an entry.
    entries_deleted = max(non_empty_lines - 1, 0)

# Opening in 'w' mode truncates the file; only the header is written back.
with open(f'./{dataset_dir}/{data_file_name}', 'w') as data_file:
    data_file.write('filename,diagnosis\n')
print(f'Successfully deleted {entries_deleted} entries from \'{data_file_name}\'')

# Running totals and the next free image index, updated by the extraction cells.
covid_positives = 0
covid_negatives = 0
xray_index = 0

Successfully deleted 195 images from the 'images' directory
Successfully deleted 195 entries from 'data_file.csv'


### Extract data from 'covid-chestxray-dataset'

In [None]:
# Import the PA-view images listed in the Cohen metadata CSV: each kept image is
# copied into the images directory as 'x-ray-<index>.<ext>' and a
# '<filename>,<positive|negative>' line is appended to the label CSV.
cohen_csv = pd.read_csv(cohen_csv_path, nrows=None)
print(cohen_csv.shape)

# 'with' closes (and flushes) the label file even if a copy fails midway.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in cohen_csv.iterrows():
        if row['folder'] == 'volumes' or row['view'] != 'PA':
            continue

        # Rows without a finding cannot be labelled; skip them instead of
        # failing on .split() of a missing value.
        if pd.isna(row['finding']):
            continue

        # Only the last '/'-separated component of the finding is compared.
        finding = row['finding'].split('/')[-1]
        is_covid = finding == 'COVID-19'

        # Decide whether the row is kept BEFORE touching the disk, so a skipped
        # row never leaves an unlabelled image behind in the images directory.
        if is_covid and row['RT_PCR_positive'] != 'Y':
            continue

        image = row['filename']
        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name (no intermediate copy + rename).
        shutil.copy(f'{cohen_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        if is_covid:
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

### Extract data from 'Figure1-COVID-chestxray-dataset'

In [7]:
# Import the labelled images of the Figure1 dataset: each image is copied into
# the images directory as 'x-ray-<index>.<ext>' and a
# '<filename>,<positive|negative>' line is appended to the label CSV.
fig1_csv = pd.read_csv(fig1_csv_path, encoding='ISO-8859-1', nrows=None)
print(fig1_csv.shape)

# Replace missing findings with the 'NaN' sentinel tested in the loop below.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is deprecated in pandas 2.x and does not update the frame
# under Copy-on-Write.
fig1_csv['finding'] = fig1_csv['finding'].fillna('NaN')

# 'with' closes (and flushes) the label file even if a copy fails midway.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in fig1_csv.iterrows():
        if row['finding'] == 'NaN':
            continue

        # Image files are named after the patient id; try '.jpg' first and fall
        # back to '.png' when no such file exists.
        patient_id = row['patientid']
        image = f'{patient_id}.jpg'
        src_path = f'{fig1_img_path}/{image}'
        if not os.path.exists(src_path):
            image = f'{patient_id}.png'
            src_path = f'{fig1_img_path}/{image}'

        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name (no intermediate copy + rename).
        shutil.copy(src_path, f'./{images_dir}/{new_image_name}')

        if row['finding'] == 'COVID-19':
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

(55, 12)
x-ray-0.jpg : OK
x-ray-1.jpg : OK
x-ray-2.jpg : OK
x-ray-3.jpg : OK
x-ray-4.jpg : OK
x-ray-5.jpg : OK
x-ray-6.jpg : OK
x-ray-7.png : OK
x-ray-8.png : OK
x-ray-9.jpg : OK
x-ray-10.jpg : OK
x-ray-11.jpg : OK
x-ray-12.jpg : OK
x-ray-13.jpg : OK
x-ray-14.jpg : OK
x-ray-15.jpg : OK
x-ray-16.jpg : OK
x-ray-17.jpg : OK
x-ray-18.jpg : OK
x-ray-19.jpg : OK
x-ray-20.jpg : OK
x-ray-21.jpg : OK
x-ray-22.jpg : OK
x-ray-23.jpg : OK
x-ray-24.jpg : OK
x-ray-25.jpg : OK
x-ray-26.jpg : OK
x-ray-27.jpg : OK
x-ray-28.jpg : OK
x-ray-29.jpg : OK
x-ray-30.jpg : OK
x-ray-31.jpg : OK
x-ray-32.jpg : OK
x-ray-33.jpg : OK
x-ray-34.jpg : OK
x-ray-35.jpg : OK
x-ray-36.jpg : OK
x-ray-37.png : OK
x-ray-38.png : OK
x-ray-39.png : OK
Covid positives: 35
Covid negatives: 5


### Extract data from 'Actualmed-COVID-chestxray-dataset'

In [10]:
# Import the labelled PA-view images of the Actualmed dataset: each image is
# copied into the images directory as 'x-ray-<index>.<ext>' and a
# '<filename>,<positive|negative>' line is appended to the label CSV.
actmed_csv = pd.read_csv(actmed_csv_path, nrows=None)
print(actmed_csv.shape)

# Replace missing findings with the 'NaN' sentinel tested in the loop below.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is deprecated in pandas 2.x and does not update the frame
# under Copy-on-Write.
actmed_csv['finding'] = actmed_csv['finding'].fillna('NaN')

# 'with' closes (and flushes) the label file even if a copy fails midway.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in actmed_csv.iterrows():
        if row['finding'] == 'NaN' or row['view'] != 'PA':
            continue

        image = row['imagename']
        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name (no intermediate copy + rename).
        shutil.copy(f'{actmed_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        if row['finding'] == 'COVID-19':
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

(238, 13)
x-ray-40.png : OK
x-ray-41.png : OK
x-ray-42.png : OK
x-ray-43.png : OK
x-ray-44.png : OK
x-ray-45.png : OK
x-ray-46.png : OK
x-ray-47.png : OK
x-ray-48.png : OK
x-ray-49.png : OK
x-ray-50.png : OK
x-ray-51.png : OK
x-ray-52.png : OK
x-ray-53.png : OK
x-ray-54.png : OK
x-ray-55.png : OK
x-ray-56.png : OK
x-ray-57.png : OK
x-ray-58.png : OK
x-ray-59.png : OK
x-ray-60.png : OK
x-ray-61.png : OK
x-ray-62.png : OK
x-ray-63.png : OK
x-ray-64.png : OK
x-ray-65.png : OK
x-ray-66.png : OK
x-ray-67.png : OK
x-ray-68.png : OK
x-ray-69.png : OK
x-ray-70.png : OK
x-ray-71.png : OK
x-ray-72.png : OK
x-ray-73.png : OK
x-ray-74.png : OK
x-ray-75.png : OK
x-ray-76.png : OK
x-ray-77.png : OK
x-ray-78.png : OK
x-ray-79.png : OK
x-ray-80.png : OK
x-ray-81.png : OK
x-ray-82.png : OK
x-ray-83.png : OK
x-ray-84.png : OK
x-ray-85.png : OK
x-ray-86.png : OK
x-ray-87.png : OK
x-ray-88.png : OK
x-ray-89.png : OK
x-ray-90.png : OK
x-ray-91.png : OK
x-ray-92.png : OK
x-ray-93.png : OK
x-ray-94.png : OK


#### Due to potential duplicates and other uncertainties, the datasets processed by the cells below are not used yet.

### Extract data from 'COVID-19-Radiography-Database'

In [None]:
# Import the COVID images of the 'COVID-19-Radiography-Database' that are not
# already referenced by the Cohen metadata: each image is copied into the images
# directory as 'x-ray-<index>.<ext>' and recorded as 'positive' in the label CSV.
# Requires `cohen_csv` from the Cohen extraction cell.
sirm_cov_csv = pd.read_excel(sirm_cov_csv_path)
print(sirm_cov_csv.shape)

# URLs already present in the Cohen metadata; rows sharing one are skipped as duplicates.
cohen_urls = set(cohen_csv['url'])

# 'with' closes (and flushes) the label file even if a copy fails midway.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in sirm_cov_csv.iterrows():
        if row['URL'] in cohen_urls:
            continue

        image_type = row['FORMAT'].lower()
        # Build the on-disk name 'COVID (<n>).<ext>' from the second
        # space-separated token of 'FILE NAME'.
        image = f"COVID ({row['FILE NAME'].split(' ')[1]}).{image_type}"

        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # sirm_cov_img_path already starts with './', so it is used as-is.
        # copy2 keeps the file metadata, as the original copy did.
        shutil.copy2(f'{sirm_cov_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        sample = new_image_name + ',' + 'positive' + '\n'
        covid_positives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

In [None]:
# Import the RSNA DICOM images: each patient's '.dcm' file is decoded, written
# into the images directory as 'x-ray-<index>.png' and recorded as 'negative'
# in the label CSV.
rsna_csv2 = pd.read_csv(rsna_csv2_path, nrows=None)
print(rsna_csv2.shape)

# NOTE(review): the labels file appears to contain one row per annotation, so a
# patientId can occur several times; export each patient's image only once.
rsna_patient_ids = rsna_csv2['patientId'].drop_duplicates()

# 'with' closes (and flushes) the label file even if decoding or writing fails midway.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for patient_id in rsna_patient_ids:
        dcm_path = f'{rsna_img_path}/{patient_id}.dcm'

        ds = dicom.dcmread(dcm_path)
        pixel_array = ds.pixel_array

        new_image_name = f'x-ray-{xray_index}.png'
        dst_path = f'{images_dir}/{new_image_name}'

        # cv2.imwrite reports failure through its return value rather than by
        # raising; do not record an image that was never written.
        if not cv2.imwrite(dst_path, pixel_array):
            print(new_image_name, ': FAILED (image could not be written)')
            continue

        sample = new_image_name + ',' + 'negative' + '\n'
        covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

### Store data in Azure blob storage

In [11]:
# Publish the local dataset to Azure Blob Storage:
#   1. upload the label CSV to 'Covid-19-1/data_file.csv',
#   2. delete every existing blob under 'Covid-19-1/images/',
#   3. upload every file from the local images directory to that prefix.
# The container name and connection string are read from the environment
# (optionally populated from a .env file).
from dotenv import load_dotenv
from pathlib import Path
import io
import mimetypes

# First let python-dotenv locate a .env file on its own, then load the one in
# the working directory explicitly. Values that are already set are kept.
load_dotenv(verbose=True)
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

container_name = os.getenv('CONTAINERNAME')
connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')

try:
    # Fail with a clear message instead of an opaque SDK error when the
    # configuration is missing.
    if not container_name or not connect_str:
        raise ValueError(
            'CONTAINERNAME and AZURE_STORAGE_CONNECTION_STRING must be set '
            '(in the environment or in a .env file)'
        )

    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    container_client = blob_service_client.get_container_client(container_name)
    csv_blob_client = blob_service_client.get_blob_client(container=container_name, blob="Covid-19-1/data_file.csv")

    # Use the configured dataset location rather than a hard-coded copy of it.
    with open(f'{dataset_dir}/{data_file_name}', "rb") as data:
        csv_blob_client.upload_blob(data, overwrite=True)

    blob_deletions = 0

    # Remove the images of the previous upload so the container mirrors the
    # local images directory.
    img_blob_list = container_client.list_blobs(name_starts_with='Covid-19-1/images/')
    for blob in img_blob_list:
        img_blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob['name'])
        img_blob_client.delete_blob()
        blob_deletions += 1
    print(f'Successfully deleted {blob_deletions} image blobs')

    files = glob(f'{images_dir}/*')
    for f in files:
        # basename handles both '/' and the platform's own path separator.
        img_name = os.path.basename(f)
        img_blob_client = blob_service_client.get_blob_client(container=container_name, blob=f"Covid-19-1/images/{img_name}")

        # Derive the content type from the file extension; the dataset mixes
        # .jpg and .png files, so a fixed 'image/jpeg' mislabels the PNGs.
        content_type = mimetypes.guess_type(img_name)[0] or 'application/octet-stream'
        image_content_setting = ContentSettings(content_type=content_type)
        with open(f, "rb") as data:
            img_blob_client.upload_blob(data, overwrite=True, content_settings=image_content_setting)

        print(f'{img_name} : OK')

except Exception as ex:
    # Best-effort upload: report the failure instead of raising. A failure
    # after step 2 can leave the container with only part of the images.
    print('Exception:', ex)

Successfully deleted 268 image blobs
x-ray-0.jpg : OK
x-ray-1.jpg : OK
x-ray-10.jpg : OK
x-ray-100.png : OK
x-ray-101.png : OK
x-ray-102.png : OK
x-ray-103.png : OK
x-ray-104.png : OK
x-ray-105.png : OK
x-ray-106.png : OK
x-ray-107.png : OK
x-ray-108.png : OK
x-ray-109.png : OK
x-ray-11.jpg : OK
x-ray-110.png : OK
x-ray-111.png : OK
x-ray-112.png : OK
x-ray-113.png : OK
x-ray-114.png : OK
x-ray-115.png : OK
x-ray-116.png : OK
x-ray-117.png : OK
x-ray-118.png : OK
x-ray-119.png : OK
x-ray-12.jpg : OK
x-ray-120.png : OK
x-ray-121.png : OK
x-ray-122.png : OK
x-ray-123.png : OK
x-ray-124.png : OK
x-ray-125.png : OK
x-ray-126.png : OK
x-ray-127.png : OK
x-ray-128.png : OK
x-ray-129.png : OK
x-ray-13.jpg : OK
x-ray-130.png : OK
x-ray-131.png : OK
x-ray-132.png : OK
x-ray-133.png : OK
x-ray-134.png : OK
x-ray-135.png : OK
x-ray-136.png : OK
x-ray-137.png : OK
x-ray-138.png : OK
x-ray-139.png : OK
x-ray-14.jpg : OK
x-ray-140.png : OK
x-ray-141.png : OK
x-ray-142.png : OK
x-ray-143.png : OK
x-r