### Import libraries

In [1]:
import sys
!{sys.executable} -m pip install pydicom



In [2]:
!{sys.executable} -m pip install python-dotenv



In [3]:
!{sys.executable} -m pip install opencv-python



In [4]:
import pandas as pd
import os
import shutil
from glob import glob
import pydicom as dicom
import cv2
import uuid
from PIL import Image
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, ContentSettings, __version__

In [None]:
# Location of the 'covid-chestxray-dataset' directory (relative to the working
# directory) and of the metadata CSV and image folder inside it.
cohen_dir = 'covid-chestxray-dataset'
cohen_csv_path = './{}/metadata.csv'.format(cohen_dir)
cohen_img_path = './{}/images'.format(cohen_dir)

In [None]:
# Location of the 'Figure1-COVID-chestxray-dataset' directory (relative to the
# working directory) and of the metadata CSV and image folder inside it.
fig1_dir = 'Figure1-COVID-chestxray-dataset'
fig1_csv_path = './{}/metadata.csv'.format(fig1_dir)
fig1_img_path = './{}/images'.format(fig1_dir)

In [None]:
# Location of the 'Actualmed-COVID-chestxray-dataset' directory (relative to
# the working directory) and of the metadata CSV and image folder inside it.
actmed_dir = 'Actualmed-COVID-chestxray-dataset'
actmed_csv_path = './{}/metadata.csv'.format(actmed_dir)
actmed_img_path = './{}/images'.format(actmed_dir)

In [5]:
# Location of the 'COVID-19-Radiography-Database' directory (relative to the
# working directory). Labels come from the sub-folder an image sits in:
# COVID/ holds the positives and NORMAL/ the negatives.
sirm_dir = 'COVID-19-Radiography-Database'
# Metadata spreadsheet path, currently not used:
# sirm_cov_csv_path = f'./{sirm_dir}/COVID.metadata.xlsx'
sirm_cov_pos_path = './{}/COVID'.format(sirm_dir)
sirm_cov_neg_path = './{}/NORMAL'.format(sirm_dir)

In [None]:
# Paths to the two RSNA CSV files and to the folder of .dcm images, all
# relative to the working directory. Only rsna_csv2_path and rsna_img_path are
# read by the extraction cell in this notebook.
rsna_csv1_path, rsna_csv2_path, rsna_img_path = (
    './stage_2_detailed_class_info.csv',
    './stage_2_train_labels.csv',
    './stage_2_train_images',
)

### Initialize directories, files and metrics variables

In [16]:
# Prepare a clean output location for the merged dataset:
#   * make sure 'dataset/' and 'dataset/images/' exist,
#   * remove every file currently in 'dataset/images/',
#   * reset the CSV index so it only contains its header row,
#   * zero the running counters shared by the extraction cells below.
dataset_dir = 'dataset'

images_dir = f'{dataset_dir}/images'
data_file_name = 'data_file.csv'
data_file_path = f'./{dataset_dir}/{data_file_name}'

deletions = 0

if not os.path.exists(dataset_dir):
    os.mkdir(dataset_dir)
    print('Successfully created \'dataset\' directory.')

if not os.path.exists(images_dir):
    os.mkdir(images_dir)
    print('Successfully created \'images\' directory.')
else:
    for f in glob(f'{images_dir}/*'):
        try:
            os.remove(f)
            deletions += 1
        except OSError as e:
            # Best effort: report what could not be removed and keep going.
            print('Error: %s : %s' % (f, e.strerror))
    print(f'Successfully deleted {deletions} images from the \'images\' directory')

# Count the rows that are about to be discarded so the message below reports
# CSV entries rather than the number of deleted image files. Binary mode avoids
# any decoding problems; the header line is not counted as an entry.
entry_deletions = 0
if os.path.exists(data_file_path):
    with open(data_file_path, 'rb') as data_file:
        entry_deletions = max(sum(1 for _ in data_file) - 1, 0)

# Truncate the CSV and write only the header; `with` guarantees the handle is
# closed even if the write fails.
with open(data_file_path, 'w') as data_file:
    data_file.write('filename,diagnosis\n')
print(f'Successfully deleted {entry_deletions} entries from \'{data_file_name}\'')

# Running totals and the next free image index, updated by every extraction cell.
covid_positives = 0
covid_negatives = 0
xray_index = 0

Successfully deleted 2541 images from the 'images' directory
Successfully deleted 2541 entries from 'data_file.csv'


### Extract data from 'covid-chestxray-dataset'

In [None]:
# Copy the usable images of the 'covid-chestxray-dataset' into the merged
# dataset and append one 'filename,diagnosis' row per copied image.
cohen_csv = pd.read_csv(cohen_csv_path, nrows=None)
print(cohen_csv.shape)

# `with` closes the CSV even if a copy fails half-way through the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in cohen_csv.iterrows():
        # Skip entries stored under 'volumes' and every view other than PA.
        if row['folder'] == 'volumes' or row['view'] != 'PA':
            continue

        # Rows without a finding cannot be labelled; skip them, as the other
        # extraction cells do, instead of failing on `.split`.
        if pd.isna(row['finding']):
            continue

        # 'finding' may hold several '/'-separated parts; the last one is used.
        finding = row['finding'].split('/')[-1]

        # Decide the label BEFORE touching the file system. Previously the image
        # was copied first, so a rejected COVID-19 row left an unlabelled file
        # behind in the images directory.
        if finding == 'COVID-19':
            # COVID-19 rows are only kept when RT_PCR_positive is 'Y'.
            if row['RT_PCR_positive'] != 'Y':
                continue
            diagnosis = 'positive'
        else:
            diagnosis = 'negative'

        image = row['filename']
        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy(f'{cohen_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        data_file.write(f'{new_image_name},{diagnosis}\n')
        if diagnosis == 'positive':
            covid_positives += 1
        else:
            covid_negatives += 1
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

### Extract data from 'Figure1-COVID-chestxray-dataset'

In [None]:
# Copy the labelled images of the 'Figure1-COVID-chestxray-dataset' into the
# merged dataset and append one 'filename,diagnosis' row per copied image.
fig1_csv = pd.read_csv(fig1_csv_path, encoding='ISO-8859-1', nrows=None)
print(fig1_csv.shape)

# Mark missing findings with the sentinel string 'NaN'. Assigning the result
# back (instead of a chained `fillna(..., inplace=True)` on the column
# selection) keeps this working under pandas Copy-on-Write, where the chained
# form does not update the frame and unlabelled rows would end up as negatives.
fig1_csv['finding'] = fig1_csv['finding'].fillna('NaN')

# `with` closes the CSV even if a copy fails half-way through the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in fig1_csv.iterrows():
        # Rows without a finding cannot be labelled.
        if row['finding'] == 'NaN':
            continue

        # Images are named after the patient id; try .jpg first, then .png.
        # If neither exists, shutil.copy below raises FileNotFoundError.
        patient_id = row['patientid']
        image = f'{patient_id}.jpg'
        src_path = f'{fig1_img_path}/{image}'
        if not os.path.exists(src_path):
            image = f'{patient_id}.png'
            src_path = f'{fig1_img_path}/{image}'

        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy(src_path, f'./{images_dir}/{new_image_name}')

        if row['finding'] == 'COVID-19':
            diagnosis = 'positive'
            covid_positives += 1
        else:
            diagnosis = 'negative'
            covid_negatives += 1

        data_file.write(f'{new_image_name},{diagnosis}\n')
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

### Extract data from 'Actualmed-COVID-chestxray-dataset'

In [None]:
# Copy the labelled PA images of the 'Actualmed-COVID-chestxray-dataset' into
# the merged dataset and append one 'filename,diagnosis' row per copied image.
actmed_csv = pd.read_csv(actmed_csv_path, nrows=None)
print(actmed_csv.shape)

# Mark missing findings with the sentinel string 'NaN'. Assigning the result
# back (instead of a chained `fillna(..., inplace=True)` on the column
# selection) keeps this working under pandas Copy-on-Write, where the chained
# form does not update the frame and unlabelled rows would end up as negatives.
actmed_csv['finding'] = actmed_csv['finding'].fillna('NaN')

# `with` closes the CSV even if a copy fails half-way through the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in actmed_csv.iterrows():
        # Skip rows without a finding and every view other than PA.
        if row['finding'] == 'NaN' or row['view'] != 'PA':
            continue

        image = row['imagename']
        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy(f'{actmed_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        if row['finding'] == 'COVID-19':
            diagnosis = 'positive'
            covid_positives += 1
        else:
            diagnosis = 'negative'
            covid_negatives += 1

        data_file.write(f'{new_image_name},{diagnosis}\n')
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

#### Due to potential duplicates and other uncertainties, the datasets processed by the cells below are not used yet.

### Extract data from 'COVID-19-Radiography-Database' (positives)

In [13]:
# Copy every file found in the COVID/ folder of the
# 'COVID-19-Radiography-Database' into the merged dataset, labelling each one
# 'positive' in the CSV index.

# glob returns paths in arbitrary order; sorting makes the x-ray index that
# each source image receives reproducible between runs.
positive_imgs = sorted(glob(f'{sirm_cov_pos_path}/*'))

# `with` closes the CSV even if a copy fails half-way through the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for img_path in positive_imgs:
        # os.path.basename handles the platform's path separator, unlike
        # splitting on '/'.
        img_name = os.path.basename(img_path)
        img_type = img_name.split('.')[-1]
        new_img_name = f'x-ray-{xray_index}.{img_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy(img_path, f'./{images_dir}/{new_img_name}')

        data_file.write(f'{new_img_name},positive\n')
        covid_positives += 1
        xray_index += 1

        print(new_img_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

x-ray-0.png : OK
x-ray-1.png : OK
x-ray-2.png : OK
x-ray-3.png : OK
x-ray-4.png : OK
x-ray-5.png : OK
x-ray-6.png : OK
x-ray-7.png : OK
x-ray-8.png : OK
x-ray-9.png : OK
x-ray-10.png : OK
x-ray-11.png : OK
x-ray-12.png : OK
x-ray-13.png : OK
x-ray-14.png : OK
x-ray-15.png : OK
x-ray-16.png : OK
x-ray-17.png : OK
x-ray-18.png : OK
x-ray-19.png : OK
x-ray-20.png : OK
x-ray-21.png : OK
x-ray-22.png : OK
x-ray-23.png : OK
x-ray-24.png : OK
x-ray-25.png : OK
x-ray-26.png : OK
x-ray-27.png : OK
x-ray-28.png : OK
x-ray-29.png : OK
x-ray-30.png : OK
x-ray-31.png : OK
x-ray-32.png : OK
x-ray-33.png : OK
x-ray-34.png : OK
x-ray-35.png : OK
x-ray-36.png : OK
x-ray-37.png : OK
x-ray-38.png : OK
x-ray-39.png : OK
x-ray-40.png : OK
x-ray-41.png : OK
x-ray-42.png : OK
x-ray-43.png : OK
x-ray-44.png : OK
x-ray-45.png : OK
x-ray-46.png : OK
x-ray-47.png : OK
x-ray-48.png : OK
x-ray-49.png : OK
x-ray-50.png : OK
x-ray-51.png : OK
x-ray-52.png : OK
x-ray-53.png : OK
x-ray-54.png : OK
x-ray-55.png : OK
x-

### Extract data from 'COVID-19-Radiography-Database' (negatives)

In [14]:
# Copy every file found in the NORMAL/ folder of the
# 'COVID-19-Radiography-Database' into the merged dataset, labelling each one
# 'negative' in the CSV index.

# glob returns paths in arbitrary order; sorting makes the x-ray index that
# each source image receives reproducible between runs.
negative_imgs = sorted(glob(f'{sirm_cov_neg_path}/*'))

# `with` closes the CSV even if a copy fails half-way through the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for img_path in negative_imgs:
        # os.path.basename handles the platform's path separator, unlike
        # splitting on '/'.
        img_name = os.path.basename(img_path)
        img_type = img_name.split('.')[-1]
        new_img_name = f'x-ray-{xray_index}.{img_type}'

        # Copy straight to the final name; no intermediate copy + rename needed.
        shutil.copy(img_path, f'./{images_dir}/{new_img_name}')

        data_file.write(f'{new_img_name},negative\n')
        covid_negatives += 1
        xray_index += 1

        print(new_img_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

x-ray-1200.png : OK
x-ray-1201.png : OK
x-ray-1202.png : OK
x-ray-1203.png : OK
x-ray-1204.png : OK
x-ray-1205.png : OK
x-ray-1206.png : OK
x-ray-1207.png : OK
x-ray-1208.png : OK
x-ray-1209.png : OK
x-ray-1210.png : OK
x-ray-1211.png : OK
x-ray-1212.png : OK
x-ray-1213.png : OK
x-ray-1214.png : OK
x-ray-1215.png : OK
x-ray-1216.png : OK
x-ray-1217.png : OK
x-ray-1218.png : OK
x-ray-1219.png : OK
x-ray-1220.png : OK
x-ray-1221.png : OK
x-ray-1222.png : OK
x-ray-1223.png : OK
x-ray-1224.png : OK
x-ray-1225.png : OK
x-ray-1226.png : OK
x-ray-1227.png : OK
x-ray-1228.png : OK
x-ray-1229.png : OK
x-ray-1230.png : OK
x-ray-1231.png : OK
x-ray-1232.png : OK
x-ray-1233.png : OK
x-ray-1234.png : OK
x-ray-1235.png : OK
x-ray-1236.png : OK
x-ray-1237.png : OK
x-ray-1238.png : OK
x-ray-1239.png : OK
x-ray-1240.png : OK
x-ray-1241.png : OK
x-ray-1242.png : OK
x-ray-1243.png : OK
x-ray-1244.png : OK
x-ray-1245.png : OK
x-ray-1246.png : OK
x-ray-1247.png : OK
x-ray-1248.png : OK
x-ray-1249.png : OK


In [None]:
# Convert the RSNA .dcm images to PNG inside the merged dataset and label each
# of them 'negative' in the CSV index.
rsna_csv2 = pd.read_csv(rsna_csv2_path, nrows=None)
print(rsna_csv2.shape)

# Export each patient's image once. NOTE(review): the labels CSV presumably
# holds one row per annotation, so a patientId can repeat - confirm against the
# dataset description. Without de-duplication a repeated id is converted and
# labelled multiple times.
rsna_patient_ids = rsna_csv2['patientId'].drop_duplicates()

# `with` closes the CSV even if reading a DICOM file fails half-way through.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for patient_id in rsna_patient_ids:
        dcm_path = f'{rsna_img_path}/{patient_id}.dcm'

        ds = dicom.dcmread(dcm_path)
        pixel_array = ds.pixel_array

        new_image_name = f'x-ray-{xray_index}.png'
        dst_path = f'{images_dir}/{new_image_name}'

        # cv2.imwrite signals failure through its return value rather than an
        # exception; do not index an image that was never written.
        if not cv2.imwrite(dst_path, pixel_array):
            print('Error: could not write', dst_path)
            continue

        data_file.write(f'{new_image_name},negative\n')
        covid_negatives += 1
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

### Store data in Azure blob storage

In [15]:
# Upload the merged dataset to Azure Blob Storage: the CSV index goes to
# 'Covid-19-1/data_file.csv', and the blobs under 'Covid-19-1/images/' are
# replaced by the current contents of the local images directory.
from dotenv import load_dotenv

from pathlib import Path
import io
import mimetypes

# Load settings from a .env file: first through python-dotenv's default lookup,
# then explicitly from ./.env. Values that are already set are not overridden.
load_dotenv(verbose=True)
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

container_name = os.getenv('CONTAINERNAME')
connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')

try:
    # Fail early with a readable message instead of an opaque SDK error when
    # the configuration is missing.
    if not container_name or not connect_str:
        raise RuntimeError(
            'CONTAINERNAME and AZURE_STORAGE_CONNECTION_STRING must be set '
            '(for example in a .env file)'
        )

    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    container_client = blob_service_client.get_container_client(container_name)
    csv_blob_client = blob_service_client.get_blob_client(container=container_name, blob="Covid-19-1/data_file.csv")

    # Use the same location the extraction cells wrote to.
    with open(f'{dataset_dir}/{data_file_name}', "rb") as data:
        csv_blob_client.upload_blob(data, overwrite=True)

    blob_deletions = 0

    # Remove the image blobs of the previous upload so that stale files do not
    # linger next to the new ones.
    img_blob_list = container_client.list_blobs(name_starts_with='Covid-19-1/images/')
    for blob in img_blob_list:
        img_blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob['name'])
        img_blob_client.delete_blob()
        blob_deletions += 1
    print(f'Successfully deleted {blob_deletions} image blobs')

    files = glob(f'{images_dir}/*')
    for f in files:
        # os.path.basename handles the platform's path separator, unlike
        # splitting on '/'.
        img_name = os.path.basename(f)
        img_blob_client = blob_service_client.get_blob_client(container=container_name, blob=f"Covid-19-1/images/{img_name}")

        # Derive the Content-Type from the file extension instead of labelling
        # every upload 'image/jpeg' (the dataset also contains PNG files).
        content_type = mimetypes.guess_type(img_name)[0] or 'application/octet-stream'
        image_content_setting = ContentSettings(content_type=content_type)
        with open(f, "rb") as data:
            img_blob_client.upload_blob(data, overwrite=True, content_settings=image_content_setting)

        print(f'{img_name} : OK')

except Exception as ex:
    # Report the failure without raising, so the notebook keeps running.
    print('Exception:', ex)

Successfully deleted 463 image blobs
x-ray-0.png : OK
x-ray-1.png : OK
x-ray-10.png : OK
x-ray-100.png : OK
x-ray-1000.png : OK
x-ray-1001.png : OK
x-ray-1002.png : OK
x-ray-1003.png : OK
x-ray-1004.png : OK
x-ray-1005.png : OK
x-ray-1006.png : OK
x-ray-1007.png : OK
x-ray-1008.png : OK
x-ray-1009.png : OK
x-ray-101.png : OK
x-ray-1010.png : OK
x-ray-1011.png : OK
x-ray-1012.png : OK
x-ray-1013.png : OK
x-ray-1014.png : OK
x-ray-1015.png : OK
x-ray-1016.png : OK
x-ray-1017.png : OK
x-ray-1018.png : OK
x-ray-1019.png : OK
x-ray-102.png : OK
x-ray-1020.png : OK
x-ray-1021.png : OK
x-ray-1022.png : OK
x-ray-1023.png : OK
x-ray-1024.png : OK
x-ray-1025.png : OK
x-ray-1026.png : OK
x-ray-1027.png : OK
x-ray-1028.png : OK
x-ray-1029.png : OK
x-ray-103.png : OK
x-ray-1030.png : OK
x-ray-1031.png : OK
x-ray-1032.png : OK
x-ray-1033.png : OK
x-ray-1034.png : OK
x-ray-1035.png : OK
x-ray-1036.png : OK
x-ray-1037.png : OK
x-ray-1038.png : OK
x-ray-1039.png : OK
x-ray-104.png : OK
x-ray-1040.png :