In [2]:
import numpy as np
import pandas as pd
import os
import shutil
from PIL import Image
from io import BytesIO

Filters for posteroanterior (PA) images and bins them by respiratory disease, based on the metadata provided with the GitHub dataset. Additionally converts the images to JPEG format.

Source: https://github.com/ieee8023/covid-chestxray-dataset

In [12]:
# Read the dataset's metadata table, then show its column names and the
# distinct labels that occur in the `finding` column.
df = pd.read_csv('data/github/metadata.csv')
disease_types = df['finding'].unique()
columns = df.columns.values

for listing in (columns, disease_types):
    print(listing)

['patientid' 'offset' 'sex' 'age' 'finding' 'survival' 'intubated'
 'intubation_present' 'went_icu' 'needed_supplemental_O2' 'extubated'
 'temperature' 'pO2_saturation' 'leukocyte_count' 'neutrophil_count'
 'lymphocyte_count' 'view' 'modality' 'date' 'location' 'folder'
 'filename' 'doi' 'url' 'license' 'clinical_notes' 'other_notes'
 'Unnamed: 27']
['COVID-19' 'ARDS' 'SARS' 'Pneumocystis' 'Streptococcus' 'No Finding'
 'Chlamydophila' 'E.Coli' 'COVID-19, ARDS' 'Klebsiella' 'Legionella']


In [14]:
# Create one folder per `finding` label under data/master and copy into it
# every image whose metadata row carries that label and a 'PA' view.
master_dir = 'data/master'
source_dir = 'data/github/images'

# Select the PA rows once instead of re-scanning the whole table per class.
pa_rows = df[df['view'] == 'PA']

# Take the labels straight from the metadata (as the conversion cell below
# does) rather than from `disease_types`, which a later cell reassigns.
for class_type in df['finding'].unique():
    master_path = os.path.join(master_dir, class_type)
    # makedirs also creates data/master itself on a fresh checkout, where
    # os.mkdir would raise FileNotFoundError; exist_ok makes re-runs safe.
    # A folder is created even for labels that have no PA images.
    os.makedirs(master_path, exist_ok=True)
    for filename in pa_rows.loc[pa_rows['finding'] == class_type, 'filename']:
        source_path = os.path.join(source_dir, filename)
        shutil.copy(source_path, master_path)
            

In [15]:
# Re-encode every .png/.jpg file in each class folder as an RGB JPEG saved
# next to it with a ".jpeg" extension, then delete the original file.
for class_type in df.finding.unique():
    master_path = os.path.join(master_dir, class_type)
    for image in os.listdir(master_path):
        image_path = os.path.join(master_path, image)

        # Decide by the real file extension (case-insensitive) instead of a
        # substring match, so "x.png.bak" is skipped and "X.PNG" is converted.
        stem, extension = os.path.splitext(image_path)
        if extension.lower() not in ('.png', '.jpg'):
            continue

        # Close the source handle explicitly before the file is removed.
        with Image.open(image_path) as ima:
            rgb_im = ima.convert('RGB')
            rgb_im.save(stem + '.jpeg')
        os.remove(image_path)
            

In [16]:
def conv2JPEG(source_dir, save_dir, remove_source=True):
    """Convert the .png/.jpg images in ``source_dir`` to RGB JPEGs in ``save_dir``.

    Each matching file ``<name>.<ext>`` is written to ``save_dir`` as
    ``<name>.jpeg``. Files with any other extension are ignored.

    Parameters
    ----------
    source_dir : str
        Directory whose files are scanned (not recursively).
    save_dir : str
        Directory that receives the converted files. It is created, together
        with any missing parent directories, if it does not exist.
    remove_source : bool, default True
        When True, each source file is deleted after it has been converted,
        so the call effectively *moves* the images. Pass False to keep the
        originals and make the call safe to repeat.

    Raises
    ------
    ValueError
        If ``source_dir`` and ``save_dir`` are the same string.
    """
    if source_dir == save_dir:
        raise ValueError("source_dir and save_dir must be different directories")

    # makedirs handles a nested save_dir whose parent does not exist yet.
    os.makedirs(save_dir, exist_ok=True)

    for image in os.listdir(source_dir):
        # Match on the real extension, case-insensitively, rather than on a
        # substring of the file name.
        stem, extension = os.path.splitext(image)
        if extension.lower() not in ('.png', '.jpg'):
            continue

        image_path = os.path.join(source_dir, image)
        save_image_path = os.path.join(save_dir, stem + '.jpeg')

        # Close the source handle explicitly before the file may be removed.
        with Image.open(image_path) as ima:
            ima.convert('RGB').save(save_image_path)

        if remove_source:
            os.remove(image_path)
            

In [18]:
# For each dataset, hand every per-class folder under <dataset>/CXR_png to
# conv2JPEG, writing into <dataset>/master/<class>. File counts are printed
# before and after each call.
datasets = ["data/NLM-MontgomeryCXRSet", "data/ChinaSet_AllFiles"]
disease_types = ['Normal', "TB"]

for dataset in datasets:
    master_path = os.path.join(dataset, "master")
    for disease in disease_types:
        source_path = os.path.join(dataset, "CXR_png", disease)
        save_path = os.path.join(master_path, disease)

        # Parent folder first, then the class folder inside it.
        for folder in (master_path, save_path):
            if not os.path.isdir(folder):
                os.mkdir(folder)

        source_count = len(os.listdir(source_path))
        print(source_path, source_count, save_path)
        conv2JPEG(source_path, save_path)
        print(len(os.listdir(save_path)))
        

data/NLM-MontgomeryCXRSet/CXR_png/Normal 73 data/NLM-MontgomeryCXRSet/master/Normal
80
data/NLM-MontgomeryCXRSet/CXR_png/TB 58 data/NLM-MontgomeryCXRSet/master/TB
58
data/ChinaSet_AllFiles/CXR_png/Normal 326 data/ChinaSet_AllFiles/master/Normal
326
data/ChinaSet_AllFiles/CXR_png/TB 336 data/ChinaSet_AllFiles/master/TB
336
