In [112]:
import numpy as np
import os
import random 
import shutil
import cv2
import urllib.request
import zipfile
import kaggle
import pandas as pd
import pydicom

In [28]:
RAW_DATA_PATH = '../data/raw/'

IEEE8023_DATASET_URL = 'https://github.com/ieee8023/covid-chestxray-dataset/archive/master.zip'
IEEE8023_DATASET_NAME = 'covid-chestxray-dataset'

FIGURE1_DATASET_URL = 'https://github.com/agchung/Figure1-COVID-chestxray-dataset/archive/master.zip'
FIGURE1_DATASET_NAME = 'figure1-dataset'

KAGGLE_DATASET_NAME = 'rsna-pneumonia-detection-challenge'

In [22]:
# TODO: Rename final folder
def download_dataset_zip(dataset_url, dataset_name):
    dataset_zip = dataset_name + '.zip'
    urllib.request.urlretrieve(dataset_url, RAW_DATA_PATH + dataset_zip)
    with zipfile.ZipFile(RAW_DATA_PATH + dataset_zip, 'r') as zip_ref:
        zip_ref.extractall(RAW_DATA_PATH)
    os.remove(RAW_DATA_PATH + dataset_zip)
    
def download_kaggle_competition(dataset_name):
    dataset_path = RAW_DATA_PATH + dataset_name + '/'
    dataset_zip = dataset_name + '.zip'
    
    os.mkdir(dataset_path)
    kaggle.api.authenticate()
    kaggle.api.competition_download_files(dataset_name, path=dataset_path)
     
    with zipfile.ZipFile(dataset_path + dataset_zip, 'r') as zip_ref:
        zip_ref.extractall(RAW_DATA_PATH)
    os.remove(dataset_path + dataset_zip)

In [21]:
download_dataset_zip(IEEE8023_DATASET_URL, IEEE8023_DATASET_NAME)

In [25]:
download_dataset_zip(FIGURE1_DATASET_URL, FIGURE1_DATASET_NAME)

In [None]:
download_kaggle_competition(KAGGLE_DATASET_NAME)

In [31]:
import kaggle

kaggle.api.authenticate()

kaggle.api.competition_download_files(KAGGLE_DATASET_NAME, path=RAW_DATA_PATH)



In [32]:
dataset_zip = KAGGLE_DATASET_NAME + '.zip'
with zipfile.ZipFile(RAW_DATA_PATH + dataset_zip, 'r') as zip_ref:
    zip_ref.extractall(RAW_DATA_PATH)

In [33]:
# Readme
# * need to have kaggle api and kaggle.json

In [54]:
VIEW_TYPES = ['PA', 'AP']
MODALITY_TYPES = ['X-ray']
IGNORE_MAPPINGS = ['COVID-19, ARDS']

FINDING_MAPPING = {
    'COVID-19': 'covid19',
    'No Finding': 'normal'
}
DEFAULT_MAPPING = 'pneumonia'



In [52]:
dataset_path = RAW_DATA_PATH + IEEE8023_DATASET_NAME + '/'
dataset_meta_path = dataset_path + 'metadata.csv'

metadata = pd.read_csv(dataset_meta_path, nrows=None, usecols=['finding', 'view', 'modality', 'folder', 'filename'])
metadata

Unnamed: 0,finding,view,modality,folder,filename
0,COVID-19,PA,X-ray,images,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...
1,COVID-19,PA,X-ray,images,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...
2,COVID-19,PA,X-ray,images,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...
3,COVID-19,PA,X-ray,images,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...
4,COVID-19,PA,X-ray,images,nejmc2001573_f1a.jpeg
...,...,...,...,...,...
349,COVID-19,PA,X-ray,images,7-fatal-covid19.jpg
350,COVID-19,PA,X-ray,images,extubation-1.jpg
351,COVID-19,PA,X-ray,images,extubation-4.jpg
352,COVID-19,PA,X-ray,images,extubation-8.jpg


In [57]:
metadata_filter = metadata['view'].isin(VIEW_TYPES) & metadata['modality'].isin(MODALITY_TYPES) & ~metadata['finding'].isin(IGNORE_MAPPINGS)
metadata = metadata[metadata_filter]
metadata

Unnamed: 0,finding,view,modality,folder,filename
0,COVID-19,PA,X-ray,images,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...
1,COVID-19,PA,X-ray,images,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...
2,COVID-19,PA,X-ray,images,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...
3,COVID-19,PA,X-ray,images,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...
4,COVID-19,PA,X-ray,images,nejmc2001573_f1a.jpeg
...,...,...,...,...,...
349,COVID-19,PA,X-ray,images,7-fatal-covid19.jpg
350,COVID-19,PA,X-ray,images,extubation-1.jpg
351,COVID-19,PA,X-ray,images,extubation-4.jpg
352,COVID-19,PA,X-ray,images,extubation-8.jpg


In [72]:
covid_images = []
normal_images = []
pneumonia_images = []

dataset_path = RAW_DATA_PATH + IEEE8023_DATASET_NAME + '/'

for index, row in metadata.iterrows():
    image_path = dataset_path + row['folder'] + '/' + row['filename']
    if row['finding'] == 'COVID-19':
        covid_images.append((image_path, row['filename']))
    elif row['finding'] == 'No Finding':
        normal_images.append((image_path, row['filename']))
    else:
        pneumonia_images.append((image_path, row['filename']))

pneumonia_images     

[('../data/raw/covid-chestxray-dataset/images/ARDSSevere.png',
  'ARDSSevere.png'),
 ('../data/raw/covid-chestxray-dataset/images/SARS-10.1148rg.242035193-g04mr34g0-Fig8a-day0.jpeg',
  'SARS-10.1148rg.242035193-g04mr34g0-Fig8a-day0.jpeg'),
 ('../data/raw/covid-chestxray-dataset/images/SARS-10.1148rg.242035193-g04mr34g0-Fig8b-day5.jpeg',
  'SARS-10.1148rg.242035193-g04mr34g0-Fig8b-day5.jpeg'),
 ('../data/raw/covid-chestxray-dataset/images/SARS-10.1148rg.242035193-g04mr34g0-Fig8c-day10.jpeg',
  'SARS-10.1148rg.242035193-g04mr34g0-Fig8c-day10.jpeg'),
 ('../data/raw/covid-chestxray-dataset/images/SARS-10.1148rg.242035193-g04mr34g04a-Fig4a-day7.jpeg',
  'SARS-10.1148rg.242035193-g04mr34g04a-Fig4a-day7.jpeg'),
 ('../data/raw/covid-chestxray-dataset/images/SARS-10.1148rg.242035193-g04mr34g04b-Fig4b-day12.jpeg',
  'SARS-10.1148rg.242035193-g04mr34g04b-Fig4b-day12.jpeg'),
 ('../data/raw/covid-chestxray-dataset/images/SARS-10.1148rg.242035193-g04mr34g05x-Fig5-day9.jpeg',
  'SARS-10.1148rg.242035

In [83]:
PROCESSED_DATA_PATH = '../data/processed/'

def copy_images(class_type, images):
    target = PROCESSED_DATA_PATH + class_type + '/'
    for original_path, original_name in images:
        shutil.copyfile(original_path, target + original_name)


In [84]:
copy_images('covid19', covid_images)

In [85]:
copy_images('normal', normal_images)

In [86]:
copy_images('pneumonia', pneumonia_images)

In [92]:
dataset_path = RAW_DATA_PATH + FIGURE1_DATASET_NAME + '/'
dataset_meta_path = dataset_path + 'metadata.csv'

metadata = pd.read_csv(dataset_meta_path, encoding='ISO-8859-1', nrows=None)
metadata_filter = metadata['finding'].eq('COVID-19')
metadata = metadata[metadata_filter]
metadata

Unnamed: 0,patientid,offset,sex,age,finding,survival,temperature,pO2 saturation,view,modality,artifacts/distortion,notes
0,COVID-00001,13.0,M,33,COVID-19,,,58,AP erect,X-ray,,O2 saturation was initially 58% on room air an...
4,COVID-00004,2.0,M,42,COVID-19,,,91-92,PA,X-ray,,42 year old male patient presented to ED with ...
6,COVID-00006,5.0,F,57,COVID-19,,,,,X-ray,Some from monitor,57 year old female. Returning from Texas March...
8,COVID-00008,4.0,M,47,COVID-19,,,92,AP erect,X-ray,"Some from monitor, dust specks, cropped out mo...","47 Male, BMI 34, sleep apnea and childhood ast..."
9,COVID-00009,,,,COVID-19,,,,,X-ray,"Some from monitor, oversaturation at bottom",CXray during worsening corona infection
11,COVID-00011,12.0,F,35,COVID-19,,,,,X-ray,,35 year-old G3P1001 at 34 weeks presents with ...
12,COVID-00012,,M,52,COVID-19,,,86,,X-ray,,"A30 A 52-year-old male patient, insulin depend..."
16,COVID-00015a,,,67+,COVID-19,,,,,X-ray,,Patient above retirement age with no visible s...
17,COVID-00015b,,,67+,COVID-19,,,,,X-ray,,Image was taken 3 days after patient had been ...
18,COVID-00016,16.0,M,64,COVID-19,,,low 90s,,X-ray,Slight from monitor,"64 yo M w/ HTN, DM2, OSA admitted with cough, ..."


In [99]:
covid_images = []

for index, row in metadata.iterrows():
    extension = '.jpg' if row['patientid'] not in ['COVID-00015a', 'COVID-00015b'] else '.png'
    image_path = dataset_path + 'images/' + row['patientid'] + extension
    covid_images.append((image_path, row['patientid'] + extension))
    
covid_images

[('../data/raw/figure1-dataset/images/COVID-00001.jpg', 'COVID-00001'),
 ('../data/raw/figure1-dataset/images/COVID-00004.jpg', 'COVID-00004'),
 ('../data/raw/figure1-dataset/images/COVID-00006.jpg', 'COVID-00006'),
 ('../data/raw/figure1-dataset/images/COVID-00008.jpg', 'COVID-00008'),
 ('../data/raw/figure1-dataset/images/COVID-00009.jpg', 'COVID-00009'),
 ('../data/raw/figure1-dataset/images/COVID-00011.jpg', 'COVID-00011'),
 ('../data/raw/figure1-dataset/images/COVID-00012.jpg', 'COVID-00012'),
 ('../data/raw/figure1-dataset/images/COVID-00015a.png', 'COVID-00015a'),
 ('../data/raw/figure1-dataset/images/COVID-00015b.png', 'COVID-00015b'),
 ('../data/raw/figure1-dataset/images/COVID-00016.jpg', 'COVID-00016'),
 ('../data/raw/figure1-dataset/images/COVID-00017.jpg', 'COVID-00017'),
 ('../data/raw/figure1-dataset/images/COVID-00018.jpg', 'COVID-00018'),
 ('../data/raw/figure1-dataset/images/COVID-00020.jpg', 'COVID-00020'),
 ('../data/raw/figure1-dataset/images/COVID-00021.jpg', 'COV

In [100]:
copy_images('covid19', covid_images)

In [105]:
dataset_path = RAW_DATA_PATH + KAGGLE_DATASET_NAME + '/'
dataset_meta_path = dataset_path + 'stage_2_detailed_class_info.csv'

metadata = pd.read_csv(dataset_meta_path, nrows=None)
metadata_filter = metadata['class'].eq('Normal')
metadata = metadata[metadata_filter].head(1000)
metadata

Unnamed: 0,patientId,class
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,Normal
11,009482dc-3db5-48d4-8580-5c89c4f01334,Normal
12,009eb222-eabc-4150-8121-d5a6d06b8ebf,Normal
13,00a85be6-6eb0-421d-8acf-ff2dc0007e8a,Normal
21,00f87de5-5fe0-4921-93ea-914d7e683266,Normal
...,...,...
3921,3b8b8777-a1f6-4384-872a-28b95f59bf0d,Normal
3922,3b8cd35a-1eed-4056-9191-5710a9081fed,Normal
3933,3ba03324-b1b1-4390-bfea-5989348ce1e8,Normal
3934,3ba4f6d7-e386-444e-8e99-6ae2e9f06888,Normal


In [118]:
normal_images = []

for index, row in metadata.iterrows():
    image_path = dataset_path + 'stage_2_train_images/' + row['patientId'] + '.dcm'
    normal_images.append((image_path, row['patientId']))
    
normal_images

[('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/003d8fa0-6bf1-40ed-b54c-ac657f8495c5.dcm',
  '003d8fa0-6bf1-40ed-b54c-ac657f8495c5'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/009482dc-3db5-48d4-8580-5c89c4f01334.dcm',
  '009482dc-3db5-48d4-8580-5c89c4f01334'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/009eb222-eabc-4150-8121-d5a6d06b8ebf.dcm',
  '009eb222-eabc-4150-8121-d5a6d06b8ebf'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/00a85be6-6eb0-421d-8acf-ff2dc0007e8a.dcm',
  '00a85be6-6eb0-421d-8acf-ff2dc0007e8a'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/00f87de5-5fe0-4921-93ea-914d7e683266.dcm',
  '00f87de5-5fe0-4921-93ea-914d7e683266'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/01027bc3-dc40-4165-a6c3-d6be2cb7ca34.dcm',
  '01027bc3-dc40-4165-a6c3-d6be2cb7ca34'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_

In [121]:
PROCESSED_DATA_PATH = '../data/processed/'

def copy_dcm_images(class_type, images):
    target = PROCESSED_DATA_PATH + class_type + '/'
    for original_path, original_name in images:
        ds = pydicom.filereader.dcmread(original_path)
        pixel_array_numpy = ds.pixel_array
        
        target_image_name = original_name + '.png'
        cv2.imwrite(target + target_image_name, pixel_array_numpy)

In [122]:
copy_dcm_images('normal', normal_images)

In [124]:
dataset_path = RAW_DATA_PATH + KAGGLE_DATASET_NAME + '/'
dataset_meta_path = dataset_path + 'stage_2_train_labels.csv'

metadata = pd.read_csv(dataset_meta_path, nrows=None)
metadata_filter = metadata['Target'].eq(1)
metadata = metadata[metadata_filter].head(1000)
metadata

Unnamed: 0,patientId,x,y,width,height,Target
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1
5,00436515-870c-4b36-a041-de91049b9ab4,562.0,152.0,256.0,453.0,1
8,00704310-78a8-4b38-8475-49f4573b2dbb,323.0,577.0,160.0,104.0,1
9,00704310-78a8-4b38-8475-49f4573b2dbb,695.0,575.0,162.0,137.0,1
14,00aecb01-a116-45a2-956c-08d2fa55433f,288.0,322.0,94.0,135.0,1
...,...,...,...,...,...,...
2311,30d3d2f9-4342-4ab4-9032-34d894c2cfbb,673.0,522.0,73.0,86.0,1
2312,30d3d2f9-4342-4ab4-9032-34d894c2cfbb,256.0,546.0,98.0,87.0,1
2313,30d3d2f9-4342-4ab4-9032-34d894c2cfbb,317.0,327.0,119.0,143.0,1
2315,31102758-a5ba-44c9-b480-b744bcfcb861,526.0,699.0,147.0,198.0,1


In [125]:
pneumonia_images = []

for index, row in metadata.iterrows():
    image_path = dataset_path + 'stage_2_train_images/' + row['patientId'] + '.dcm'
    pneumonia_images.append((image_path, row['patientId']))
    
pneumonia_images

[('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/00436515-870c-4b36-a041-de91049b9ab4.dcm',
  '00436515-870c-4b36-a041-de91049b9ab4'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/00436515-870c-4b36-a041-de91049b9ab4.dcm',
  '00436515-870c-4b36-a041-de91049b9ab4'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/00704310-78a8-4b38-8475-49f4573b2dbb.dcm',
  '00704310-78a8-4b38-8475-49f4573b2dbb'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/00704310-78a8-4b38-8475-49f4573b2dbb.dcm',
  '00704310-78a8-4b38-8475-49f4573b2dbb'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/00aecb01-a116-45a2-956c-08d2fa55433f.dcm',
  '00aecb01-a116-45a2-956c-08d2fa55433f'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_train_images/00aecb01-a116-45a2-956c-08d2fa55433f.dcm',
  '00aecb01-a116-45a2-956c-08d2fa55433f'),
 ('../data/raw/rsna-pneumonia-detection-challenge/stage_2_

In [126]:
copy_dcm_images('pneumonia', pneumonia_images)