<a href="https://colab.research.google.com/github/issondl/from-data-to-solution-2021/blob/main/0_data_preparation_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preparation #1

Useful resources:

* Used dataset:
    * [NIH Clinical Center provides one of the largest publicly available chest x-ray datasets to scientific community](https://www.nih.gov/news-events/news-releases/nih-clinical-center-provides-one-largest-publicly-available-chest-x-ray-datasets-scientific-community)
    * [NIH CC ChestXRay Dataset (box.com)](https://nihcc.app.box.com/v/ChestXray-NIHCC)
    * [NIH CC ChestXRay Dataset (Google Cloud)](https://cloud.google.com/healthcare/docs/resources/public-datasets/nih-chest)
* Datasets:
    * [Google Dataset Search](https://datasetsearch.research.google.com/)
    * [Awesome Public Datasets on GitHub](https://github.com/awesomedata/awesome-public-datasets)
* Pandas:
    * [10 minutes to Pandas tutorial](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html)

## Imports

In [None]:
import os
import urllib.request
from multiprocessing import Pool, cpu_count

import pandas as pd
import tqdm
from PIL import Image

## Constants

In [None]:
#  _______________________________________
# / Set 'DOWNLOAD_IMAGES' to True only if \
# \ time you have.                        /
#  ---------------------------------------
#    \
#     \        .
#      .---.  //
#     Y|o o|Y//
#    /_(i=i)K/
#    ~()~*~()~
#     (_)-(_)

#      Darth
#      Vader
#      koala
# Master switch for every image-related step in this notebook: downloading and
# unpacking the archives, syncing files with the DataFrame, resizing, and
# writing the final archive/CSV are all guarded by this flag.
# The metadata CSV download is NOT guarded by it.
DOWNLOAD_IMAGES = False

# Root directory for everything this notebook reads or writes.
DATA_DIR = 'data/'
# Directory whose *.png files are listed, resized and archived by later cells.
IMAGES_DIR = os.path.join(DATA_DIR, 'images')
# Target size handed to PIL's Image.resize() for every image.
IMAGE_NEW_RESOLUTION = (256, 256)
# Local path of the original metadata CSV (fetched later if it is missing).
ORIGINAL_CSV_FILE = os.path.join(DATA_DIR, 'Data_Entry_2017_v2020.csv')

# Links to archives containing original NIH Clinical Center ChestX-ray8 dataset.
# Order matters: download() receives each link together with its position in
# this list and stores it as images_<position + 1, two digits>.tar.gz, so
# reordering the entries changes which file name each archive gets.
LINKS = [
    'https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz',
    'https://nihcc.box.com/shared/static/i28rlmbvmfjbl8p2n3ril0pptcmcu9d1.gz',
    'https://nihcc.box.com/shared/static/f1t00wrtdk94satdfb9olcolqx20z2jp.gz',
    'https://nihcc.box.com/shared/static/0aowwzs5lhjrceb3qp67ahp0rd1l1etg.gz',
    'https://nihcc.box.com/shared/static/v5e3goj22zr6h8tzualxfsqlqaygfbsn.gz',
    'https://nihcc.box.com/shared/static/asi7ikud9jwnkrnkj99jnpfkjdes7l6l.gz',
    'https://nihcc.box.com/shared/static/jn1b4mw4n6lnh74ovmcjb8y48h8xj07n.gz',
    'https://nihcc.box.com/shared/static/tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j.gz',
    'https://nihcc.box.com/shared/static/upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj.gz',
    'https://nihcc.box.com/shared/static/l6nilvfa9cg3s28tqv1qc1olm3gnz54p.gz',
    'https://nihcc.box.com/shared/static/hhq8fkdgvcari67vfhs7ppg2w6ni4jze.gz',
    'https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz'
]

## Download and unpack the dataset

In [None]:
def download(idx_link):
    """Download one dataset archive into DATA_DIR unless it is already there.

    The archive is first written to ``<name>.part`` and only renamed to its
    final name once the transfer has finished. An interrupted download
    therefore never leaves a truncated ``images_NN.tar.gz`` behind that a later
    run would mistake for a complete file and skip.

    Args:
        idx_link: ``(idx, link)`` tuple. ``idx`` is the zero-based position of
            ``link`` in LINKS; the archive is stored as
            ``images_<idx + 1, two digits>.tar.gz``.

    Returns:
        None, both when the file is skipped and when it is downloaded.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer;
        the partial file is removed before the exception propagates.
    """
    idx, link = idx_link
    fn = 'images_%02d.tar.gz' % (idx+1)
    target = os.path.join(DATA_DIR, fn)
    if os.path.exists(target):
        print('File {} already exists. Skipping...'.format(fn))
        return None
    print('downloading {}...'.format(fn))

    # The '.part' suffix keeps the temporary file out of the
    # "images_*.tar.gz" pattern used when the archives are unpacked.
    partial = target + '.part'
    try:
        urllib.request.urlretrieve(link, partial)
        os.replace(partial, target)
    finally:
        # Only still present when the transfer (or the rename) failed.
        if os.path.exists(partial):
            os.remove(partial)

In [None]:
# cpu_count() is also the size of the worker pool created below.
print('Using {} threads'.format(cpu_count()))

# Make sure the target directory exists before anything is written into it.
os.makedirs(DATA_DIR, exist_ok=True)

if DOWNLOAD_IMAGES:
    with Pool(cpu_count()) as p:
        # enumerate() yields the (idx, link) pairs that download() unpacks;
        # list() drains the lazy imap iterator so the progress bar advances.
        r = list(tqdm.tqdm(p.imap(download, enumerate(LINKS)), total=len(LINKS)))

In [None]:
# Unpack every downloaded images_*.tar.gz archive into DATA_DIR.
if DOWNLOAD_IMAGES:
    # Full paths of all files in DATA_DIR named images_*.tar.gz.
    archives = [os.path.join(DATA_DIR, archive) for archive in os.listdir(DATA_DIR) if archive.startswith('images_') and archive.endswith('.tar.gz')]

    # The mere existence of IMAGES_DIR is taken as "already unpacked".
    # NOTE(review): an interrupted earlier unpack also leaves IMAGES_DIR behind,
    # in which case unpacking is skipped here; delete the directory to retry.
    if not os.path.exists(IMAGES_DIR):
        pbar = tqdm.tqdm(archives)
        for archive in pbar:
            pbar.set_description('Unpacking {}'.format(archive), refresh=True)
            # IPython shell escape; $archive and $DATA_DIR are expanded from
            # the Python variables of the same name.
            # Presumably the archives extract into IMAGES_DIR - TODO confirm.
            ! tar -xzf $archive -C $DATA_DIR
    else:
        print('{} exists, skipping unpacking'.format(IMAGES_DIR))

In [None]:
# Fetch the original metadata CSV once; this step runs regardless of
# DOWNLOAD_IMAGES.
if not os.path.exists(ORIGINAL_CSV_FILE):
    # IPython shell escape calling the gdown command-line tool with a file id;
    # $ORIGINAL_CSV_FILE is expanded from the Python variable.
    # NOTE(review): gdown is not installed by this notebook - assumes the
    # runtime already provides it; confirm for non-Colab environments.
    ! gdown --id 1FVTrdLVjPvJKtU60a75Z119aABOy-YbB -O $ORIGINAL_CSV_FILE
else:
    print('Original CSV file ({}) already exists.'.format(ORIGINAL_CSV_FILE))

## Explore the dataset

Tasks:

1. Load the CSV file and display its content.
1. Explore the dataset. Check the `Finding Labels` column.

In [None]:
## Load the CSV file and display its content


In [None]:
## Explore the dataset. Check the Finding Labels column.


## Decrease number of samples

Tasks:

1. Remove samples with multiple labels (hint: multiple labels are divided with |)
1. Remove samples with labels from "V2" version of the dataset. **Keep the `No Finding` category!**
1. Randomly drop 50,000 samples with the `No Finding` label

### Remove samples with multiple labels (hint: multiple labels are divided with |)

In [None]:
## Remove samples with multiple labels (hint: multiple labels are divided with |)


### Remove samples with `FINDINGS_V2` labels

In [None]:
# Finding labels treated as the "v1" label set in this workshop.
FINDINGS_V1 = [
    'Atelectasis',
    'Cardiomegaly',
    'Effusion',
    'Infiltration',
    'Mass',
    'Nodule',
    'Pneumonia',
    'Pneumothorax',
]

# Finding labels treated as the "v2" label set. 'No Finding' is deliberately
# left out of this list: that class is kept for this workshop.
FINDINGS_V2 = [
    'Hernia',
    'Emphysema',
    'Pleural_Thickening',
    'Fibrosis',
    'Consolidation',
    'Edema',
]

In [None]:
## Remove samples with labels from "V2" version of the dataset. Keep the No Finding category!


### Randomly drop 50,000 samples with the `No Finding` label

In [None]:
## Randomly drop 50.000 samples with `No Finding` label


## Sync DF with images

In [None]:
def sync_files_csv():
    """Make the PNG files in IMAGES_DIR and the rows of ``df`` agree.

    Works on the module-level DataFrame ``df`` (expected to have been created
    by the earlier exercise cells) and modifies it in place:

    * PNG files whose name is not listed in ``df['Image Index']`` are deleted
      from IMAGES_DIR.
    * Rows of ``df`` whose ``'Image Index'`` has no PNG file in IMAGES_DIR are
      dropped.

    Names are compared by exact equality and the rows are dropped in a single
    pass, instead of running one regex scan of the whole frame per name.

    Returns:
        None. Counts before and after are printed.
    """
    images = {f for f in os.listdir(IMAGES_DIR) if f.endswith('.png')}
    image_index = set(df['Image Index'])
    print('sync_files_csv - Before images: {} ; Before CSV: {}'.format(len(images), len(df)))

    files_without_row = images - image_index
    rows_without_file = image_index - images
    print('Files without CSV row: {} ; CSV rows without file: {}'.format(
        len(files_without_row), len(rows_without_file)))

    for image in tqdm.tqdm(sorted(files_without_row)):
        try:
            os.remove(os.path.join(IMAGES_DIR, image))
        except FileNotFoundError:
            # Already gone (e.g. removed concurrently); nothing left to do.
            pass

    # One vectorised membership test replaces a per-name regex match.
    stale_rows = df.index[df['Image Index'].isin(rows_without_file)]
    df.drop(stale_rows, inplace=True)

    images = {f for f in os.listdir(IMAGES_DIR) if f.endswith('.png')}
    print('sync_files_csv - After images: {} ; After CSV: {}'.format(len(images), len(df)))

# Reconcile files and DataFrame only when the image steps are enabled;
# sync_files_csv() lists IMAGES_DIR, which presumably only exists once the
# archives have been unpacked.
if DOWNLOAD_IMAGES:
    sync_files_csv()

## Resize images

In [None]:
def resize(img_path):
    """Resize one image in place to IMAGE_NEW_RESOLUTION.

    The file at ``img_path`` is overwritten with the resized PNG. If anything
    goes wrong while opening, resizing or saving, the error is reported and the
    file is removed.

    Args:
        img_path: Path of the image file to resize.

    Returns:
        None.
    """
    try:
        # Close the source file before the same path is overwritten below.
        with Image.open(img_path) as img:
            img_resized = img.resize(IMAGE_NEW_RESOLUTION)
        img_resized.save(img_path, "PNG")
    except Exception as ex:
        # Report the path we were given; no `img_name` exists in this scope.
        print('Error when processing {}. Removing image. Exception: {}'.format(img_path, ex))
        try:
            os.remove(img_path)
        except FileNotFoundError:
            # The failure may have been that the file is missing in the first place.
            pass

In [None]:
if DOWNLOAD_IMAGES:
    # Full paths of every PNG currently in IMAGES_DIR.
    images = [
        os.path.join(IMAGES_DIR, img_name)
        for img_name in os.listdir(IMAGES_DIR)
        if img_name.endswith('.png')
    ]

    print('Resizing images to {}'.format(IMAGE_NEW_RESOLUTION))

    with Pool(cpu_count()) as p:
        # list() drains the lazy imap iterator so every image gets processed
        # and the progress bar advances.
        r = list(tqdm.tqdm(p.imap(resize, images), total=len(images)))

## Save results

In [None]:
if DOWNLOAD_IMAGES:
    archive = os.path.join(DATA_DIR, 'nih_chest_xray_single_9c_{}x{}.tar.gz'.format(IMAGE_NEW_RESOLUTION[0], IMAGE_NEW_RESOLUTION[1]))

    if not os.path.exists(archive):
        ! tar -czf $archive $IMAGES_DIR
    else:
        print('Archive {} already exists'.format(archive))
    
    df.reindex()
    df.to_csv(os.path.join(DATA_DIR, 'nih_chest_xray_single_9c.csv'))