## Data Preparation

This notebook prepares the data that the models are trained on, and is run at the beginning of each model.

In [1]:
import numpy as np  # linear alg
import pandas as pd  # data processing 
import os
from glob import glob  # file searching
%matplotlib inline
import matplotlib.pyplot as plt  # plotting

In [3]:
# Locate the dataset on disk: the image folder and the label CSV both live
# under PATH. The relative location is resolved against the current working
# directory by os.path.abspath.
PATH = os.path.abspath('../../data')
SOURCE_IMAGES = os.path.join(PATH, "images")

# Every PNG directly inside the image folder.
png_pattern = os.path.join(SOURCE_IMAGES, "*.png")
images = glob(png_pattern)

# NOTE(review): an earlier revision read 'sample_labels.csv' here instead;
# presumably the label file for a smaller sample set -- confirm before switching.
labels_csv = os.path.join(PATH, 'Data_Entry_2017.csv')
xray_labels_df = pd.read_csv(labels_csv)

In [5]:
# Index the discovered image files by bare file name so they can be matched
# against the 'Image Index' column of the label table.
all_image_paths = {os.path.basename(x): x for x in images}
image_names = [os.path.basename(x) for x in images]

# Keep only rows that have a corresponding image in the images folder.
has_image = xray_labels_df['Image Index'].isin(image_names)

# Drop images with multiple findings. Findings are '|'-separated, so match the
# separator literally (regex=False): '|' is the alternation operator in a
# regex, and escaping it as '\|' inside a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
single_finding = ~xray_labels_df['Finding Labels'].str.contains('|', regex=False)

# Take an explicit copy so that the column added below is written to an
# independent frame rather than to a slice of the original
# (avoids pandas' SettingWithCopyWarning).
xray_labels_df = xray_labels_df[has_image & single_finding].copy()
print('Number of finding labels: {}'.format(len(xray_labels_df['Finding Labels'])))

# Add the path to all images to the dataframe
xray_labels_df['path'] = xray_labels_df['Image Index'].map(all_image_paths.get)
xray_labels_df.head()

Number of finding labels: 16297


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
84999,00020945_050.png,Infiltration,50,20945,34,M,AP,3056,2544,0.139,0.139,,/Users/aculanay/Desktop/Spring 2018/CSC665-AI/...
85000,00020945_051.png,Infiltration,51,20945,34,M,AP,3056,2544,0.139,0.139,,/Users/aculanay/Desktop/Spring 2018/CSC665-AI/...
85001,00020945_052.png,No Finding,52,20945,34,M,AP,3056,2544,0.139,0.139,,/Users/aculanay/Desktop/Spring 2018/CSC665-AI/...
85002,00020945_053.png,No Finding,53,20945,34,M,AP,3056,2544,0.139,0.139,,/Users/aculanay/Desktop/Spring 2018/CSC665-AI/...
85003,00020945_054.png,No Finding,54,20945,34,M,AP,3056,2544,0.139,0.139,,/Users/aculanay/Desktop/Spring 2018/CSC665-AI/...


In [6]:
from itertools import chain

# Blank out the 'No Finding' placeholder so that scans without a finding carry
# no label at all.
xray_labels_df['Finding Labels'] = xray_labels_df['Finding Labels'].map(lambda x: x.replace('No Finding', ''))

# Split each row's '|'-separated findings once; both the label vocabulary and
# the indicator columns below are derived from these tokens.
finding_tokens = xray_labels_df['Finding Labels'].map(lambda x: x.split('|'))

# Sorted vocabulary of distinct, non-empty finding names (plain Python strings).
# A single filter is used here so that every entry of all_labels is guaranteed
# to get an indicator column below.
all_labels = sorted(set(chain.from_iterable(finding_tokens)) - {''})

# One indicator column per finding (one-hot encoding): 1.0 when the finding is
# one of the row's labels, 0.0 otherwise. Membership is tested against the
# split tokens, not the raw string, so a label whose name is contained in
# another label's name cannot produce a false positive.
for c_label in all_labels:
    # The lambda is consumed immediately by .map, so capturing the loop
    # variable is safe here.
    xray_labels_df[c_label] = finding_tokens.map(lambda tokens: 1.0 if c_label in tokens else 0.0)

print(all_labels)
xray_labels_df.sample(5)

['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
99387,00026289_001.png,Infiltration,1,26289,35,F,PA,2822,2745,0.143,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
101936,00027113_000.png,,0,27113,26,F,PA,2766,2669,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99446,00026316_000.png,,0,26316,42,F,PA,2992,2991,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86886,00021440_000.png,,0,21440,54,M,PA,2732,2531,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103313,00027553_002.png,,2,27553,39,F,PA,2458,2577,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Findings need strictly more than this many positive cases to be kept.
MIN_CASES = 200

# Walk the candidate labels once, recording the survivors together with their
# positive-case counts for the summary line below.
kept_labels = []
kept_counts = []
for candidate in all_labels:
    positives = xray_labels_df[candidate].sum()
    if positives > MIN_CASES:
        kept_labels.append(candidate)
        kept_counts.append((candidate, int(positives)))
all_labels = kept_labels

print('Labels ({})'.format(len(all_labels)), kept_counts)

Labels (7) [('Atelectasis', 692), ('Effusion', 700), ('Infiltration', 2292), ('Mass', 310), ('Nodule', 441), ('Pleural_Thickening', 207), ('Pneumothorax', 445)]


In [None]:
# Weighted down-sampling of the label table.
SAMPLE_SIZE = 8000    # upper bound on the number of rows to keep
SAMPLE_SEED = 42      # fixed seed so the drawn subset is reproducible
WEIGHT_OFFSET = 4e-2  # added to every weight so rows without findings can still be drawn

# Base weight = number of findings on the row (0 when the label is blank).
finding_counts = xray_labels_df['Finding Labels'].map(lambda x: len(x.split('|')) if len(x) > 0 else 0)
sample_weights = finding_counts.values + WEIGHT_OFFSET
sample_weights /= sample_weights.sum()

# Sampling is without replacement, so asking for more rows than exist raises a
# ValueError; cap the request at the number of available rows.
n_rows = min(SAMPLE_SIZE, len(xray_labels_df))
xray_labels_df = xray_labels_df.sample(n_rows, weights=sample_weights, random_state=SAMPLE_SEED)

In [8]:
# Collect each row's values for the columns listed in `all_labels` into one
# array and store it in a new 'disease_vec' column.
# `apply(..., 1)` invokes the lambda once per row; the lambda wraps the array
# in a one-element list and the following `.map(lambda x: x[0])` unwraps it.
# NOTE(review): the wrap/unwrap presumably stops pandas from expanding the
# returned array into separate columns -- confirm before simplifying.
xray_labels_df['disease_vec'] = xray_labels_df.apply(lambda x: [x[all_labels].values], 1).map(lambda x: x[0])
