# Exploratory Data Analysis of NIH Dataset

In [None]:
# common imports
import os
import numpy as np
import datetime
import time
import matplotlib.pylab as plt
import pandas as pd
from glob import glob
from pathlib import Path
from functools import partial
import plotly.express as px

# Reduce TensorFlow's native log output; set before TensorFlow is imported.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# Ask TensorFlow to grow its GPU memory use on demand instead of occupying
# all of the VRAM up front.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# TensorFlow >= 2.0 is required
import tensorflow as tf
from tensorflow import keras
# Compare the major version as a number: a plain string comparison is
# lexicographic, so e.g. "10.0" >= "2.0" would be False.
assert int(tf.__version__.split(".")[0]) >= 2, "TensorFlow >= 2.0 is required"

# load tensorboard extension
%reload_ext tensorboard

In [None]:
# Load the pre-processed NIH metadata table from disk.
processed_csv_path = '../data/processed/processed_data_entry_2017.csv'
nih_xrays_df = pd.read_csv(processed_csv_path)

In [None]:
# Preview the first five rows to check the columns that were loaded.
nih_xrays_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,image_name,finding_label,follow_up_num,patient_id,age,gender,view_position,image_width,image_height,x_spacing,y_spacing,path
0,0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,../data/raw/images_001/images/00000001_000.png
1,1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,../data/raw/images_001/images/00000001_001.png
2,2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,../data/raw/images_001/images/00000001_002.png
3,3,00000002_000.png,NoFinding,0,2,81,M,PA,2500,2048,0.171,0.171,../data/raw/images_001/images/00000002_000.png
4,4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,../data/raw/images_001/images/00000003_000.png


In [None]:
# Strip plot of patient age, coloured by the raw finding label string;
# hovering over a point shows the patient's gender.
px.strip(
    data_frame=nih_xrays_df,
    x='age',
    color='finding_label',
    hover_name='gender',
)

In [None]:
# Histogram of patient age, with bars split by the raw finding label string.
px.histogram(
    data_frame=nih_xrays_df,
    x='age',
    color='finding_label',
    hover_name='gender',
)

In [None]:
# Count how often each raw 'finding_label' string occurs and keep the 15 most
# frequent ones (value_counts sorts by descending frequency).
# The columns are named explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
label_counts = (
    nih_xrays_df['finding_label']
    .value_counts()
    .head(15)
    .rename_axis('label')
    .reset_index(name='count')
)
fig = px.bar(label_counts, x='label', y='count')
fig.show()

In [9]:
# Build the sorted array of distinct label names found in the dataset.
# A 'finding_label' value can hold several names joined by '|', so every value
# is split and the pieces are flattened into a single sequence with
# itertools.chain before duplicates are removed.
from itertools import chain
split_findings = (finding.split('|') for finding in nih_xrays_df['finding_label'])
all_labels = np.unique(list(chain.from_iterable(split_findings)))

In [10]:
# Display the array of distinct label names.
all_labels

array(['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration',
       'Mass', 'NoFinding', 'Nodule', 'Pleural_Thickening', 'Pneumonia',
       'Pneumothorax'], dtype='<U18')

In [11]:
# Drop any empty-string label and turn the array into a plain list,
# then report how many labels remain and what they are.
all_labels = [name for name in all_labels if name]
print('All Labels ({}): {}'.format(len(all_labels), all_labels))

All Labels (15): ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'NoFinding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


In [17]:
# Split each '|'-separated label string once, so that membership is tested
# against whole label names instead of by substring search in the raw string.
finding_sets = nih_xrays_df['finding_label'].map(lambda finding: set(finding.split('|')))

# One indicator column per label: 1.0 when the image carries it, otherwise 0.0.
for label in all_labels:
    nih_xrays_df[label] = finding_sets.map(lambda labels: 1.0 if label in labels else 0.0)

# Gather the indicator columns (in all_labels order) into one vector per row.
# This is done in a single pass rather than with a row-wise apply, and
# dtype=object keeps the per-row arrays in the same form as before.
indicator_matrix = nih_xrays_df[all_labels].to_numpy(dtype=object)
nih_xrays_df['disease_vector'] = pd.Series(list(indicator_matrix), index=nih_xrays_df.index)
    

In [18]:
# Inspect the label vector of the first row as a sanity check.
nih_xrays_df.iloc[0]['disease_vector']

array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0], dtype=object)

In [19]:
# Sum each label's indicator column to get the number of images carrying it.
# Every total is wrapped in a one-element list so the dict maps
# label -> [total].
finding_label_counts = {
    label: [nih_xrays_df[label].sum()]
    for label in all_labels
}

In [20]:
# Turn the label -> [total] dict into a table: one row per label, the total in
# a 'count' column, and the label moved out of the index by reset_index().
finding_label_counts_df = pd.DataFrame.from_dict(
    finding_label_counts,
    orient='index',
    columns=['count'],
).reset_index()
# Show the resulting object's type.
type(finding_label_counts_df)

pandas.core.frame.DataFrame

In [21]:
# Bar chart of the number of images per label
# ('index' is the label column created by reset_index()).
fig = px.bar(
    data_frame=finding_label_counts_df,
    x='index',
    y='count',
)
fig.show()

In [None]:
 # Apply the min_cases logic
# When the flag is on, keep only the labels whose image total is strictly
# greater than MIN_CASES, then print how many survive and their totals.
MIN_CASES_FLAG = True
MIN_CASES = 500
if MIN_CASES_FLAG:
    # Compute each label's total once and reuse it for filtering and reporting.
    case_totals = {label: nih_xrays_df[label].sum() for label in all_labels}
    all_labels = [label for label, total in case_totals.items() if total > MIN_CASES]
    print(f'finding labels with min cases: {len(all_labels)}')  
    print([(label, int(case_totals[label])) for label in all_labels])

In [None]:
# Count the distinct patient ids and report the result.
patient_count = nih_xrays_df['patient_id'].nunique()
print('Number of unique patients:' , patient_count)

In [None]:
# The label distribution is very imbalanced, so resample the table into a more
# reasonable collection that favours images with more findings.
# Weight of an image = 0.04 + number of actual findings. 'NoFinding' (and an
# empty label) counts as zero findings, so such images keep only the small
# 0.04 base weight instead of being weighted like a single-finding image.
finding_counts = nih_xrays_df['finding_label'].map(
    lambda labels: sum(1 for name in labels.split('|') if name and name != 'NoFinding')
)
sample_weights = finding_counts.values + 4e-2
sample_weights /= sample_weights.sum()

# Sample without replacement; never ask for more rows than the table holds.
nih_xrays_df = nih_xrays_df.sample(min(40000, len(nih_xrays_df)), weights=sample_weights)
nih_xrays_df.head()

In [None]:
# Bar chart of the 18 most frequent raw 'finding_label' strings in the
# current table, with the label text written vertically under each bar.
label_counts = nih_xrays_df['finding_label'].value_counts().iloc[:18]
# Bars and their ticks share the same x positions: 0.5, 1.5, 2.5, ...
bar_centers = np.arange(len(label_counts)) + 0.5
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax1.bar(bar_centers, label_counts)
ax1.set_xticks(bar_centers)
_ = ax1.set_xticklabels(label_counts.index, rotation=90)