In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

##Import any other packages you may need here
import imageio

EDA is open-ended, and it is up to you to decide how to slice and dice your data. A good starting point is to look at the requirements for the FDA documentation in the final part of this project and let them guide (some of) the analyses you do. 

This EDA should also help to inform you of how pneumonia looks in the wild. E.g. what other types of diseases it's commonly found with, how often it is found, what ages it affects, etc. 

Note that this NIH dataset was not specifically acquired for pneumonia. So, while this is a representation of 'pneumonia in the wild,' the prevalence of pneumonia may be different if you were to take only chest x-rays that were acquired in an ER setting with suspicion of pneumonia. 

Perform the following EDA:
* The patient demographic data such as gender, age, patient position, etc. (as available)
* The x-ray views taken (i.e. view position)
* The number of cases including: 
    * number of pneumonia cases,
    * number of non-pneumonia cases
* The distribution of other diseases that are comorbid with pneumonia
* Number of diseases per patient
* Pixel-level assessments of the imaging data for healthy & disease states of interest (e.g. histograms of intensity values) and compare distributions across diseases.

Note: use the full NIH data to perform the first few EDA items and use `sample_labels.csv` for the pixel-level assessments. 

In [None]:
## Load the full NIH metadata table and the smaller labelled sample that is
## used for the pixel-level analysis. Paths are relative to the working directory.

all_xray_df = pd.read_csv('data/Data_Entry_2017.csv')
data_sample = pd.read_csv('sample_labels.csv')

# Only the last expression of a cell is rendered automatically, so both
# previews are shown explicitly with display() (provided by IPython/Jupyter).
display(all_xray_df.sample(3))
display(data_sample.sample(3))

Also, **describe your findings and how you will set up the model training based on them.**

In [None]:
## EDA
# Work on a copy: the raw table stays untouched and this cell gives the same
# result every time it is re-run (encoding an alias in place would turn every
# value into 0 on the second run, because "M" is no longer present).
xray = all_xray_df.copy()
# Encode gender numerically: 1 for "M", 0 for every other value.
xray["Patient Gender"] = np.where(xray["Patient Gender"] == "M", 1, 0)
plt.hist(xray["Patient Gender"], bins=100);

In [None]:
def distributions(df=None):
    """Draw a 2x2 overview of the main metadata columns.

    Panels: histograms of "Follow-up #" (50 bins) and "Patient Age" (20 bins),
    and count plots of "Patient Gender" and "View Position".

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to plot. Defaults to the notebook-level ``xray`` frame, looked up
        at call time so that later re-assignments of ``xray`` are picked up.
    """
    if df is None:
        df = xray

    # Explicit axes instead of the pyplot state machine; a fixed size plus
    # tight_layout keeps the four panels from overlapping.
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    histogram_panels = [
        ("Follow-up #", 50, axes[0, 0]),
        ("Patient Age", 20, axes[0, 1]),
    ]
    for column, n_bins, ax in histogram_panels:
        # Plain matplotlib histogram (the deprecated sns.distplot is avoided);
        # NaNs are dropped because Axes.hist cannot bin them.
        ax.hist(df[column].dropna(), bins=n_bins, color='blue', edgecolor='black')
        ax.set_xlabel(column)

    count_panels = [
        ("Patient Gender", axes[1, 0]),
        ("View Position", axes[1, 1]),
    ]
    for column, ax in count_panels:
        # Pass the data by keyword: newer seaborn releases interpret the first
        # positional argument as `data`, not `x`.
        sns.countplot(x=df[column], color='blue', ax=ax)

    fig.tight_layout()
distributions()

In [None]:
# Some of the data does not make sense: "Follow-up #" and "Patient Age" are the two culprits.
# Start with the row(s) holding the largest follow-up number.
xray.loc[xray["Follow-up #"].eq(xray["Follow-up #"].max())]

In [None]:
# Cap the number of rendered rows, then list every record of patient 10007.
pd.set_option('display.max_rows', 10)
xray.loc[xray["Patient ID"].eq(10007)]

# This patient seems to have shown every single disease in their 183 follow-ups! Let's assume this data is accurate.

In [None]:
# Next, inspect "Patient Age": what is the largest recorded value?
xray.loc[:, "Patient Age"].max()

In [None]:
# Maximum age is 414? That does not make sense!
# List every distinct age in ascending order so the implausible values stand out
# (np.transpose on a Python set only wraps the set in a 0-d object array).
np.sort(xray["Patient Age"].unique())

In [None]:
# Looks like every age after 95 is incorrect. We should drop this data.
# .copy() makes the result an independent frame, so columns added to it in
# later cells do not raise SettingWithCopyWarning.
xray = xray[xray["Patient Age"] <= 95].copy()

In [None]:
# Let's check the distributions again; distributions() is re-run on the current `xray` frame.
distributions()

It looks like most of our patients are around 50-60 years old.
There are slightly more men than women.
There are more PA views than AP views.
Most of the images are from the patient's first visit.

In [None]:
# Now let's take a look at the findings.
# First, extract the labels from the "Finding Labels" column and one-hot encode them.

diseases = ["Atelectasis", "Consolidation", "Infiltration", "Pneumothorax", "Edema", "Emphysema", "Fibrosis", "Effusion", "Pneumonia", "Pleural_Thickening", "Cardiomegaly", "Nodule", "Mass", "Hernia", "No Finding"]

# One 0/1 indicator column per label: 1 when the label text occurs anywhere in
# the row's "Finding Labels" string (plain substring test, no regex).
for label in diseases:
    label_present = xray["Finding Labels"].str.contains(label, regex=False)
    xray[label] = label_present.astype(int)

In [None]:
# Now see how many pneumonia vs non-pneumonia cases we have.
# Use the one-hot "Pneumonia" column for both numbers: comparing "Finding Labels"
# to the exact string "Pneumonia" would miss every case where pneumonia appears
# together with other findings, and the two counts would not add up to the total.
pneumonia_cases = xray["Pneumonia"].sum()
print("Cases with Pneumonia: ", pneumonia_cases)
print("Cases without Pneumonia: ", xray["Pneumonia"].count() - pneumonia_cases)

In [None]:
# Now let's look at how the diseases are distributed. Since the focus is Pneumonia,
# keep only the rows whose findings include Pneumonia (alone or with comorbidities)
# and plot the 20 most frequent label combinations.
has_pneumonia = xray["Finding Labels"].str.contains("Pneumonia")
pneumonia = xray.loc[has_pneumonia, "Finding Labels"]
pneumonia.value_counts().head(20).plot.bar()

In [None]:
# Pneumonia with no comorbidity has the highest prevalence compared to those with comorbidities.
# Now let's look at the number of diseases recorded in each row.

# Sum the one-hot disease columns by name rather than by position
# (xray.columns[12:] silently changes meaning whenever a column is added), and
# leave out "No Finding": it marks the absence of disease, so it must not count as one.
disease_columns = [label for label in diseases if label != "No Finding"]
xray["# of diseases"] = xray[disease_columns].sum(axis=1)
sns.countplot(x=xray["# of diseases"]);

In [None]:
# Most rows have at most one disease recorded (a "No Finding" row has none).

# Now to check distribution of pixel data on a sample of images

In [None]:
#data_sample["Image Index"]

In [None]:
# Get the absolute path of every image below ./data and attach it to data_sample.
# os.path.join keeps this portable (the former "\data\\" suffix only worked on Windows).
image_root = os.path.join(os.getcwd(), "data")

# File name -> absolute path. If the same file name occurs more than once,
# the first occurrence found by os.walk wins.
path_by_name = {}
for dirpath, _, filenames in os.walk(image_root):
    for filename in filenames:
        if filename.lower().endswith(".png"):
            path_by_name.setdefault(filename, os.path.abspath(os.path.join(dirpath, filename)))
paths_list = list(path_by_name.values())

# Look each row up by its own "Image Index" so every path lands on the matching
# row; assigning a list in directory-walk order would pair rows with the wrong
# files. Rows whose image is not on disk get NaN.
data_sample["File Path"] = data_sample["Image Index"].map(path_by_name)

n_missing = int(data_sample["File Path"].isna().sum())
if n_missing:
    print("Images listed in data_sample but not found on disk: ", n_missing)

In [None]:
# Intensity for lungs with No Finding
def label_distribution(label, ax=None):
    """Plot the intensity histogram of the mean image for one finding label.

    Every image in ``data_sample`` whose "Finding Labels" contains ``label`` is
    read as 8-bit grayscale, the images are averaged pixel by pixel, and the
    pixel intensities of that mean image are shown as a 256-bin histogram.

    Parameters
    ----------
    label : str
        Finding label to select, e.g. "No Finding" or "Pneumonia".
    ax : matplotlib.axes.Axes, optional
        Axes to draw on; a new figure is created when omitted.

    Raises
    ------
    ValueError
        If no row with a usable "File Path" matches ``label``.
    """
    # Select by the requested label (it used to be hard-coded to "No Finding").
    has_label = data_sample["Finding Labels"].str.contains(label, regex=False)
    image_paths = data_sample.loc[has_label, "File Path"].dropna()
    if image_paths.empty:
        raise ValueError("No images found for label: " + str(label))

    # Accumulate in float64; the accumulator takes its shape from the first
    # image instead of assuming 1024x1024.
    pixel_sum = None
    for path in image_paths:
        pixels = np.asarray(imageio.imread(path, pilmode="L"), dtype=np.float64)
        if pixel_sum is None:
            pixel_sum = np.zeros_like(pixels)
        pixel_sum += pixels
    mean_image = pixel_sum / len(image_paths)

    if ax is None:
        _, ax = plt.subplots()
    ax.hist(mean_image.ravel(), bins=256)
    ax.set_title("Mean-image intensity distribution: " + str(label))
    ax.set_xlabel("Pixel intensity")
    ax.set_ylabel("Pixel count")
label_distribution("No Finding")

In [None]:
# Intensities for lungs with Pneumonia: call the helper with the "Pneumonia" label
label_distribution("Pneumonia")


In [None]:
# Images with Pneumonia have more pixels with medium-to-low intensities.