In [71]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) lists the entries of that directory.
from glob import glob
import os
# Sanity check: show what the input directory contains before anything is loaded.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['sample', 'sample_labels.csv']


In [72]:
# Root of the input data: ../input/ (resolved to an absolute path)
PATH = os.path.abspath(os.path.join('..', 'input'))

# Directory holding the sample images: ../input/sample/images/
SOURCE_IMAGES = os.path.join(PATH, "sample", "images")

# Every PNG in ../input/sample/images/. glob() returns matches in arbitrary,
# filesystem-dependent order, so sort to get a reproducible listing.
images = sorted(glob(os.path.join(SOURCE_IMAGES, "*.png")))

# Load labels. The CSV is resolved against PATH, like the image directory,
# instead of repeating a separate hard-coded relative path.
labels = pd.read_csv(os.path.join(PATH, 'sample_labels.csv'))

In [73]:
# Peek at the first five rows of the freshly loaded label table.
labels.head(5)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139,0.139
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168,0.168
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168,0.168
3,00000030_001.png,Atelectasis,1,30,079Y,M,PA,2992,2991,0.143,0.143
4,00000032_001.png,Cardiomegaly|Edema|Effusion,1,32,055Y,F,AP,2500,2048,0.168,0.168


### Because the accuracy of this dataset's labels has been questioned (see https://lukeoakdenrayner.wordpress.com/2017/12/18/the-chestxray14-dataset-problems/), I want to keep this analysis meaningful by predicting only the most reliable distinction: whether an image is labelled No Finding or has some finding. Even this binary signal is valuable, because it can help doctors confirm their impression when an image is ambiguous.

### Here I want to dig into the age distribution of patients with findings, for both female and male patients.

In [None]:
# The trailing character of 'Patient Age' is the unit the age was recorded in.
age_unit = labels['Patient Age'].str[-1:]
labels['Age Type'] = age_unit
labels['Age Type'].unique()  # => Y, M and D

In [None]:
# Everything before the trailing unit character is the numeric part of the age.
age_digits = labels['Patient Age'].str[:-1]
labels['Age'] = age_digits.astype(int)

In [None]:
# Convert ages recorded in months ('M') or days ('D') to whole years.
# The numeric part is re-read from the raw 'Patient Age' string rather than
# from the current 'Age' column, so re-running this cell never divides an
# already-converted value a second time.
raw_age = labels['Patient Age'].str[:-1].astype(int)
in_months = labels['Age Type'] == 'M'
in_days = labels['Age Type'] == 'D'
# Integer floor division equals the former "divide, then truncate" for the
# non-negative ages handled here, and keeps the column integer-typed.
labels.loc[in_months, 'Age'] = raw_age[in_months] // 12
labels.loc[in_days, 'Age'] = raw_age[in_days] // 365

In [None]:
# Inspect the rows whose age was recorded in months.
labels.loc[labels['Age Type'].eq('M')]

In [None]:
# How often each follow-up number occurs among the 'No Finding' rows.
no_finding = labels['Finding Labels'].eq('No Finding')
labels.loc[no_finding, 'Follow-up #'].value_counts()

In [None]:
# Age of the 'No Finding' rows with follow-up number 148.
is_no_finding = labels['Finding Labels'].eq('No Finding')
is_followup_148 = labels['Follow-up #'].eq(148)
labels.loc[is_no_finding & is_followup_148, 'Age']

In [None]:
# Collapse the multi-label string into two classes: any row whose labels
# contain 'No Finding' versus everything else.
mentions_no_finding = labels['Finding Labels'].str.contains('No Finding', regex=False)
labels['NumLabel'] = mentions_no_finding.map({True: 'No Finding', False: 'Found!'})

In [None]:
# Number of rows whose label is anything other than exactly 'No Finding'.
has_finding = labels['Finding Labels'].ne('No Finding')
int(has_finding.sum())

In [None]:
# Full path of each image file, built from the image directory and file name.
labels['Path'] = [os.path.join(SOURCE_IMAGES, file_name)
                  for file_name in labels['Image Index']]

In [None]:
# First five rows again, now including the derived columns added above.
labels.head(5)

In [None]:
# load in all of the images
from skimage.io import imread
# Applies imread to every entry of 'Path' and stores each result in a new
# 'image' column, so all loaded images stay in memory from here on.
# NOTE(review): the loaded arrays may not all share one shape — the shape
# count in the following cell is there to check that.
labels['image'] = labels['Path'].map(imread)

In [None]:
# Count how many loaded images there are of each array shape.
image_shapes = labels['image'].map(np.shape)
image_shapes.value_counts()

In [None]:
import matplotlib.pyplot as plt

# Grid with one row of axes per 'NumLabel' class and n_samples images per row.
n_samples = 5
fig, m_axs = plt.subplots(2, n_samples, figsize=(3 * n_samples, 3 * 2))
classes = labels.sort_values(['NumLabel']).groupby('NumLabel')
for axes_row, (class_name, class_rows) in zip(m_axs, classes):
    # Only the left-most panel of each row carries the class name.
    axes_row[0].set_title(class_name)
    # Fixed seed so the same images are drawn on every run.
    drawn = class_rows.sample(n_samples, random_state=2018)
    for axis, (_, record) in zip(axes_row, drawn.iterrows()):
        axis.imshow(record['image'], cmap='bone')
        axis.axis('off')
fig.savefig('category_samples.png', dpi=300)

In [None]:
# Re-encode the class as an integer target: 0 when the labels contain
# 'No Finding', 1 otherwise. This overwrites the text version of 'NumLabel'.
mentions_no_finding = labels['Finding Labels'].str.contains('No Finding', regex=False)
labels['NumLabel'] = (~mentions_no_finding).astype('int64')

In [None]:
import cv2
def get_data(frame=None, source_shape=(1024, 1024), target_size=(150, 150)):
    """Collect resized images and their labels from a label table.

    Only rows whose ``'image'`` array has exactly ``source_shape`` are used;
    rows with any other shape are skipped, so the returned lists can be
    shorter than the table.

    Args:
        frame: DataFrame with ``'image'`` and ``'NumLabel'`` columns.
            Defaults to the notebook-level ``labels`` table.
        source_shape: Array shape an image must have to be kept.
        target_size: Output size handed to ``cv2.resize`` as its ``dsize``
            argument, i.e. ``(width, height)``.

    Returns:
        A tuple ``(imgs_x, label_y)`` of two equally long lists: the resized
        images and the matching ``'NumLabel'`` values, in table order.
    """
    if frame is None:
        frame = labels
    wanted_shape = tuple(source_shape)
    out_size = tuple(target_size)
    imgs_x = []
    label_y = []
    # Walk only the two needed columns; iterrows() would build a full Series
    # for every row just to read two fields.
    for image, num_label in zip(frame['image'], frame['NumLabel']):
        if image.shape != wanted_shape:
            continue
        imgs_x.append(cv2.resize(image, out_size, interpolation=cv2.INTER_CUBIC))
        label_y.append(num_label)
    return imgs_x, label_y

In [None]:
# Stack the resized images into one array, divide by 255, and append a
# trailing axis of length 1; the labels become a plain array.
X, Y = get_data()
X = (np.asarray(X) / 255)[..., np.newaxis]
Y = np.asarray(Y)

In [None]:
# Shape of the stacked image array built in the previous cell.
np.shape(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the seed is fixed for repeatability.
split_parts = train_test_split(X, Y, test_size=0.3, random_state=2018)
X_train, X_test, y_train, y_test = split_parts
print('Train Shape', X_train.shape, 'test shape', X_test.shape)

In [None]:
from keras.models import Sequential
from keras.optimizers import SGD
from keras.utils.vis_utils import plot_model
from keras.layers import Dropout
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import RMSprop


# Small CNN for the binary "finding vs. no finding" task on 150x150
# single-channel images.
model = Sequential()
# conv1
# `padding` is the Keras 2 name of the argument formerly called `border_mode`.
model.add(Conv2D(32, kernel_size=(6, 6),
                 activation='relu',
                 padding='same',
                 input_shape=(150, 150, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))

# conv2
model.add(Conv2D(64, kernel_size=(2, 2),
                 activation='relu',
                 padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# fc
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: output is the predicted probability of class 1.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])

# Training must actually run here: the plotting cell below reads `history`,
# which would otherwise be undefined on a fresh Restart & Run All.
history = model.fit(X_train, y_train, epochs=30, batch_size=40, verbose=1, validation_split=0.20)

In [None]:
import matplotlib.pyplot as plt


def _plot_history_metric(hist, metric, title, ylabel):
    """Plot one training metric and its validation counterpart per epoch.

    Args:
        hist: Mapping from metric name to a per-epoch list of values
            (the ``history`` attribute of the object returned by ``fit``).
        metric: Key of the training curve; the validation curve is read
            from ``'val_' + metric``.
        title: Figure title.
        ylabel: Label for the y axis.
    """
    fig, ax = plt.subplots()
    ax.plot(hist[metric])
    ax.plot(hist['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('epoch')
    # The second curve comes from validation_split, not from the held-out
    # X_test set, so it is labelled as validation.
    ax.legend(['train', 'validation'], loc='upper left')
    plt.show()


# Older Keras releases log accuracy under 'acc'/'val_acc'; newer ones use
# 'accuracy'/'val_accuracy'. Use whichever key is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
_plot_history_metric(history.history, acc_key, 'model accuracy', 'accuracy')
# summarize history for loss
_plot_history_metric(history.history, 'loss', 'model loss', 'loss')