In [None]:
####### Assorting Downloaded Data into Trainable Format #######

In [None]:
import os
import zipfile
import tarfile
import pandas as pd

# Root of the drive that holds the downloaded archives and the extraction target.
path = '/Volumes/drive/'
archive_dir = os.path.join(path, 'im/CXR8/images')
extract_dir = os.path.join(path, 'image_data2')

# Archive suffixes '001' ... '012' (files are named images_<suffix>.tar.gz).
lis = [f'{k:03d}' for k in range(1, 13)]
for l in lis:
    fn = 'images_' + l + '.tar.gz'
    # The context manager closes the archive even when extraction fails part-way.
    with tarfile.open(os.path.join(archive_dir, fn)) as file:
        file.extractall(extract_dir)
    print(fn)

# Names of every extracted file. The label filter below referenced `dir_list`
# without it ever being defined, which raises NameError on a fresh kernel.
# Walking the tree avoids assuming how the archives lay out their contents.
dir_list = [name for _, _, names in os.walk(extract_dir) for name in names]

# Keep only the label rows whose image file was actually extracted.
labels_df = pd.read_csv("labels.csv")
labels_df['Index'] = labels_df['Image Index']
labels_df = labels_df.query('Index.isin(@dir_list)')
labels = labels_df.sort_values('Finding Labels')

In [None]:
# Category names; one sub-folder of 'train_' is created for each of them.
class_names = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation',
'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening','Hernia', 'No Finding']

# NOTE(review): neither path below is used by this cell; the folders are
# created under the relative directory 'train_'.
train_images = '/train'
train_cat = '/train_'
#creating subfolders
for i in class_names:
    # exist_ok=True keeps the cell re-runnable; without it a second run
    # raises FileExistsError on the first folder.
    os.makedirs(os.path.join('train_', i), exist_ok=True)

In [None]:
#moving the image files to their respective categories
import shutil

moved_count = 0
skipped_images = []  # (image id, reason) for every file that could not be moved
for c in class_names: # Category Name
    # Exact match on the label string: rows whose 'Finding Labels' value is
    # anything other than a single class name are not selected here.
    for i in list(labels[labels['Finding Labels']==c]['Image Index']): # Image Id
        # NOTE(review): this reads from the relative folder 'image_data/images/'
        # while the extraction cell writes to '/Volumes/drive/image_data2/' -
        # confirm the two locations are meant to differ.
        get_image = os.path.join('image_data/images/', i)
        try:
            shutil.move(get_image, 'train_/'+c)# Path to Images
        except OSError as err:
            # Still best-effort (missing source, file already in place, ...),
            # but the failure is recorded instead of silently discarded.
            # shutil.Error is a subclass of OSError, so it is covered too.
            skipped_images.append((i, str(err)))
        else:
            moved_count += 1

print("moved:", moved_count, "skipped:", len(skipped_images))

In [None]:
########################### CNN ####################################
import tensorflow as tf
import os
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [None]:
# List the physical devices TensorFlow can see and print them.
devices = tf.config.list_physical_devices()
print(devices)

In [None]:
# Training subset (validation_split=0.2) of the class-per-folder image tree,
# loaded as 128x128 RGB batches of 32.
train_ds = tf.keras.utils.image_dataset_from_directory(
    '/Users/Jag/Documents/train_/',
    validation_split=0.2,
    subset="training",
    seed=123,
    color_mode='rgb',
    image_size=(128, 128),
    batch_size=32,
)

In [None]:
########## using validation data for test purposes instead of validation #######
# Same directory, split fraction and seed as the training dataset, but the
# "validation" subset; it is used as the held-out test set.
test_ds = tf.keras.utils.image_dataset_from_directory(
    '/Users/Jag/Documents/train_/',
    validation_split=0.2,
    subset="validation",
    seed=123,
    color_mode="rgb",
    image_size=(128, 128),
    batch_size=32,
)

In [None]:
import matplotlib.pyplot as plt

# From here on class_names is the list reported by the dataset itself.
class_names = train_ds.class_names
plt.figure(figsize=(10, 10))
# Preview the first three images of one training batch.
for images, labels in train_ds.take(1):
    for i in range(3):
        ax = plt.subplot(3, 3, i + 1)
        # `v` outlives the loop (it ends up holding the last previewed image)
        # and is read again by the feature-map cells further down.
        v = images[i].numpy().astype("uint8")
        plt.imshow(v)
        plt.title(class_names[labels[i]])
        plt.axis("off")

In [None]:
AUTOTUNE = tf.data.AUTOTUNE
# Cache once, then shuffle and prefetch. The original applied .cache() twice
# in a row, stacking a second cache on top of the first for no benefit.
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
# The test split is cached and prefetched but not shuffled.
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
num_classes = len(class_names)

# Rescaling -> three Conv2D/MaxPooling2D stages (16, 32, 64 filters)
# -> Flatten -> Dense(128) -> Dense(num_classes).
cnn_layers = [layers.Rescaling(1./255, input_shape=(128, 128, 3))]
for n_filters in (16, 32, 64):
    cnn_layers.append(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    cnn_layers.append(layers.MaxPooling2D())
cnn_layers.extend([
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes),  # no activation: the model outputs raw logits
])
model = Sequential(cnn_layers)


In [None]:
# Integer class labels with logit outputs, hence the sparse loss with from_logits=True.
cnn_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=cnn_loss, metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the CNN.
model.summary()

In [None]:
# Train for 10 epochs on train_ds; no validation data is passed, so `history`
# only records training metrics.
history = model.fit(train_ds,epochs=10)

In [None]:
# Persist the trained CNN under the path 'cnn_model'.
model.save('cnn_model')

In [None]:
# Reload the CNN from 'cnn_model'; this rebinds `model` to the loaded copy.
model = tf.keras.models.load_model('cnn_model')

In [None]:
acc = history.history['accuracy']
loss = history.history['loss']
# Take the epoch count from the recorded history instead of repeating the
# literal 10 used in model.fit, so the x-axis always matches the curves.
epochs = len(acc)
epochs_range = range(epochs)

plt.figure(figsize=(8, 8))
# Left panel: training accuracy per epoch.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.legend(loc='lower right')
plt.title('Training Accuracy')

# Right panel: training loss per epoch.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.legend(loc='upper right')
plt.title('Training Loss')
plt.show()

In [None]:
#Iterate thru all the layers of the model
for layer in model.layers:
    if 'conv' in layer.name:
        # Conv2D kernels are indexed (row, col, input channel, filter) below.
        weights, bias = layer.get_weights()

        #normalize filter values between 0 and 1 for visualization
        f_min, f_max = weights.min(), weights.max()
        filters = weights - f_min
        if f_max > f_min:
            # Skip the division for constant weights to avoid dividing by zero.
            filters = filters / (f_max - f_min)

        n_filters = filters.shape[3]
        # Show at most the first three input channels of each filter. The
        # original looped over filters.shape[0] (the kernel height), which only
        # equals the channel count by coincidence for 3x3 kernels.
        n_channels = min(filters.shape[2], 3)
        print(n_filters)
        filter_cnt = 1

        #plotting all the filters
        for i in range(n_filters):
            #get the filters
            filt = filters[:, :, :, i]
            for j in range(n_channels):
                ax = plt.subplot(n_filters, n_channels, filter_cnt)
                ax.set_xticks([])
                ax.set_yticks([])
                plt.imshow(filt[:, :, j])
                filter_cnt += 1
        plt.show()


In [None]:
import numpy as np
# Display the shape of `v`, the sample image kept by the preview cell.
np.shape(v)

In [None]:
# Feed the raw 0-255 image: `model` begins with a Rescaling(1./255) layer, so
# dividing by 255 here as well (as the original did) rescaled the input twice.
x = np.expand_dims(v.astype("float32"), axis=0)

# Every layer after the Rescaling layer. Names and outputs are taken from the
# same slice so zip() below pairs each feature map with its own layer name;
# the original took names from model.layers, shifting every title by one.
viz_layers = model.layers[1:]
successive_outputs = [layer.output for layer in viz_layers]
visualization_model = tf.keras.models.Model(inputs = model.input, outputs = successive_outputs)
# Let's run the input image through the visualization network
# to obtain all intermediate representations for the image.
successive_feature_maps = visualization_model.predict(x)
# Retrieve the names of the layers, so they can be part of the plot
layer_names = [layer.name for layer in viz_layers]
for layer_name, feature_map in zip(layer_names, successive_feature_maps):
  print(feature_map.shape)
  if len(feature_map.shape) == 4:

    # Plot feature maps for the conv / maxpool layers, not the fully-connected layers

    n_features = feature_map.shape[-1]  # number of features in the feature map
    size       = feature_map.shape[ 1]  # feature map shape (1, size, size, n_features)

    # We will tile our images in this matrix
    display_grid = np.zeros((size, size * n_features))

    # Postprocess each feature to be visually palatable
    for i in range(n_features):
      tile = feature_map[0, :, :, i]
      tile = tile - tile.mean()
      tile_std = tile.std()
      if tile_std > 0:
        # A constant map has zero std; skipping the division avoids NaNs.
        tile = tile / tile_std
      tile = np.clip(tile * 64 + 128, 0, 255).astype('uint8')
      # Tile each filter into a horizontal grid
      display_grid[:, i * size : (i + 1) * size] = tile

    # Display the grid
    scale = 20. / n_features
    plt.figure( figsize=(scale * n_features, scale) )
    plt.title ( layer_name )
    plt.grid  ( False )
    plt.imshow( display_grid, aspect='auto', cmap='viridis' )
    # NOTE(review): the same file name is written for every layer, so only the
    # last plotted layer remains on disk - confirm that is intended.
    plt.savefig("fig1.png")


In [None]:
# Sub-model that stops at model.layers[1], the layer right after Rescaling.
mod = tf.keras.models.Model(inputs=model.inputs , outputs=model.layers[1].output)
#calculating features_map
# The raw 0-255 image is fed as-is: the Rescaling layer inside `model` already
# divides by 255, so the extra x/255.0 of the original rescaled the input twice.
x = np.expand_dims(v.astype("float32"), axis = 0)
features = mod.predict(x)

fig = plt.figure(figsize=(20,15))
# One subplot per output channel, laid out on an 8x8 grid.
for i in range(1,features.shape[3]+1):

    plt.subplot(8,8,i)
    plt.imshow(features[0,:,:,i-1] , cmap='viridis')

plt.show()


In [None]:
################### Building VAE ############################
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

AUTOTUNE = tf.data.AUTOTUNE

# Training subset of the image tree, 128x128 RGB, batches of 32.
dataset = tf.keras.utils.image_dataset_from_directory(
    '/Users/Jag/Documents/train_/',
    validation_split=0.2,
    subset='training',
    seed=123,
    color_mode="rgb",
    image_size=(128, 128),
    batch_size=32,
)
normalization_layer = tf.keras.layers.Rescaling(1./255)


def _rescale_and_drop_labels(batch_images, _batch_labels):
    """Return only the images, multiplied by 1/255; the labels are discarded."""
    return normalization_layer(batch_images)


normalized_ds = dataset.map(_rescale_and_drop_labels)

# Image-only, rescaled batches, cached and prefetched for VAE training.
train_ds = normalized_ds.cache().prefetch(buffer_size=AUTOTUNE)

class Sampling(layers.Layer):
    """Sample z = z_mean + exp(0.5 * z_log_var) * epsilon from (z_mean, z_log_var)."""

    def call(self, inputs):
        mean, log_var = inputs
        # One random-normal draw per (batch row, latent dimension).
        noise_shape = (tf.shape(mean)[0], tf.shape(mean)[1])
        noise = tf.keras.backend.random_normal(shape=noise_shape)
        std_dev = tf.exp(0.5 * log_var)
        return mean + std_dev * noise




In [None]:
latent_dim = 6272

# Encoder: two stride-2 convolutions (32 then 64 filters) -> Flatten ->
# Dense(16) -> two parallel Dense heads of size latent_dim -> Sampling.
encoder_inputs = keras.Input(shape=(128, 128, 3))
x = encoder_inputs
for n_filters in (32, 64):
    x = layers.Conv2D(n_filters, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
# The model exposes all three tensors: mean, log-variance and the sampled z.
encoder = keras.Model(inputs=encoder_inputs, outputs=[z_mean, z_log_var, z], name="encoder")
encoder.summary()

In [None]:
# Decoder: Dense -> reshape to (32, 32, 8) -> two stride-2 transposed
# convolutions (256 filters each) -> 3-channel sigmoid output.
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(32 * 32 * 8, activation="relu")(latent_inputs)
x = layers.Reshape((32, 32, 8))(x)
for _ in range(2):
    x = layers.Conv2DTranspose(256, 3, activation="relu", strides=2, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(3, 3, activation="sigmoid", padding="same")(x)
decoder = keras.Model(inputs=latent_inputs, outputs=decoder_outputs, name="decoder")
decoder.summary()

In [None]:
class VAE(keras.Model):
    """Keras model that trains an encoder/decoder pair on reconstruction + KL loss.

    Three running means (total, reconstruction and KL loss) are kept as
    metrics and reported from every training step.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """The three loss trackers, in the order total, reconstruction, KL."""
        trackers = [self.total_loss_tracker]
        trackers.append(self.reconstruction_loss_tracker)
        trackers.append(self.kl_loss_tracker)
        return trackers

    def train_step(self, data):
        """Run one optimisation step on `data` and return the running losses.

        `data` is passed straight to the encoder, so it is expected to be a
        batch of inputs without labels.
        """
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            # Binary cross-entropy summed over axes 1 and 2, averaged over the batch.
            pixel_bce = keras.losses.binary_crossentropy(data, reconstruction)
            reconstruction_loss = tf.reduce_mean(tf.reduce_sum(pixel_bce, axis=(1, 2)))
            # KL term summed over the latent axis, averaged over the batch.
            kl_terms = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_terms, axis=1))
            total_loss = reconstruction_loss + kl_loss

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)

        running = {}
        running["loss"] = self.total_loss_tracker.result()
        running["reconstruction_loss"] = self.reconstruction_loss_tracker.result()
        running["kl_loss"] = self.kl_loss_tracker.result()
        return running

In [None]:
# Wrap encoder and decoder, compile with a default Adam optimizer and train
# for a single epoch on the rescaled, image-only dataset.
vae = VAE(encoder=encoder, decoder=decoder)
vae_optimizer = tf.keras.optimizers.Adam()
vae.compile(optimizer=vae_optimizer)
vae.fit(train_ds, epochs=1)

In [None]:
# Save the two halves of the VAE separately (encoder first, then decoder).
for vae_part, save_path in ((encoder, 'encoder_6272'), (decoder, 'decoder_6272')):
    vae_part.save(save_path)

In [None]:
# Reload the labelled training split (the VAE cells rebound train_ds to an
# image-only dataset).
train_ds = tf.keras.utils.image_dataset_from_directory(
  '/Users/Jag/Documents/train_/',
  image_size= (128,128),
  validation_split=0.2,
  subset="training",
  color_mode='rgb',
  batch_size= 32,
  seed=123)

import matplotlib.pyplot as plt
class_names = train_ds.class_names
plt.figure(figsize=(10, 10))
# One batch is enough: the original took three batches but drew every batch
# into the same six subplot slots, so only the last one stayed visible.
for images, labels in train_ds.take(1):
  for i in range(6):
    ax = plt.subplot(3, 3, i + 1)
    # The VAE was trained on images rescaled to [0, 1]; this dataset yields
    # raw 0-255 pixels, so rescale before encoding.
    v = np.expand_dims(images[i].numpy() / 255.0, axis = 0)
    z_mean, z_log_var, z = vae.encoder.predict(v)
    reconstruction = decoder(z)
    v = reconstruction.numpy()
    # Drop the batch axis so imshow receives a single image.
    ve = np.squeeze(v)
    plt.imshow(ve)
    plt.title(class_names[labels[i]])
    plt.axis("off")

In [None]:
import seaborn as sns
def plot_label_clusters(vae, data, labels):
    """Scatter the first two latent-mean components of `data`, coloured by `labels`.

    `data` is passed to `vae.encoder.predict`; only the first of its three
    outputs (the latent means) is used. Both plotted components are divided
    by 255 before plotting.
    """
    latent_means, _, _ = vae.encoder.predict(data)
    first_dim = latent_means[:, 0] / 255
    second_dim = latent_means[:, 1] / 255
    ax = sns.scatterplot(x=first_dim, y=second_dim, hue=labels)
    ax.set(xlabel='Dimension 1', ylabel='Dimension 2')
    ax.set_title("Projection of 2D Latent Space (X-Ray Scans)")
    # Place the legend outside the axes, anchored at the top-right corner.
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

# `class_label_list` was never defined, so the original call raised NameError.
# Images and labels are materialised in ONE pass here: the dataset is shuffled
# by default, so labels gathered in a separate pass would not be guaranteed to
# line up with the images that get encoded.
cluster_images = []
class_label_list = []
for batch_images, batch_labels in train_ds.take(25):
    # Rescale to [0, 1], the range the VAE was trained on.
    cluster_images.append(batch_images.numpy() / 255.0)
    class_label_list.extend(train_ds.class_names[k] for k in batch_labels.numpy())

plot_label_clusters(vae, np.concatenate(cluster_images), class_label_list)

In [None]:
###################### ENCODER + MLP ##################


# Labelled training split for the encoder + MLP experiment.
train_ds = tf.keras.utils.image_dataset_from_directory(
  '/Users/Jag/Documents/train_/',
  image_size= (128,128),
  validation_split=0.2,
  subset="training",
  color_mode='rgb',
  batch_size= 32,
  seed=123)


# `compile` must be the boolean False. The original passed the string 'False',
# which is truthy, so the models were loaded with compile enabled.
# NOTE(review): the encoder contains the custom Sampling layer; if loading
# fails to resolve it, pass custom_objects={'Sampling': Sampling} - confirm
# against the saved format.
encoder = tf.keras.models.load_model('encoder_6272', compile=False)
decoder = tf.keras.models.load_model('decoder_6272', compile=False)

In [None]:
# NOTE(review): this repeats the Sampling class already defined in the VAE
# section of this notebook; the later definition shadows the earlier one.
class Sampling(layers.Layer):
    """Sample z = z_mean + exp(0.5 * z_log_var) * epsilon from (z_mean, z_log_var)."""

    def call(self, inputs):
        mean, log_var = inputs
        # One random-normal draw per (batch row, latent dimension).
        noise_shape = (tf.shape(mean)[0], tf.shape(mean)[1])
        noise = tf.keras.backend.random_normal(shape=noise_shape)
        std_dev = tf.exp(0.5 * log_var)
        return mean + std_dev * noise

In [None]:
train_label_list = []
test_label_list = []

# Take every label of every batch. The original indexed positions 0..31 and
# relied on a bare `except: pass` to absorb a shorter final batch, which also
# hid any unrelated error raised inside the loop.
# NOTE(review): image_dataset_from_directory shuffles by default, so a separate
# later pass over train_ds (e.g. a predict call) is not guaranteed to visit
# samples in this order - confirm labels and features are paired correctly.
for images, labels in train_ds:
    train_label_list.extend(labels.numpy())

for images, labels in test_ds:
    test_label_list.extend(labels.numpy())
        

In [None]:
# Encode the training images and collect their labels in the SAME pass.
# train_ds is shuffled on every iteration, so labels gathered in an earlier,
# separate loop do not line up with the rows produced by encoder.predict(train_ds).
# train_label_list is therefore rebuilt here, row-aligned with z.
# Images are rescaled to [0, 1] because the encoder was trained on rescaled input.
_mean_parts, _log_var_parts, _z_parts, _label_parts = [], [], [], []
for images, labels in train_ds:
    batch_mean, batch_log_var, batch_z = encoder.predict_on_batch(images / 255.0)
    _mean_parts.append(batch_mean)
    _log_var_parts.append(batch_log_var)
    _z_parts.append(batch_z)
    _label_parts.append(labels.numpy())

z_mean = np.concatenate(_mean_parts)
z_log_var = np.concatenate(_log_var_parts)
z = np.concatenate(_z_parts)
train_label_list = list(np.concatenate(_label_parts))

In [None]:
# Encode the test images and collect their labels in the same pass, so that
# test_label_list is row-aligned with v whatever the iteration order is.
# Images are rescaled to [0, 1] because the encoder was trained on rescaled input.
_mean_parts, _log_var_parts, _v_parts, _label_parts = [], [], [], []
for images, labels in test_ds:
    batch_mean, batch_log_var, batch_v = encoder.predict_on_batch(images / 255.0)
    _mean_parts.append(batch_mean)
    _log_var_parts.append(batch_log_var)
    _v_parts.append(batch_v)
    _label_parts.append(labels.numpy())

v_mean = np.concatenate(_mean_parts)
v_log_var = np.concatenate(_log_var_parts)
v = np.concatenate(_v_parts)
test_label_list = list(np.concatenate(_label_parts))

In [None]:
num_classes = len(train_ds.class_names)
# MLP head trained on the encoder features. The output width now follows
# num_classes instead of the hard-coded 15, so it cannot drift from the
# number of class folders.
enc_model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(32),  # NOTE(review): no activation here - confirm this is intended
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(num_classes, activation='softmax'),
])

In [None]:
# Softmax outputs with integer labels, hence the sparse categorical loss.
enc_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
import numpy as np
# Features are the sampled latent vectors; targets are the integer labels.
X, y = np.array(z), np.array(train_label_list)
X_test, y_test = np.array(v), np.array(test_label_list)

In [None]:
# Train the MLP head on the encoded features for 10 epochs.
enc_history = enc_model.fit(X,y, epochs = 10)

In [None]:
# Print the layer-by-layer summary of the MLP head.
enc_model.summary()

In [None]:
acc = enc_history.history['accuracy']
loss = enc_history.history['loss']
# Take the epoch count from the recorded history instead of repeating the
# literal 10 used in enc_model.fit, so the x-axis always matches the curves.
epochs = len(acc)
epochs_range = range(epochs)

plt.figure(figsize=(8, 8))
# Left panel: training accuracy per epoch.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.legend(loc='lower right')
plt.title('Training Accuracy')

# Right panel: training loss per epoch.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.legend(loc='upper right')
plt.title('Training Loss')
plt.show()

In [None]:
# Persist the trained MLP head under the path "enc_model".
enc_model.save("enc_model")

In [None]:
######################## MODEL COMPARISON #####################

In [None]:
### Evaluation
print("Evaluateion on test data for CNN model ")
# test_ds is an already-batched tf.data dataset; Keras documents that
# batch_size must not be given for dataset inputs, so it is omitted here.
results = model.evaluate(test_ds)
print("test loss, test acc:", results)

print("Evaluate on test data for encoder + MLP")
# X_test / y_test are plain arrays, so an explicit batch size is valid here.
results = enc_model.evaluate(X_test, y_test, batch_size=32)
print("test loss, test acc:", results)

In [None]:
################ ALTERNATE METHOD: RBM ################
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from sklearn.neural_network import BernoulliRBM

In [None]:
# Settings shared by both splits: 128x128 single-channel images, batches of
# 32, and the same split fraction and seed.
_grayscale_split_kwargs = dict(
    image_size=(128, 128),
    validation_split=0.2,
    color_mode='grayscale',
    batch_size=32,
    seed=123,
)

train_ds = tf.keras.utils.image_dataset_from_directory(
    '/Users/Jag/Documents/train_/', subset="training", **_grayscale_split_kwargs)

test_ds = tf.keras.utils.image_dataset_from_directory(
    '/Users/Jag/Documents/train_/', subset="validation", **_grayscale_split_kwargs)

In [None]:
train_image_list = []
test_image_list = []
train_label_list = []
test_label_list = []

# At most 100 batches per split. Images and labels are taken from the same
# batch, so the two lists stay row-aligned. Every element of each batch is
# used; the original indexed positions 0..31 and relied on a bare
# `except: pass` to absorb a shorter final batch, which also hid any
# unrelated error raised inside the loop.
for images, labels in train_ds.take(100):
    train_image_list.extend(images.numpy())
    train_label_list.extend(labels.numpy())

for images, labels in test_ds.take(100):
    test_image_list.extend(images.numpy())
    test_label_list.extend(labels.numpy())
        

In [None]:
# Stack the per-sample lists into arrays for scikit-learn.
X, y = np.array(train_image_list), np.array(train_label_list)
X_test, y_test = np.array(test_image_list), np.array(test_label_list)

In [None]:
# Display the shape of the training label array.
np.shape(y)

In [None]:
# Write the training images and labels to .npy files in the working directory.
for npy_name, array in (('train_data.npy', X), ('train_labels.npy', y)):
    np.save(npy_name, array)

In [None]:
# Reload the arrays written by the np.save cell. The original read
# 'encoded_data.npy', which no visible cell writes; the cells that follow
# reshape X to 128*128 pixels per row and pair it with these labels, which
# matches the raw images stored in 'train_data.npy'.
X = np.load('train_data.npy')
y = np.load('train_labels.npy')

In [None]:
from sklearn.neural_network import BernoulliRBM

In [None]:
# Flatten every image to one row of 128*128*1 values and divide by 255.
# NOTE: X_test is overwritten in place, so re-running this cell divides it by
# 255 a second time.
n_pixels = 128*128*1
X_train = X.reshape(-1, n_pixels) / 255
X_test = X_test.reshape(-1, n_pixels) / 255

In [None]:
# Display the shape of the flattened training matrix.
np.shape(X_train)

In [None]:
# Binarise the pixels: 1 where the scaled value exceeds 0.2, otherwise 0.
X_train = (X_train > 0.2).astype(int)
X_test = (X_test > 0.2).astype(int)

In [None]:
import matplotlib.pyplot as plt
# Show the first five binarised training images side by side.
plt.figure(figsize=(8, 4))

for i in range(5):
    sample_image = X_train[i].reshape(128, 128, 1)
    plt.subplot(2, 5, i + 1)
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(sample_image, cmap='Greys')
plt.tight_layout()

In [None]:
# Fit a stand-alone RBM with 100 hidden components on the binarised pixels.
# NOTE(review): `rbm` is rebound to a fresh estimator in the pipeline cell
# below, so this fitted instance is not the one used there.
rbm = BernoulliRBM(
    n_components=100,
    learning_rate=0.01,
    verbose=True,
    random_state=42,
)
rbm.fit(X_train)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

# Two-step pipeline: RBM feature extraction ("rbm") feeding an MLP
# classifier ("new").
rbm = BernoulliRBM(verbose=True, random_state=0)
neu = MLPClassifier(activation="relu", hidden_layer_sizes=(128, 32, 15))

pipeline_steps = [("rbm", rbm), ("new", neu)]
rbm_features_classifier = Pipeline(steps=pipeline_steps)

In [None]:
# RBM hyper-parameters, set on the estimator that is already inside the
# pipeline. NOTE(review): the original comment said these values came from a
# GridSearchCV run; no such search appears in this notebook - confirm.
# More components tend to give better prediction performance, but larger
# fitting time.
rbm.set_params(learning_rate=0.06, n_iter=10, n_components=100)

# Train the RBM + MLP pipeline on the binarised training pixels.
rbm_features_classifier.fit(X_train, y)


In [None]:
from sklearn import metrics

# Predictions on the TRAINING data, followed by a per-class report for them.
# NOTE(review): the printed heading says "Logistic regression", while the
# pipeline defined in this notebook ends in an MLPClassifier.
Y_train = rbm_features_classifier.predict(X_train)
train_report = metrics.classification_report(y, Y_train)
print("Logistic regression using RBM features:\n%s\n" % train_report)


In [None]:
# Predict labels for the test pixels. NOTE(review): no visible cell compares
# Y_pred with y_test - confirm whether a test report was intended.
Y_pred = rbm_features_classifier.predict(X_test)

In [None]:
# Draw every RBM component as a 128x128 image on a 10x10 grid.
for position, comp in enumerate(rbm.components_, start=1):
    component_image = comp.reshape((128, 128))
    plt.subplot(10, 10, position)
    plt.imshow(component_image, cmap='Accent', interpolation='nearest',
               vmin=-2.5, vmax=2.5)
    plt.axis('off')