In [None]:
import math
import numpy as np
import pandas as pd
from keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras import Model
from keras.layers import GlobalAveragePooling2D, Dense, LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam,SGD
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow_addons.metrics import F1Score
from tensorflow.keras.applications.resnet import preprocess_input


In [None]:
# Load the NIH metadata table, then keep only the images listed in
# ../selected_png_list.txt (9600 entries according to the original note).
data = pd.read_csv("../Data_Entry_2017.csv")
filenames = []    # bare image file names, compared against the "Image Index" column
directories = []  # "images/" + the listed path from the "images_0" marker onward
with open("../selected_png_list.txt", "r") as fr:
    for line in fr:
        # Strip the line terminator up front. Slicing off the last character
        # instead would truncate the final entry of a file that does not end
        # with a newline, and would leave "\n" inside every directory entry.
        path = line.strip()
        if not path:
            continue  # ignore blank lines
        start = path.find("images_0")
        # find() returns -1 when the marker is missing; fall back to the whole
        # path rather than slicing from the last character.
        directories.append("images/" + path[max(start, 0):])
        # Everything after the last "/" is the file name (the whole string if
        # the entry has no "/").
        filenames.append(path.rsplit("/", 1)[-1])
data = data[data["Image Index"].isin(filenames)]

In [None]:
# Treat every finding attached to an image as its own classification.
# "Finding Labels" holds one or more labels separated by "|"; each multi-label
# row is replaced by one row per label, all other columns copied unchanged.
#
# The split is vectorised on purpose: re-using a single row object inside a
# loop makes every appended copy share the same (last) label.
label_lists = data["Finding Labels"].str.split("|")
is_multi = label_lists.str.len() > 1

single_rows = data[~is_multi]
split_rows = (
    data[is_multi]
    .assign(**{"Finding Labels": label_lists[is_multi]})
    .explode("Finding Labels")  # one output row per element of each label list
)
print(f"{int(is_multi.sum())} multi-label rows expanded into {len(split_rows)} single-label rows")

# Single-label rows first, expanded rows after them; the combined "|" rows are
# simply never carried over.
data = pd.concat([single_rows, split_rows], ignore_index=True)

# Keep the default index column so the layout of full_data.csv is unchanged
# for anything that reads it back by column position.
data.to_csv("full_data.csv")

In [None]:
# Distinct finding labels present in the data, as a NumPy array, and their count.
classifications = np.unique(data["Finding Labels"])
print(classifications.size)

In [None]:
# Per-class 80/20 train/test split, intended to be shared with the other methods.
#
# The split is made on ROWS (one row per image/label pair), not on file names.
# An image with several findings has several rows; selecting by file name would
# pull all of them into both sets as soon as two of its labels were assigned to
# different sides.
train_full_set = []    # image file names of the training rows
test_full_set = []     # image file names of the test rows
train_row_labels = []  # index labels of the training rows in `data`
test_row_labels = []   # index labels of the test rows in `data`

# Fixed seed so the split is identical on every run.
split_rng = np.random.default_rng(42)

data = pd.read_csv("full_data.csv")
for classification in classifications:  # iterate over each disease classification
    class_filter = data[data["Finding Labels"] == classification]
    print(classification, len(class_filter))

    # Reduce class imbalance: use at most the first 400 rows of a class.
    indices_len = min(len(class_filter), 400)
    train_set_size = math.floor(0.8 * indices_len)

    # Random order over the retained rows; first 80% train, remainder test.
    permutation = split_rng.permutation(indices_len)
    train_rows = class_filter.index[permutation[:train_set_size]]
    test_rows = class_filter.index[permutation[train_set_size:]]

    train_row_labels.extend(train_rows)
    test_row_labels.extend(test_rows)
    train_full_set.extend(data.loc[train_rows, "Image Index"])
    test_full_set.extend(data.loc[test_rows, "Image Index"])

# Select exactly the chosen rows (sorted to keep the file's row order). Each row
# carries one label and belongs to one class, so the two frames cannot overlap.
train_data = data.loc[sorted(train_row_labels)]
test_data = data.loc[sorted(test_row_labels)]

In [None]:
# Train and evaluate one binary ResNet50 classifier per disease
# (disease vs. "No Finding").
#
# Results, one entry per disease in the order of `classifications_observed`:
#   histories         - History object returned by model.fit
#   model_predictions - model.predict output on the test generator
#   evaluations       - model.evaluate output (loss followed by the compiled metrics)

# One image size for both the network input and the generators, so the model is
# fed the shape it was built for.
IMAGE_SIZE = (224, 224)


def _binary_generator(frame, classification, shuffle):
    """Build a 'No Finding' (0) vs. `classification` (1) image generator.

    frame          -- rows to serve; needs "Image Index" and "Finding Labels" columns
    classification -- disease label used as the positive class
    shuffle        -- whether the generator shuffles its rows
    """
    # preprocess_input already performs the normalisation for this network, and
    # ImageDataGenerator applies `rescale` on top of `preprocessing_function`,
    # so no rescale factor is passed here.
    datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
    return datagen.flow_from_dataframe(
        frame,
        x_col="Image Index",
        y_col="Finding Labels",
        directory="images",
        target_size=IMAGE_SIZE,
        batch_size=16,
        class_mode="binary",
        # Explicit order: without it the indices follow the alphabetical order
        # of the labels, which makes "No Finding" the positive class for some
        # diseases and the negative class for others.
        classes=["No Finding", classification],
        shuffle=shuffle,
    )


classifications_observed = classifications[classifications != "No Finding"]
model_predictions = []
histories = []
evaluations = []
for classification in classifications_observed:  # iterate over all disease classifications
    # Assemble the model: ResNet50 backbone without its top, then global average
    # pooling, LeakyReLU and a single sigmoid unit. (`classes` is omitted because
    # it only applies when include_top=True.)
    base_model = ResNet50(include_top=False, input_shape=IMAGE_SIZE + (3,))
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = LeakyReLU(alpha=0.4)(x)
    predictions = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=base_model.input, outputs=predictions)

    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss="binary_crossentropy",
        metrics=[
            "accuracy",
            Precision(),
            Recall(),
            # threshold=0.5 binarises the sigmoid output; without a threshold the
            # metric takes the arg-max, which for a single column is always "positive".
            F1Score(num_classes=1, average="weighted", threshold=0.5),
            AUC(curve="ROC"),
        ],
    )

    print(f"Training ResNet50 model on {classification}")
    possible_classes = [classification, "No Finding"]

    # Training data: rows of this disease plus the "No Finding" rows.
    train_data_subset = train_data[train_data["Finding Labels"].isin(possible_classes)]
    train_generator = _binary_generator(train_data_subset, classification, shuffle=True)

    history = model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        epochs=10,
        verbose=1,
    )

    print(f"Testing ResNet50 model on {classification}")
    test_data_subset = test_data[test_data["Finding Labels"].isin(possible_classes)]
    # No shuffling at test time, so predictions stay in the same order as
    # test_generator.classes / test_generator.filenames.
    test_generator = _binary_generator(test_data_subset, classification, shuffle=False)

    histories.append(history)
    model_predictions.append(model.predict(test_generator))
    evaluations.append(model.evaluate(test_generator))  # keep the test metrics
        