# Transfer learning for COVID-19 Detection in X-Ray Images
## DD2424 Deep Learning - Group Project 

### Read Data

In [1]:
import matplotlib.pyplot as plt
import glob
import os
import cv2
import numpy as np

#### Filter difference between current generated dataset and paper dataset to get the correct data

In [3]:
def get_file_lines(filename):
    with open(filename) as f:
        content = f.readlines()
        
    file_lines = [x.strip() for x in content]
    return file_lines

def filter_out_difference(correct_data_description, wrong_data_description):
    correct_file_lines = get_file_lines(correct_data_description)
    wrong_file_lines = get_file_lines(wrong_data_description)
    
    nr_missing_images = 0
    
    # Need to check substrings as wrong_file_description has longer lines
    for correct_line in correct_file_lines:
        if not any(correct_line in line for line in wrong_file_lines):
            nr_missing_images += 1
                
    if nr_missing_images > 0:
        print('you are missing: ', nr_missing_images, 'images')
        return
    print('you have all images in ', correct_data_description)
    
    new_correct_upd_lines = []
    
    for wrong_line in wrong_file_lines:
        for correct_line in correct_file_lines:
            if correct_line in wrong_line:
                new_correct_upd_lines.append(wrong_line)
                   
    return new_correct_upd_lines
    
def write_line_list_to_file(file, line_list):
    with open(file, 'w') as filehandle:
        filehandle.writelines("%s\n" % line for line in line_list)
    


In [None]:
paper_test_data_description = 'data/paper_dataset_specifications/test_COVIDx2.txt'
paper_train_data_description = 'data/paper_dataset_specifications/train_COVIDx2.txt'

current_test_data_description = 'data/test_split_v3.txt'
current_train_data_description = 'data/train_split_v3.txt'

upd_test_set_list = filter_out_difference(paper_test_data_description, current_test_data_description)
upd_train_set_list = filter_out_difference(paper_train_data_description, current_train_data_description)


In [None]:
# writing updated and correct dataset description to txt file
# write_line_list_to_file('data/correct_test_split.txt',upd_test_set_list)
# write_line_list_to_file('data/correct_train_split.txt',upd_train_set_list)

In [5]:
# Moving images to class partitioned directory
mapping = {
            'normal': 0,
            'pneumonia': 1,
            'COVID-19': 2}

def get_label(img_desc_list):
        for class_name in mapping:
            if class_name in img_desc_list:
                return mapping[class_name]

current_dir = 'data/dataset/test/test_data'
new_dir = 'data/dataset/test/class'
img_descriptions = get_file_lines('data/correct_test_split.txt')


data_path = os.path.join(current_dir, '*g')
images = glob.glob(data_path)

for image_path in images:
            image_name = image_path.replace(current_dir + '/', '')
            for img_desc in img_descriptions:
                if image_name in img_desc:
                    img_desc_list = img_desc.split()
                    label = get_label(img_desc_list)
                    os.rename(image_path, new_dir +str(label) + '/' + image_name)

### Class for handling data

In [None]:
# Much of this class in unnecessary now adays
class Dataset:
    def __init__(
            self,
            test_img_dir,
            train_img_dir,
            test_img_descriptions_file,
            train_img_descriptions_file,
            input_shape = (224, 224),
            batch_size = 10
                 ):
        self.test_img_dir = test_img_dir
        self.train_img_dir = train_img_dir
        self.test_img_descriptions = get_file_lines(test_img_descriptions_file)
        self.train_img_descriptions = get_file_lines(train_img_descriptions_file)
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.batch_nr = 1
        self.max_batch = len(self.train_img_descriptions) // self.batch_size
        self.mapping = {
                'normal': 0,
                'pneumonia': 1,
                'COVID-19': 2}

        self.y_train = None
        self.y_test = None
        self.x_batch = None
        self.y_batch = None

    
    def get_current_batch(self):
        return self.x_batch, self.y_batch
    
    def _get_class_dist(self, y):
        class_dist = {}
        for class_name in self.mapping:
            class_dist[class_name] = np.count_nonzero(y == self.mapping[class_name])
        return class_dist

    def get_test_class_dist(self):
        test_class_dist = self._get_class_dist(self.y_test)

        return test_class_dist

    def read_test_data(self):
        self.x_test, self.y_test = self._read_correct_images(self.test_img_dir, self.test_img_descriptions)

    def _get_label(self, img_desc_list):
        for class_name in self.mapping:
            if class_name in img_desc_list:
                return self.mapping[class_name]
            
    # Must load training data in batches since memory error otherwise    
    def next_train_batch(self):
        
        if self.batch_nr == self.max_batch:
            print('No data left')
            return [], []
        
        start_img = (self.batch_nr-1) * self.batch_size
        end_img = (self.batch_nr) * self.batch_size        
                
        batch_descriptions = self.train_img_descriptions[start_img: end_img]
        
        self.x_batch, self.y_batch = self._read_correct_images(self.train_img_dir, batch_descriptions)
        
        self.batch_nr += 1
        

    def _read_correct_images(self, img_dir, img_descriptions):
        data_path = os.path.join(img_dir, '*g')
        images = glob.glob(data_path)
        
        x_list = []
        y_list = []
        
        for image_path in images:
            image_name = image_path.replace(img_dir + '/', '')
            for img_desc in img_descriptions:
                if image_name in img_desc:
                    img_desc_list = img_desc.split()
                    label = self._get_label(img_desc_list)
                    y_list.append(label)

                    img_array = cv2.imread(image_path)
                    resized_img_array = cv2.resize(img_array, self.input_shape)
                    x_list.append(resized_img_array)
                    
        y_array = np.array(y_list)
        x_array = np.array(x_list)        

        return x_array, y_array

### Loading data

In [None]:
test_data = 'data/dataset/test'
test_data_description = 'data/correct_test_split.txt'

train_data = 'data/dataset/train'
train_data_description = 'data/correct_train_split.txt'

dataset = Dataset(
        test_data,
        train_data,
        test_data_description,
        train_data_description
       )

# Read test data
dataset.read_test_data()

print(dataset.get_test_class_dist())

In [None]:
x_train.shape

### Proof of concept with simple ML model

### Pretrained VGG-16

#### Training and testing model with freezed weights

##### Importing packages

In [7]:
import os
import tensorflow
from tensorflow import keras
from keras import models, layers, optimizers
from keras import callbacks
from keras.callbacks import ReduceLROnPlateau
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.preprocessing.image import ImageDataGenerator

##### Image directories, constants, use of generator and use of validation data

In [8]:
"""Image import/process can be done using ImageDataGenerator, which enables easy data augmentation if we wish.
If more suitable, put use_generator=false and specify all x and y""" 

# constants as in report
img_size = 224 # double check in report
learning_rate=2e-5
epochs = 3 # in report 22
batch_size = 8
factor=0.7 # used in callback
patience = 5 # used in callback
optimizer='Adam'

# adjust for number of samples
nb_train_samples = 20
nb_val_samples = 12

# specify use of validation and/or generator feature
use_generator = True
use_validation = True 
use_callbacks = False

if use_generator:
    train_dir = 'data/dataset/johanna_testdata/train'
    val_dir = 'data/dataset/johanna_testdata/val'
    test_dir = 'data/dataset/johanna_testdata/test'
    
if not use_generator: 
    x_train, y_train = None, None
    x_test, y_test = None, None
    val_data = None 
    
if use_callbacks: 
    callbacks = [ReduceLROnPlateau(
    monitor='loss', 
    factor=factor, 
    patience=patience, 
    verbose=0)]

if not use_callbacks:
    callbacks = None


##### Creating model and freezing weights

In [9]:
#create model and freeze all but last conv block
vgg_conv = VGG16(weights='imagenet', include_top=False, input_shape=(img_size, img_size, 3))
for layer in vgg_conv.layers[:-4]:
    layer.trainable = False

# model = "the shell", adding VGG network and extra layers to model
model = models.Sequential() 
model.add(vgg_conv)
model.add(layers.Flatten()) # in the report, this results in output shape 100,352 instead of 25,088
model.add(layers.Dense(1024))
model.add(layers.Dense(1024))
model.add(layers.Dense(3, activation='softmax'))

# summerize and compile
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 7, 7, 512)         14714688  
_________________________________________________________________
flatten_4 (Flatten)          (None, 25088)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 1024)              25691136  
_________________________________________________________________
dense_10 (Dense)             (None, 1024)              1049600   
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 3075      
Total params: 41,458,499
Trainable params: 33,823,235
Non-trainable params: 7,635,264
_________________________________________________________________


##### Run if ImageDataGenerator() is used to process images:

In [11]:
# Rescale=1./255 --> normalizing pixel values to be in range 0-1 (max pixel value is 255). Our pictures are
# grayscale (1 dimension), since VGG requires RGB (3 dimensions), the generators will copy our 1D data to every
# dimension => (PixelValue, PixelValue, PixelValue). This is Okay.
if use_generator: 
    datagen = ImageDataGenerator(Rescale=1./255)
    train_generator = datagen.flow_from_directory(
            train_dir,
            target_size=(img_size, img_size),
            batch_size=batch_size,
            class_mode='categorical')

    test_generator = datagen.flow_from_directory(
            test_dir,
            target_size=(img_size, img_size),
            batch_size=batch_size,
            class_mode='categorical')

    if use_validation:
        val_generator = datagen.flow_from_directory(
            val_dir,
            target_size=(img_size, img_size),
            batch_size=batch_size,
            class_mode='categorical')
    else:
        val_generator = None
        
    print("Generators generated :-)")
    
    # fine-tune model
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=nb_train_samples // batch_size,
        epochs=epochs,
        validation_data=val_generator,
        validation_steps=nb_val_samples // batch_size, 
        verbose=0, 
        callbacks = callbacks)
    
    print("Model fine-tuned")
    
    # evaluate model
    result = model.evaluate_generator(test_generator)
    
    print("Model evaluated")

Found 20 images belonging to 3 classes.
Found 9 images belonging to 3 classes.
Found 12 images belonging to 3 classes.
Generators generated :-)
Model fine-tuned
Model evaluated


##### Run if ImageDataGenerator is NOT used to process images: 

In [None]:
if not use_generator:
    model.fit(
        x=x_train,
        y=y_train, 
        batch_size=batch_size, 
        epochs=epochs,
        validation_data=val_data,
        validation_steps= nb_val_samples // batch_size,
        verbose=0
        callbacks = [callback])
    
    result = model.evaluate(
        x=x_test, 
        y=y_test, 
        batch_size=batch_size, 
        verbose=0)
else: 
    continue

##### Print accuracy for model with frozen weights

In [12]:
print("Results:")
print(model.metrics_names[1], result[1])
#print(model.metrics_names[2], result[2])
#prediction = model.predict_generator(test_generator)
#print(prediction)
#model.save('transfer_model.h5')

Results:
accuracy 0.3333333432674408


#### Training and testing model with no frozen weights

### CNN Model from Scratch