# Transfer learning for COVID-19 Detection in X-Ray Images
## DD2424 Deep Learning - Group Project 

### Read Data

In [None]:
import cv2
import matplotlib.pyplot as plt
import glob
import os

In [None]:
import tensorflow
from tensorflow import keras
from keras import models, layers, optimizers
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.preprocessing.image import ImageDataGenerator

#### Filter the currently generated dataset against the paper dataset to keep only the correct data

In [None]:
def get_file_lines(filename):
    """Return the lines of `filename` with surrounding whitespace removed.

    Blank lines are kept and come back as empty strings.
    """
    with open(filename) as source:
        # Iterating the file object yields the same lines as readlines().
        return [raw_line.strip() for raw_line in source]

def filter_out_difference(correct_data_description, wrong_data_description):
    """Keep the lines of the generated split that belong to the reference split.

    Both arguments are paths of text files that are read with
    ``get_file_lines``. A reference line counts as present when it occurs as
    a substring of at least one generated line (the generated lines are longer
    than the reference ones, so plain equality cannot be used).

    Args:
        correct_data_description: Path of the reference split file.
        wrong_data_description: Path of the currently generated split file.

    Returns:
        The generated lines that contain at least one reference line, in the
        order they appear in the generated file, each listed once. ``None``
        is returned (after printing how many are missing) when some reference
        line is not found in the generated file.
    """
    # Blank reference lines are dropped: the empty string is a substring of
    # every line and would otherwise mark the whole generated file as correct.
    correct_file_lines = [
        line for line in get_file_lines(correct_data_description) if line
    ]
    wrong_file_lines = get_file_lines(wrong_data_description)

    # Need to check substrings as wrong_file_description has longer lines
    nr_missing_images = sum(
        1
        for correct_line in correct_file_lines
        if not any(correct_line in line for line in wrong_file_lines)
    )

    if nr_missing_images > 0:
        print('you are missing: ', nr_missing_images, 'images')
        return None
    print('you have all images in ', correct_data_description)

    # any() stops at the first hit, so a generated line that matches several
    # reference lines is kept once instead of once per match.
    return [
        wrong_line
        for wrong_line in wrong_file_lines
        if any(correct_line in wrong_line for correct_line in correct_file_lines)
    ]
    
def write_line_list_to_file(file, line_list):
    """Overwrite `file` with the entries of `line_list`, one entry per line.

    Each entry is %-formatted as a string and followed by a newline character.
    """
    with open(file, 'w') as out:
        for entry in line_list:
            out.write("%s\n" % entry)
    


In [None]:
# Split descriptions of the currently generated dataset
current_train_data_description = 'data/train_split_v3.txt'
current_test_data_description = 'data/test_split_v3.txt'

# Split descriptions of the paper dataset (the reference to match)
paper_train_data_description = 'data/paper_dataset_specifications/train_COVIDx2.txt'
paper_test_data_description = 'data/paper_dataset_specifications/test_COVIDx2.txt'

# Reduce the current splits to the lines the paper splits refer to
# (test split first, then train split).
upd_test_set_list = filter_out_difference(
    correct_data_description=paper_test_data_description,
    wrong_data_description=current_test_data_description,
)
upd_train_set_list = filter_out_difference(
    correct_data_description=paper_train_data_description,
    wrong_data_description=current_train_data_description,
)


In [None]:
# Writing the updated and correct dataset descriptions to txt files.
# filter_out_difference returns None when images are missing; in that case
# the split is skipped so that the output file is not truncated and the write
# does not fail on the None value.
if upd_test_set_list is not None:
    write_line_list_to_file('data/correct_test_split.txt', upd_test_set_list)
else:
    print('skipping data/correct_test_split.txt: no filtered test split available')

if upd_train_set_list is not None:
    write_line_list_to_file('data/correct_train_split.txt', upd_train_set_list)
else:
    print('skipping data/correct_train_split.txt: no filtered train split available')

### Reading only correct images from generated train and test data

In [None]:
def read_correct_images(img_dir, img_descriptions):
    """Print how many image files in `img_dir` are listed in `img_descriptions`.

    Args:
        img_dir: Directory whose direct children are searched for image files.
        img_descriptions: Path of a text file read with ``get_file_lines``;
            an image counts as correct when its file name occurs as a
            substring of at least one line of that file.

    Returns:
        None. The count is only printed.
    """
    # '*g' picks up every file name ending in "g", which covers the
    # jpg, png and jpeg images.
    data_path = os.path.join(img_dir, '*g')
    images = glob.glob(data_path)

    # Read the description file once instead of once per image; it is only
    # needed when there is at least one image to check.
    description_lines = get_file_lines(img_descriptions) if images else []

    nr_correct_imgs = 0
    for image_path in images:
        # basename() does not depend on the path separator or on a trailing
        # slash in img_dir, unlike stripping the "img_dir/" prefix by hand.
        image_name = os.path.basename(image_path)

        if any(image_name in line for line in description_lines):
            nr_correct_imgs += 1

    print('Nr correct images in dataset: ', nr_correct_imgs)

# Image folders of the dataset
train_data = 'data/dataset/train'
test_data = 'data/dataset/test'

# Split descriptions the images are checked against
train_data_description = 'data/correct_train_split.txt'
test_data_description = 'data/correct_test_split.txt'

# Report the number of correct images, test split first, then train split
read_correct_images(img_dir=test_data, img_descriptions=test_data_description)
read_correct_images(img_dir=train_data, img_descriptions=train_data_description)

### Demo loading data

In [None]:
# Testing to read an image
path = 'data/dataset/playground_dataset/0a6a5956-58cf-4f17-9e39-7e0d17310f67.png'
# IMREAD_GRAYSCALE is the named form of flag 0: load as a single-channel image.
img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)

# cv2.imread does not raise for a missing or unreadable file, it returns None;
# fail here with a clear message instead of inside plt.imshow.
if img is None:
    raise FileNotFoundError('could not read image: ' + path)

plt.imshow(img, cmap='gray')
plt.show()

### Proof of concept with simple ML model

### Pretrained VGG-16

#### Importing images from folders 

In [None]:
""" Image import: This was mainly to see if I could get the model working, ImageDataGenerator might not be the best choice.
However, if we want to do data augmentation later, these function are 
built-in here. To run the script, you need 3 folder: dataset/data/train, dataset/data/val, dataset/data/test. The first two should have 
three subfolders each: class1, class2, class3 with corresponding pictures"""

# folders
train_dir = 'data/dataset/johanna_testdata/train'
val_dir = 'data/dataset/johanna_testdata/val'
test_dir = 'data/dataset/johanna_testdata/test'

# constants
img_size = 150
batch_size = 16
epochs = 6

# The ImageNet weights of VGG16 expect inputs that went through the matching
# preprocess_input, so it is applied to every loaded image.
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
train_generator = datagen.flow_from_directory(
        train_dir,
        target_size=(img_size, img_size),
        batch_size=batch_size,
        class_mode='categorical')

val_generator = datagen.flow_from_directory(
        val_dir,
        target_size=(img_size, img_size),
        batch_size=batch_size,
        class_mode='categorical')

# shuffle must be the boolean False: the string 'false' is truthy and would
# leave shuffling switched on, so predictions could not be matched to files.
test_generator = datagen.flow_from_directory(
        test_dir,
        target_size=(img_size, img_size),
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=False)

# Take the sample counts from the generators: the images live in class
# subfolders, so counting the files directly inside train_dir/val_dir would
# give 0 and make steps_per_epoch 0 as well.
nb_train_samples = train_generator.samples
nb_val_samples = val_generator.samples
print("training samples:", nb_train_samples)
print("validation samples:", nb_val_samples)

#### Training and testing model with frozen weights

In [None]:
print("Freezing weights and updating last layers")

# Create the pretrained convolutional base once and freeze every layer except
# the last four (the last conv block).
vgg_conv = VGG16(weights='imagenet', include_top=False, input_shape=(img_size, img_size, 3))
for layer in vgg_conv.layers[:-4]:
    layer.trainable = False

# add VGG network and extra layers to model
model = models.Sequential()
model.add(vgg_conv)
model.add(layers.Flatten())
model.add(layers.Dense(1024, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(3, activation='softmax'))

# summarize and compile
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fine-tune model
# NOTE(review): fit_generator/predict_generator are deprecated in newer Keras
# releases in favour of fit/predict - confirm the installed version before
# switching.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=nb_train_samples // batch_size,
    epochs=epochs,
    validation_data=val_generator,
    validation_steps=nb_val_samples // batch_size)

# Save model and plot accuracy
#model.save('transfer_model.h5')
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
eps = range(len(acc))
# Open the figure before plotting; creating it afterwards only adds an empty
# second figure to the output.
plt.figure()
plt.plot(eps, acc, 'b', label='Training acc')
plt.plot(eps, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.show()

# Class probabilities for the test images
pred = model.predict_generator(test_generator)
print(pred)

#### Training and testing model with no frozen weights

In [None]:
# create vgg (all layers stay trainable in this variant)
vgg_conv_2 = VGG16(weights='imagenet', include_top=False, input_shape=(img_size, img_size, 3))

# add vgg network and extra layers to model
model_2 = models.Sequential()
model_2.add(vgg_conv_2)
model_2.add(layers.Flatten())
model_2.add(layers.Dense(1024, activation='relu'))
model_2.add(layers.Dropout(0.5))
model_2.add(layers.Dense(3, activation='softmax'))

# summarize and compile
model_2.summary()
model_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the whole network
# NOTE(review): fit_generator/predict_generator are deprecated in newer Keras
# releases in favour of fit/predict - confirm the installed version before
# switching.
history_2 = model_2.fit_generator(
    train_generator,
    steps_per_epoch=nb_train_samples // batch_size,
    epochs=epochs,
    validation_data=val_generator,
    validation_steps=nb_val_samples // batch_size)

# Save the model
#model_2.save('non_transfer_model.h5')
acc = history_2.history['accuracy']
val_acc = history_2.history['val_accuracy']
eps = range(len(acc))
# Open the figure before plotting; creating it afterwards only adds an empty
# second figure to the output.
plt.figure()
plt.plot(eps, acc, 'b', label='Training acc')
plt.plot(eps, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.show()

# Predict with the model trained in this cell (model_2) and print its own
# predictions, not those of the frozen-weights model.
pred_2 = model_2.predict_generator(test_generator)
print(pred_2)

### CNN Model from Scratch