# Proyecto 2 - Redes Neuronales
Curso: Introducción al Reconocimiento de Patrones

Estudiantes:
- Juan Ignacio Navarro Navarro
- Jose David Sánchez

In [9]:
"""
Used libraries for the entire project
"""
import torch
import os
import optuna
import time
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Subset, random_split, ConcatDataset, DataLoader
from PIL import Image
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve

## Carga del Set de Datos de Rayos X en Pulmones


Los sets de datos utilizados son:
- https://www.kaggle.com/datasets/preetviradiya/covid19-radiography-dataset (original propuesto por el profesor, guardado en images/original_dataset)
- https://www.kaggle.com/datasets/gibi13/pneumonia-covid19-image-dataset (adicional para disminuir sesgo, guardado en images/additional_dataset)

### Feature engineering 

In [None]:
"""
-- Load images from additional dataset to decrease bias --

Description: This cell moves the images needed to decrease the
models bias from the additional dataset to the analysis dataset.
The analysis dataset contains the needed images from both original
and additional dataset.

Since the images from both dataset had different sizes this cell
also defines a standard size.
"""

# Define the input folder with the original images
input_folder = "images/additional_dataset/"

# Define the output folder to save the resized
output_folder = "images/analysis_dataset/"

# Define the target size for the resized images
target_size = (299, 299)

categories = ['COVID', 'Viral_Pneumonia']

for category in categories:
    # Get the list of files in the input folder
    file_list = os.listdir(os.path.join(input_folder, category))

    # Iterate over each file in the input folder
    for file_name in file_list:
        input_path = os.path.join(input_folder, category, file_name)
        
        image = Image.open(input_path)
        resized_image = image.resize(target_size)
        output_path = os.path.join(output_folder, category, file_name)
        
        # Save the resized image to the output folder
        resized_image.save(output_path)
        image.close()


In [2]:
"""
-- Delete images from cropped dataset --

This cell deletes the files in the cropped dataset if any. The
cropped dataset a cropped copy from the additional dataset images.
The images are cropped to show only the most important part of 
the X-rays, which is the lungs
"""

cropped_images_folder = 'images/cropped_dataset/'
categories = {'COVID': 0, 'Lung_Opacity': 1, 'Normal': 2, 'Viral_Pneumonia': 3}

for category in categories.keys():
    folder_path = os.path.join(cropped_images_folder, category)

    file_list = os.listdir(folder_path)

    for file_name in file_list:
        file_path = os.path.join(cropped_images_folder, category, file_name)
        os.remove(file_path)


In [3]:
"""
-- Crop images borders and ignore extra images on bias --

This cell crops the images borders on analysis dataset an saves
them into the cropped dataset folder. It only saves the images 
needed to train the model without bias.
"""

# define a different folder to save the cropped images
original_images_folder = 'images/analysis_dataset/'
cropped_images_folder = 'images/cropped_dataset/'

# define the new size
target_size = (250, 250)

categories = {'COVID': 0, 'Lung_Opacity': 1, 'Normal': 2, 'Viral_Pneumonia': 3}
category_amount = []

# get the amount of images for each category
for category in categories.keys():
    folder_path = os.path.join(original_images_folder, category)
    image_files = os.listdir(folder_path)
    category_amount.append(len(image_files))

print("Amount of images in analysis dataset:")
print(f"\tCOVID\t\t\t : {category_amount[0]}\n \
        Lung_Opacity\t\t : {category_amount[1]}\n \
        Normal\t\t\t : {category_amount[2]}\n \
        Viral_Pneumonia\t : {category_amount[3]}\n")

max_training = min(category_amount)
print("Maximum amount of images to use in the training: ", min(category_amount))

# crop and save the cropped images
for category in categories.keys():

    cat_files = os.listdir(os.path.join(original_images_folder, category))

    for i, file in enumerate(cat_files):

        if i == max_training: break
        # constructing image path
        input_path = os.path.join(original_images_folder, category, file)
        image = Image.open(input_path)

        if image.size[0] < 299 or image.size[1] < 299:
            continue
        
        # get original image size and calculate borders
        width, height = image.size
        left = (width - target_size[0]) // 2
        upper = (height - target_size[1]) // 2
        right = left + target_size[0]
        lower = upper + target_size[1]

        # crop the image
        cropped_image = image.crop((left, upper, right, lower))
        output_path = os.path.join(cropped_images_folder, category, file)
        cropped_image.save(output_path)
        image.close()

Amount of images in analysis dataset:
	COVID			 : 4596
         Lung_Opacity		 : 6012
         Normal			 : 10192
         Viral_Pneumonia	 : 2857

Maximum amount of images to use in the training:  2857


In [4]:
"""
-- Normalize pixel values and create training and testing datasets --

This cell normalizes the pixel values and creates a training
and testing datasets considering the stratify technique.
"""

parent_folder_path = 'images/cropped_dataset/'
categories = {'COVID': 0, 'Lung_Opacity': 1, 'Normal': 2, 'Viral_Pneumonia': 3}
arrays = []
category_amount = []

# get the category with the least images
for category in categories.keys():
    folder_path = os.path.join(parent_folder_path, category)
    image_files = os.listdir(folder_path)
    category_amount.append(len(image_files))

max_training = min(category_amount)

# convert the images into a pytorch dataset
for cat_folder, value in categories.items():

    folder_path = os.path.join(parent_folder_path, cat_folder)
    image_files = os.listdir(folder_path)

    for i, file_name in enumerate(image_files):

        if i >= max_training: break
        file_path = os.path.join(folder_path, file_name)
        image = Image.open(file_path)
        image_array = np.array(image)

        # verify all images are of the desired size
        if image.size != (250, 250):
            print(file_path, " IS NOT 250x250, it is: ", image.size)
            continue

        if image_array.shape != (250, 250):
            image_array = np.dot(image_array[..., :3], [0.2989, 0.5870, 0.1140])

        arrays.append(image_array)

# reshape the array
arrays = arrays/ np.max(arrays)
image_data = np.stack(arrays, axis=0)
image_data = image_data.reshape(len(image_data), 1, 250, 250)
image_data = torch.from_numpy(image_data).to(torch.float32)

# Stratify - get a random amount of values from
train_perc = 0.8

train_size = int(train_perc * max_training)
test_size = max_training - train_size

# Get the training and testing dataset
train_dataset = []
test_dataset = []
train_labels = []
test_labels = []

for category, value in categories.items():
    train_cat_dataset, test_cat_dataset = random_split(image_data[value*max_training:(value+1)*max_training], [train_size, test_size])
    train_dataset.append(train_cat_dataset)
    test_dataset.append(test_cat_dataset)
    train_cat_labels = [value] * train_size
    train_labels += train_cat_labels
    test_cat_labels = [value] * test_size
    test_labels += test_cat_labels

# Convert the lists into the types needed to train the models
train_dataset = ConcatDataset(train_dataset)
test_dataset = ConcatDataset(test_dataset)
train_labels = torch.from_numpy(np.array(train_labels)).to(torch.long)
test_labels = torch.from_numpy(np.array(test_labels))

### Feature extractor

### Filtro - Bilateral filter

## Prueba con Perceptrón multicapa (MLP)

En esta sección se realiza lo siguiente:
- se define el modelo de MLP
- se entrena el modelo sin feature extractor
- se prueba el modelo sin feature extractor
- se entrena el modelo con feature extractor
- se prueba el modelo con feature extractor

## Prueba con Red Convolucional 

En esta sección se realiza lo siguiente:
- se define el modelo de CNN
- se entrena el modelo con las imágenes sin filtro
- se prueba el modelo con las imágenes sin filtro
- se entrena el modelo con las imágenes con filtro
- se prueba el modelo con las imágenes con filtro

In [7]:
"""
-- Define the CNN module --

This cell defines a CNN module class of a network that has 2 internal
convolutional layers. Also some training and testing functions were
implemented.
"""

class CNN(nn.Module):
    def __init__(self, device, image_sizes= 299, kernel_size=3, max_pool_size=2, lr=0.001, epochs=3):
        """
        CNN contructor
        inputs:
        device - device in which the cnn will be executed
        kernel_size (hiperparameter) - this is the size that will be used in the convolution layers
            type: int, a single number will work with a grid of size n x n
        max_pool_size - this is the size of the pooling layer
        lr (hiperparameter) - the learning rate that will be used to train the cnn
        epochs (hiperparmeter) - the amount of iterations that will be used to train the cnn
        """
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=kernel_size, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=max_pool_size, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=kernel_size, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=max_pool_size, stride=2)
        self.fc1 = nn.Linear(32 * (image_sizes//(max_pool_size**2))**2, 128)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(128, 4)

        self.lr = lr
        self.epochs = epochs
        self.device = device


    def forward(self, x):
        """
        The forward function is used in the training to understand the infrastructure
        of the proposed model
        """
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        return x
    
    def train_cnn(self, train_dataset, train_labels):
        """
        Method to train the cnn based on the inputs:
        train_dataset - tensor with the information of the pixels of the images
        train_labels - tesnro with the category of each of the images frrom the train_dataset
        """
        # Set the loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), self.lr)

        # Create a data loader for the training dataset
        train_loader = DataLoader(dataset=list(zip(train_dataset, train_labels)), batch_size=16, shuffle=True)

        self.train()
        for epoch in range(self.epochs):
            running_loss = 0.0
            for images, labels in train_loader:
                images = images.to(self.device)
                labels = labels.to(self.device)

                optimizer.zero_grad()

                outputs = self(images)
                loss = criterion(outputs, labels)

                loss.backward()
                optimizer.step()

            running_loss += loss.item() * images.size(0)

            epoch_loss = running_loss / len(train_dataset)
            #print("Running loss: ", running_loss)
            #print(f"Epoch [{epoch+1}/{self.epochs}], Loss: {epoch_loss:.4f}")

        print(f"Finished training with lr={self.lr} and epochs={self.epochs}")

    def predict(self, test_dataset):
        """
        Method used to predict the outputs of the test_dataset
        with the model.
        Input: test_dataset -> tensor with inputs of every test image
        """
        predictions = []
        self.eval() 
        # Create a data loader for the training dataset
        dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

        with torch.no_grad(): 
            for inputs in dataloader:
                # Forward pass through the model to obtain predictions
                outputs = self(inputs)
                _, predicted = torch.max(outputs, 1)
                predictions.extend(predicted.tolist())
        
        return predictions


In [10]:
"""
-- Train the CNN with raw images --

This cell uses the raw images (without any filter) to train the CNN
model. First the best hyperparameters are found using optuna library
and then the model with the best hyperparameters found is instantiated.
"""

def objective(trial):

    # define hyperparameters to be optimized
    lr = trial.suggest_float('lr', 0.00001, 0.001, log=True)
    epochs = trial.suggest_int('epochs', 5, 50)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CNN(epochs=epochs, lr=lr, device=device, image_sizes=250).to(device)

    model.train_cnn(train_dataset, train_labels)   
    predictions = model.predict(test_dataset)
    
    accuracy = accuracy_score(test_labels, predictions)

    return accuracy

# Create optuna study and optimize the objetive function
begin_time = time.time()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)
finish_time = time.time()

optuna_time = finish_time - begin_time
print(f"Time taken to find best hyperparams -> {optuna_time} s")
# Print the best hyperparameters and the best objective value
best_params = study.best_params
best_value = study.best_value
print("Best Hyperparameters: ", best_params)
print("Best Accuracy: ", best_value)


# Instantiate the model with the best hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

begin_time = time.time()
model = CNN(epochs=study.best_params['epochs'], lr=study.best_params['lr'], device=device, image_sizes=250).to(device)
model.train_cnn(train_dataset, train_labels)
finish_time = time.time()

training_time = finish_time - begin_time
print(f"Time taken in training -> {training_time} s")

[32m[I 2023-05-19 09:07:53,254][0m A new study created in memory with name: no-name-a55bda56-4703-4be3-9919-a4a32df25c33[0m


Finished training with lr=1.730437906951078e-05 and epochs=43


[32m[I 2023-05-19 11:52:01,157][0m Trial 0 finished with value: 0.2521853146853147 and parameters: {'lr': 1.730437906951078e-05, 'epochs': 43}. Best is trial 0 with value: 0.2521853146853147.[0m


Finished training with lr=4.319882669481712e-05 and epochs=15


[32m[I 2023-05-19 12:48:06,469][0m Trial 1 finished with value: 0.24694055944055945 and parameters: {'lr': 4.319882669481712e-05, 'epochs': 15}. Best is trial 0 with value: 0.2521853146853147.[0m
[33m[W 2023-05-19 13:55:59,060][0m Trial 2 failed with parameters: {'lr': 1.2628353623207541e-05, 'epochs': 38} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "c:\Users\Juan Navarro\AppData\Local\Programs\Python\Python311\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Juan Navarro\AppData\Local\Temp\ipykernel_7104\3184732779.py", line 18, in objective
    model.train_cnn(train_dataset, train_labels)
  File "C:\Users\Juan Navarro\AppData\Local\Temp\ipykernel_7104\682899009.py", line 79, in train_cnn
    loss.backward()
  File "c:\Users\Juan Navarro\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\_tensor.py", line 487

KeyboardInterrupt: 

In [None]:
"""
-- Test the CNN model with raw images --

This cell traing the model previously trained with the test dataset.
It gets the predictiosn and then compares the results using metrics
like accuracy, precission, recall, f1 and roc/auc.
"""

predictions = model.predict(test_dataset)
print("Model Preductions: \n", predictions)
print("Real categories: \n", test_labels)

# get the metrics
print(confusion_matrix(test_labels, predictions))
print("Accuracy: ", accuracy_score(test_labels, predictions))
print("Precision: ", precision_score(test_labels, predictions, average=None))
print("Recall: ", recall_score(test_labels, predictions, average=None))
print("F1 score: ", f1_score(test_labels, predictions, average=None))

# for auc and roc there is an analysis for each category
# get the accurate predictions matrix
test_label_mat = []
predictions_mat = []

covid_true = []
lung_op_true = []
normal_true = []
viral_pneu_true = []

covid_pred = []
lung_op_pred = []
normal_pred = []
viral_pneu_pred = []

for i in range(len(test_labels)):
    
    # y_test
    covid_true.append(test_labels[i] == 0)
    lung_op_true.append(test_labels[i] == 1)
    normal_true.append(test_labels[i] == 2)
    viral_pneu_true.append(test_labels[i] == 3)

    # predictions
    covid_pred.append(predictions[i] == 0)
    lung_op_pred.append(predictions[i] == 1)
    normal_pred.append(predictions[i] == 2)
    viral_pneu_pred.append(predictions[i] == 3)

test_label_mat.append(covid_true)
test_label_mat.append(lung_op_true)
test_label_mat.append(normal_true)
test_label_mat.append(viral_pneu_true)

predictions_mat.append(covid_pred)
predictions_mat.append(lung_op_pred)
predictions_mat.append(normal_pred)
predictions_mat.append(viral_pneu_pred)

# print the results and make the needed graphics
auc = roc_auc_score(test_label_mat, predictions_mat, multi_class='ovo')
print("General AUC:", auc)

categories = {'COVID': 0, 'Lung_Opacity': 1, 'Normal': 2, 'Viral_Pneumonia': 3}

for category, value in categories.items():

    fpr, tpr, thresholds = roc_curve(test_label_mat[value], predictions_mat[value])

    auc = roc_auc_score(test_label_mat[value], predictions_mat[value])

    plt.plot(fpr, tpr, label=f'ROC Curve {category} (AUC = %0.2f)' % auc)
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line representing the random classifier
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic (ROC) Curve for {category}')
    plt.legend(loc='lower right')
    plt.show()

## Visualización de Mapas de Calor