# COVID-19 Fine-Tuning on CXR Data

In [2]:
# Author: Sasank Chilamkurthy

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import numpy as np
from openpyxl import Workbook

# Turn on matplotlib's interactive mode for the rest of the notebook.
plt.ion()   # interactive mode

In [3]:
# Data augmentation and normalization for training
# Just normalization for validation
# Both pipelines end with the same Resize(280) -> CenterCrop(224) -> ToTensor
# -> Normalize steps; only 'train' adds random colour/affine perturbations.
data_transforms = {
    'train': transforms.Compose([
        transforms.ColorJitter(brightness=(0.8, 1.2), contrast=(0.8, 1.3)),
        transforms.RandomAffine(degrees=0, translate=(0.15,0.1)),
        transforms.RandomAffine(degrees=(-10,10)),
        transforms.Resize(280),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(280),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# expanduser('~') still resolves the home directory when $HOME is unset,
# whereas os.getenv('HOME') would return None and make os.path.join raise.
data_dir = os.path.join(os.path.expanduser('~'), 'covid_data/CXR')
batch_size = 32

# One ImageFolder / DataLoader per split; each split lives in data_dir/<split>.
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size,
                                             shuffle=True, num_workers=4)
              for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes
val_class_names = image_datasets['val'].classes

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [4]:
# Report how many images each split holds, then the validation class names.
for split_label, split_key in (("Training dataset size: ", 'train'),
                               ("Validation dataset size: ", 'val')):
    print(split_label + str(dataset_sizes[split_key]))
print(val_class_names)

Training dataset size: 13955
Validation dataset size: 1578
['covid-19', 'normal', 'pneumonia']


In [7]:
# Per-class validation folders, used for counting images per class.
# expanduser('~') is used instead of os.getenv('HOME') so the path can still
# be built when $HOME is unset (getenv would return None and join would raise).
# NOTE(review): this root ('covid_data/CXR (train-test)') differs from the
# data_dir ('covid_data/CXR') that feeds the DataLoaders -- confirm both
# point at the same validation split.
val_dir = os.path.join(os.path.expanduser('~'), 'covid_data/CXR (train-test)/val/')
covid_data_dir = os.path.join(val_dir, 'covid-19')
normal_data_dir = os.path.join(val_dir, 'normal')
pneumonia_data_dir = os.path.join(val_dir, 'pneumonia')

In [8]:
# Count the directory entries in each validation class folder and report them.
print("######### Validation Dataset #########")
val_covid_num, val_normal_num, val_pneumonia_num = (
    len(os.listdir(class_dir))
    for class_dir in (covid_data_dir, normal_data_dir, pneumonia_data_dir)
)
for size_label, entry_count in (
    ("covid-19 size: ", val_covid_num),
    ("normal size: ", val_normal_num),
    ("pneumonia size: ", val_pneumonia_num),
):
    print(size_label + str(entry_count))


######### Validation Dataset #########
covid-19 size: 99
normal size: 885
pneumonia size: 594


In [9]:
# Create the workbook and grab its active worksheet; row 1 holds the
# column headers for the per-epoch results.
wb = Workbook()
ws = wb.active

for header_column, header_text in zip(
    'ABCDE',
    ('ResNet50', 'Val ACC', 'Covid ACC', 'Normal ACC', 'Pneumonia ACC'),
):
    ws[header_column + '1'] = header_text

In [16]:
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    """Fine-tune ``model`` and return it loaded with its best validation weights.

    Each epoch runs a 'train' pass followed by a 'val' pass over the
    module-level ``dataloaders``. The overall validation accuracy and the
    per-class accuracies (label 0 -> covid-19, 1 -> normal, 2 -> pneumonia)
    are printed and written, as 4-decimal strings, to the module-level
    worksheet ``ws`` (one row per epoch, starting at row 2). When training
    finishes, the workbook ``wb`` is saved to 'CXR/batch32.xlsx'.

    Per-class denominators are counted from the labels actually seen during
    the validation pass, so they always match what the DataLoader served.
    A class with no validation samples reports an accuracy of 0.0.

    The "best" per-class accuracies printed at the end are the per-class
    maxima over all epochs; they need not come from the epoch whose weights
    are restored.

    Args:
        model: network to train; updated in place.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per epoch, after the train pass.
        num_epochs: number of train/val epochs to run.

    Returns:
        ``model`` with the weights of the epoch that had the highest overall
        validation accuracy loaded.
    """
    since = time.time()
    num_tracked = 3  # labels 0, 1, 2 are the classes reported per epoch

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_class_acc = [0.0] * num_tracked

    for epoch in range(num_epochs):
        # Per-class tallies for this epoch's validation pass (kept on CPU).
        class_correct = torch.zeros(num_tracked, dtype=torch.long)
        class_total = torch.zeros(num_tracked, dtype=torch.long)

        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        row = str(epoch + 2)  # row 1 holds the column headers
        ws['A' + row] = 'Epoch' + str(epoch + 1)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; track history only while training
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics (loss is weighted by the actual batch size)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

                if not is_train:
                    # One host transfer per batch instead of one .item()
                    # call per sample.
                    labels_cpu = labels.cpu()
                    hits = labels_cpu[preds.cpu() == labels_cpu]
                    class_total += torch.bincount(
                        labels_cpu, minlength=num_tracked)[:num_tracked]
                    class_correct += torch.bincount(
                        hits, minlength=num_tracked)[:num_tracked]

            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            if not is_train:
                ws['B' + row] = '{:.4f}'.format(epoch_acc)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if not is_train and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        # Per-class accuracy for this epoch; guard against an absent class.
        class_acc = [
            (hit / seen) if seen else 0.0
            for hit, seen in zip(class_correct.tolist(), class_total.tolist())
        ]
        covid_acc, normal_acc, pneumonia_acc = class_acc

        ws['C' + row] = '{:.4f}'.format(covid_acc)
        ws['D' + row] = '{:.4f}'.format(normal_acc)
        ws['E' + row] = '{:.4f}'.format(pneumonia_acc)

        print('Covid ACC per epcoh: ' + str(covid_acc))
        print('Normal ACC per epoch: ' + str(normal_acc))
        print('Pneumonia ACC per epoch: ' + str(pneumonia_acc))

        # Track each class's best accuracy independently across epochs.
        best_class_acc = [
            max(best, current)
            for best, current in zip(best_class_acc, class_acc)
        ]

        print()

    time_elapsed = time.time() - since
    print("#######################################")
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    print('Covid ACC: ' + str(best_class_acc[0]))
    print('Normal ACC: ' + str(best_class_acc[1]))
    print('Pneumonia ACC: ' + str(best_class_acc[2]))

    # load best model weights
    model.load_state_dict(best_model_wts)

    # Make sure the output folder exists so the save cannot fail after a
    # full training run.
    if not os.path.isdir('CXR'):
        os.makedirs('CXR')
    wb.save('CXR/batch32.xlsx')  # save the results as an Excel file
    return model

In [17]:
# Start from a pretrained ResNet-50 and fine-tune every layer.
model_ft = models.resnet50(pretrained=True)
for param in model_ft.parameters():
    param.requires_grad = True  # explicit: no layer is frozen
num_ftrs = model_ft.fc.in_features
# Replace the classification head so it emits one logit per dataset class
# (derived from the training folder's classes instead of a hard-coded 3).
model_ft.fc = nn.Linear(num_ftrs, len(class_names))

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 3 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)

In [18]:
# Run the 20-epoch fine-tuning loop; model_ft is rebound to the returned model.
model_ft = train_model(
    model=model_ft,
    criterion=criterion,
    optimizer=optimizer_ft,
    scheduler=exp_lr_scheduler,
    num_epochs=20,
)

Epoch 0/19
----------
train Loss: 0.3573 Acc: 0.8717
val Loss: 0.2273 Acc: 0.9233
Covid ACC per epcoh: 0.6565656565656566
Normal ACC per epoch: 0.9457627118644067
Pneumonia ACC per epoch: 0.9343434343434344

Epoch 1/19
----------
train Loss: 0.2479 Acc: 0.9112
val Loss: 0.2089 Acc: 0.9309
Covid ACC per epcoh: 0.7272727272727273
Normal ACC per epoch: 0.9457627118644067
Pneumonia ACC per epoch: 0.9427609427609428

Epoch 2/19
----------
train Loss: 0.2068 Acc: 0.9276
val Loss: 0.3559 Acc: 0.8447
Covid ACC per epcoh: 0.7575757575757576
Normal ACC per epoch: 0.7672316384180791
Pneumonia ACC per epoch: 0.9747474747474747

Epoch 3/19
----------
train Loss: 0.1572 Acc: 0.9448
val Loss: 0.1652 Acc: 0.9474
Covid ACC per epcoh: 0.7676767676767676
Normal ACC per epoch: 0.9887005649717514
Pneumonia ACC per epoch: 0.9158249158249159

Epoch 4/19
----------
train Loss: 0.1403 Acc: 0.9502
val Loss: 0.1386 Acc: 0.9531
Covid ACC per epcoh: 0.8686868686868687
Normal ACC per epoch: 0.976271186440678
Pneumo