In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root of the chest X-ray dataset; each split lives in its own sub-directory.
data_path = '../input/chest-xray-pneumonia/chest_xray/'

train_path = f'{data_path}train/'
valid_path = f'{data_path}val/'
test_path = f'{data_path}test/'

In [None]:
from glob import glob

# Report how many entries each split contains. The "*/*" pattern matches
# everything one level below the split's immediate sub-directories.
# The label and the count are separated by ':' (a stray '"' was printed before).
for split_name, split_path in [('훈련', train_path), ('검증', valid_path), ('테스트', test_path)] :
    print(f'{split_name} 데이터 개수 : {len(glob(split_path + "*/*"))}')


In [None]:
# Show the entries directly under the training directory.
glob(f'{train_path}*')

In [None]:
# Collect every NORMAL / PNEUMONIA path across the train, val and test splits
# so the overall class balance can be inspected.
all_normal_imgs = []
all_pneumonia_imgs = []

for cat in ['train/', 'val/', 'test/'] :
    data_cat_path = data_path + cat

    normal_imgs = glob(data_cat_path + 'NORMAL/*')
    pneumonia_imgs = glob(data_cat_path + 'PNEUMONIA/*')
    all_normal_imgs.extend(normal_imgs)
    all_pneumonia_imgs.extend(pneumonia_imgs)

# Label and count are separated by ':' (a stray '"' was printed before).
print(f'정상 흉부 이미지 개수 : {len(all_normal_imgs)}')
print(f'폐렴 흉부 이미지 개수 : {len(all_pneumonia_imgs)}')

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Pie chart of the normal vs. pneumonia image counts.
mpl.rc('font', size=15)
plt.figure(figsize=(7, 7))

label = ['Normal', 'Pneumonia']
class_counts = [len(all_normal_imgs), len(all_pneumonia_imgs)]

plt.pie(class_counts, labels=label, autopct='%.1f%%')

In [None]:
import matplotlib.gridspec as gridspec
import cv2

def show_image(img_paths, rows=2, cols=3) :
    """Draw the images at ``img_paths`` on a ``rows`` x ``cols`` grid.

    Args:
        img_paths: sequence of image file paths; at most ``rows * cols`` items.
        rows: number of grid rows.
        cols: number of grid columns.

    Raises:
        AssertionError: if there are more paths than grid cells.
        FileNotFoundError: if OpenCV cannot read one of the files.

    Note:
        Sets the global matplotlib font size to 8 as a side effect.
    """
    assert len(img_paths) <= rows*cols

    mpl.rc('font', size=8)
    plt.figure(figsize=(15, 8))
    grid = gridspec.GridSpec(rows, cols)

    for idx, img_path in enumerate(img_paths) :
        image = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside imshow.
        if image is None :
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # OpenCV loads channels as BGR, while matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        ax = plt.subplot(grid[idx])
        ax.imshow(image)


In [None]:
# Preview the last few normal images that were collected.
num_of_imgs = 6
normal_img_paths = all_normal_imgs[-num_of_imgs:]
show_image(img_paths=normal_img_paths)

In [None]:
# Preview the last few pneumonia images that were collected.
pneumonia_img_paths = all_pneumonia_imgs[-num_of_imgs:]
show_image(img_paths=pneumonia_img_paths)

In [None]:
import torch
import random
import numpy as np
import os

# Seed every random number source used by the notebook.
seed = 50

# Python level: hash seed environment variable and the stdlib RNG.
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)

# NumPy, then PyTorch (default generator, current CUDA device, all CUDA devices).
np.random.seed(seed)
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all) :
    seed_fn(seed)

# cuDNN: request deterministic kernels, turn off auto-tuning, and finally
# disable the cuDNN backend altogether.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
from torchvision import transforms

# Per-channel statistics passed to Normalize in both pipelines.
# NOTE(review): these look like the usual ImageNet mean/std - confirm.
NORM_MEAN = (0.485, 0.456, 0.406)
NORM_STD = (0.229, 0.224, 0.225)

# Training pipeline: resize, centre-crop, random flips/rotation, then
# conversion to a normalized tensor.
transform_train = transforms.Compose([
    transforms.Resize(size=(250, 250)),
    transforms.CenterCrop(size=180),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.2),
    transforms.RandomRotation(degrees=20),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

# Evaluation pipeline: identical geometry and normalization, no random augmentation.
transform_test = transforms.Compose([
    transforms.Resize(size=(250, 250)),
    transforms.CenterCrop(size=180),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

In [None]:
from torchvision.datasets import ImageFolder

# Build the datasets: augmented transform for training, plain transform for validation.
datasets_train = ImageFolder(train_path, transform=transform_train)
datasets_valid = ImageFolder(valid_path, transform=transform_test)

In [None]:
def seed_worker(worker_id) :
    """Seed NumPy and the stdlib RNG from PyTorch's initial seed.

    Intended as a ``DataLoader`` ``worker_init_fn``; ``worker_id`` is accepted
    for that callback signature but is not used.
    """
    # Reduce the torch seed into the 32-bit range before handing it on.
    derived_seed = torch.initial_seed() % (1 << 32)
    for apply_seed in (np.random.seed, random.seed) :
        apply_seed(derived_seed)

# Dedicated generator, seeded with 0, passed to the data loaders.
g = torch.Generator()
g.manual_seed(0)

In [None]:
from torch.utils.data import DataLoader

batch_size = 8

# Both loaders use the same settings: shuffled batches, two workers seeded
# through seed_worker, and the shared generator g.
loader_kwargs = dict(
    batch_size=batch_size,
    shuffle=True,
    worker_init_fn=seed_worker,
    generator=g,
    num_workers=2,
)

loader_train = DataLoader(dataset=datasets_train, **loader_kwargs)
loader_valid = DataLoader(dataset=datasets_valid, **loader_kwargs)


In [None]:
!pip install efficientnet-pytorch==0.7.1

In [None]:
from efficientnet_pytorch import EfficientNet

# Baseline: pretrained EfficientNet-B0 with a 2-class head, placed on the selected device.
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=2).to(device)

In [None]:
# Total number of elements across all of the model's parameters.
print(
    '모델 파라미터 개수 : ',
    sum(weight.numel() for weight in model.parameters()),
)

In [None]:
import torch.nn as nn

# Loss function and optimizer for the baseline model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-2)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score
from tqdm.notebook import tqdm

def train(model, loader_train, loader_valid, criterion, optimizer, scheduler=None, epochs=10, save_file='model_state_dict.pth') :
    """Train ``model`` and return the weights of its best validation epoch.

    Each epoch runs one optimisation pass over ``loader_train`` and then
    evaluates on ``loader_valid`` (loss, accuracy, recall, F1). Whenever the
    mean validation loss is at or below the best value seen so far, the
    model's state dict is written to ``save_file``.

    Args:
        model: network to train; batches are moved to the module-level ``device``.
        loader_train: iterable of ``(images, labels)`` training batches.
        loader_valid: iterable of ``(images, labels)`` validation batches.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: optional LR scheduler; when given it is stepped once per
            training batch (not once per epoch).
        epochs: number of epochs to run.
        save_file: path the best state dict is saved to and reloaded from.

    Returns:
        The result of ``torch.load(save_file)``, i.e. the saved state dict.
    """
    valid_loss_min = np.inf
    checkpoint_saved = False

    for epoch in range(epochs) :
        print(f'에폭 [{epoch+1}/{epochs}]\n--------------------------')

        # ---- training pass ----
        model.train()
        epoch_train_loss = 0
        for images, labels in tqdm(loader_train) :
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_train_loss += loss.item()
            loss.backward()
            optimizer.step()
            if scheduler is not None :
                scheduler.step()

        print(f'\t훈련 데이터 손실값 : {epoch_train_loss/len(loader_train):.4f}')

        # ---- validation pass ----
        model.eval()
        epoch_valid_loss = 0
        preds_list = []
        true_list = []

        with torch.no_grad() :
            for images, labels in loader_valid :
                images = images.to(device)
                labels = labels.to(device)

                outputs = model(images)
                loss = criterion(outputs, labels)
                epoch_valid_loss += loss.item()

                # Predicted class = index of the largest output per sample.
                preds = torch.max(outputs.cpu(), dim=1)[1].numpy()
                true = labels.cpu().numpy()

                preds_list.extend(preds)
                true_list.extend(true)

        # Use the per-batch mean everywhere so the checkpoint message below
        # reports the same quantity as this line.
        mean_valid_loss = epoch_valid_loss/len(loader_valid)
        print(f'\t검증 데이터 손실값 : {mean_valid_loss:.4f}')

        val_accuracy = accuracy_score(true_list, preds_list)
        val_recall = recall_score(true_list, preds_list)
        val_f1_score = f1_score(true_list, preds_list)

        print(f'\t정확도 : {val_accuracy:.4f} / 재현율 : {val_recall:.4f} / F1 점수 : {val_f1_score:.4f} /')

        if mean_valid_loss <= valid_loss_min :
            print(f'\t### 검증 데이터 손실값 감소 ({valid_loss_min:.4f}-->{mean_valid_loss:.4f}). 모델저장')

            torch.save(model.state_dict(), save_file)
            valid_loss_min = mean_valid_loss
            checkpoint_saved = True

    # If no epoch ever qualified (e.g. the loss was NaN, or epochs == 0),
    # save_file is either missing or left over from an earlier call. Save the
    # current weights so the returned state dict always belongs to this model.
    if not checkpoint_saved :
        print('\t### 검증 손실값이 개선된 에폭이 없어 현재 가중치를 저장합니다')
        torch.save(model.state_dict(), save_file)

    return torch.load(save_file)
        

In [None]:
# Train the baseline model with train()'s default epoch count and checkpoint path.
model_state_dict = train(
    model=model,
    loader_train=loader_train,
    loader_valid=loader_valid,
    criterion=criterion,
    optimizer=optimizer,
)

In [None]:
# Restore the weights returned by train() into the baseline model.
model.load_state_dict(state_dict=model_state_dict)

In [None]:
# Test set: evaluation transform, batches served in dataset order (no shuffling).
datasets_test = ImageFolder(test_path, transform=transform_test)
loader_test = DataLoader(
    datasets_test,
    batch_size=batch_size,
    shuffle=False,
    worker_init_fn=seed_worker,
    generator=g,
    num_workers=2,
)

In [None]:
def predict(model, loader_test, return_true=False) :
    """Run ``model`` over ``loader_test`` and collect its predicted classes.

    Args:
        model: network to evaluate; it is switched to eval mode.
        loader_test: iterable of ``(images, labels)`` batches.
        return_true: when true, also return the labels read from the loader.

    Returns:
        ``preds_list`` alone, or ``(true_list, preds_list)`` when
        ``return_true`` is set. Both are flat lists with one entry per sample.
    """
    model.eval()
    predicted_labels, actual_labels = [], []

    with torch.no_grad() :
        for batch_images, batch_labels in loader_test :
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)

            # Predicted class = index of the largest output per sample.
            predicted_labels.extend(logits.cpu().max(dim=1).indices.numpy())
            actual_labels.extend(batch_labels.cpu().numpy())

    if not return_true :
        return predicted_labels
    return actual_labels, predicted_labels

In [None]:
# Baseline predictions on the test set, together with the true labels.
true_list, preds_list = predict(model, loader_test, return_true=True)

In [None]:
# Score the baseline model's test-set predictions.
test_accuracy = accuracy_score(true_list, preds_list)
test_recall = recall_score(true_list, preds_list)
test_f1_score = f1_score(true_list, preds_list)

# The third value is the F1 score, so it is labelled as such rather than as a
# second accuracy.
print(f'\t정확도 : {test_accuracy:.4f} / 재현율 : {test_recall:.4f} / F1 점수 : {test_f1_score:.4f} /')

In [None]:
# Ensemble members: pretrained EfficientNet B1-B3, each with a 2-class head,
# placed on the selected device.
efficientnet_b1 = EfficientNet.from_pretrained('efficientnet-b1', num_classes=2).to(device)
efficientnet_b2 = EfficientNet.from_pretrained('efficientnet-b2', num_classes=2).to(device)
efficientnet_b3 = EfficientNet.from_pretrained('efficientnet-b3', num_classes=2).to(device)

models_list = [efficientnet_b1, efficientnet_b2, efficientnet_b3]

In [None]:
# One AdamW optimizer per ensemble member, all with identical hyper-parameters.
optimizer1, optimizer2, optimizer3 = [
    torch.optim.AdamW(member.parameters(), lr=0.0006, weight_decay=0.001)
    for member in models_list
]

In [None]:
from transformers import get_cosine_schedule_with_warmup

epochs = 20

# One cosine schedule with warm-up per optimizer. Step counts are expressed in
# batches: three epochs' worth of warm-up out of `epochs` epochs in total.
scheduler1, scheduler2, scheduler3 = [
    get_cosine_schedule_with_warmup(
        member_optimizer,
        num_warmup_steps=len(loader_train)*3,
        num_training_steps=len(loader_train)*epochs,
    )
    for member_optimizer in (optimizer1, optimizer2, optimizer3)
]

In [None]:
# Train the first ensemble member, then load the weights train() returns.
model_state_dict = train(
    model=models_list[0],
    loader_train=loader_train,
    loader_valid=loader_valid,
    criterion=criterion,
    optimizer=optimizer1,
    scheduler=scheduler1,
    epochs=epochs,
)
models_list[0].load_state_dict(model_state_dict)

In [None]:
# Train the second ensemble member, then load the weights train() returns.
model_state_dict = train(
    model=models_list[1],
    loader_train=loader_train,
    loader_valid=loader_valid,
    criterion=criterion,
    optimizer=optimizer2,
    scheduler=scheduler2,
    epochs=epochs,
)
models_list[1].load_state_dict(model_state_dict)

In [None]:
# Train the third ensemble member, then load the weights train() returns.
model_state_dict = train(
    model=models_list[2],
    loader_train=loader_train,
    loader_valid=loader_valid,
    criterion=criterion,
    optimizer=optimizer3,
    scheduler=scheduler3,
    epochs=epochs,
)
models_list[2].load_state_dict(model_state_dict)

In [None]:
# Test-set predictions of the first ensemble member, plus the true labels.
true_list, preds_list1 = predict(models_list[0], loader_test, return_true=True)

In [None]:
# Test-set predictions of the second ensemble member.
preds_list2 = predict(models_list[1], loader_test)

In [None]:
# Test-set predictions of the third ensemble member.
preds_list3 = predict(models_list[2], loader_test)

In [None]:
# Score the first ensemble member's test-set predictions.
test_accuracy = accuracy_score(true_list, preds_list1)
test_recall = recall_score(true_list, preds_list1)
test_f1_score = f1_score(true_list, preds_list1)

print('efficientnet-b1 모델 예측 평가 점수')
# The third value is the F1 score, so it is labelled as such rather than as a
# second accuracy.
print(f'\t정확도 : {test_accuracy:.4f} / 재현율 : {test_recall:.4f} / F1 점수 : {test_f1_score:.4f} /')

In [None]:
# Score the second ensemble member's test-set predictions.
test_accuracy = accuracy_score(true_list, preds_list2)
test_recall = recall_score(true_list, preds_list2)
test_f1_score = f1_score(true_list, preds_list2)

print('efficientnet-b2 모델 예측 평가 점수')
# The third value is the F1 score, so it is labelled as such rather than as a
# second accuracy.
print(f'\t정확도 : {test_accuracy:.4f} / 재현율 : {test_recall:.4f} / F1 점수 : {test_f1_score:.4f} /')

In [None]:
# Score the third ensemble member's test-set predictions.
test_accuracy = accuracy_score(true_list, preds_list3)
test_recall = recall_score(true_list, preds_list3)
test_f1_score = f1_score(true_list, preds_list3)

print('efficientnet-b3 모델 예측 평가 점수')
# The third value is the F1 score, so it is labelled as such rather than as a
# second accuracy.
print(f'\t정확도 : {test_accuracy:.4f} / 재현율 : {test_recall:.4f} / F1 점수 : {test_f1_score:.4f} /')

In [None]:
# Combine the three models per sample: average their predicted class indices
# and round the mean to get the ensemble prediction.
ensemble_preds = [
    np.round((preds_list1[i] + preds_list2[i] + preds_list3[i]) / 3)
    for i in range(len(preds_list1))
]

In [None]:
# Score the ensemble's test-set predictions.
test_accuracy = accuracy_score(true_list, ensemble_preds)
test_recall = recall_score(true_list, ensemble_preds)
test_f1_score = f1_score(true_list, ensemble_preds)

print('최종 앙상블 모델 예측 평가 점수')
# The third value is the F1 score, so it is labelled as such rather than as a
# second accuracy.
print(f'\t정확도 : {test_accuracy:.4f} / 재현율 : {test_recall:.4f} / F1 점수 : {test_f1_score:.4f} /')