In [None]:
# https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia
!unzip '/content/drive/MyDrive/Colab Notebooks/kaggle/data/x-ray.zip'

In [3]:
data_path = '/content/chest_xray/'
train_path = data_path + 'train/'
test_path = data_path + 'test/'
valid_path = data_path + 'val/'

In [None]:
from glob import glob
print(len(glob(train_path + "NORMAL/*.*"))  + len(glob(train_path + "PNEUMONIA/*.*")))
print(len(glob(valid_path + "NORMAL/*.*"))  + len(glob(valid_path + "PNEUMONIA/*.*")))
print(len(glob(test_path + "NORMAL/*.*"))  + len(glob(test_path + "PNEUMONIA/*.*")))

5216
16
624


In [None]:
# 검증 데이터가 너무 작다 
# 타깃을 기준으로 데이터의 편향을 조사
normal_imgs = []
pneumonia_img = []

# NORMAL
len(glob(glob(train_path + "*")[0] + '/*.*')) + len(glob(glob(valid_path + "*")[0] + '/*.*')) + len(glob(glob(test_path + "*")[0] + '/*.*'))

1583

In [None]:
# PNEUMONIA
len(glob(glob(train_path + "*")[1] + '/*.*')) + len(glob(glob(valid_path + "*")[1] + '/*.*')) + len(glob(glob(test_path + "*")[1] + '/*.*'))

4273

In [None]:
# 검증 훈련 데이터를 제공하므로 분리필요 없음
# 검증데이터가 너무 작아서 검증데이터 성능 점수를 지나치게 신뢰할 수 없다
# 이미지크기가 같지 않다. - 크기 조정
# 모델 : efficientnet-b0
# 옵티 : Adam

# 성능개선 : efficientnet-b1, efficientnet-b2, efficientnet-b3 --> 앙상블
# 옵티 : AdamW

In [None]:
# 시드값 고정 ->데이터준비[이미지변환기, 데이터셋,  데이터로더]
#  -> 모델생성 -> 훈련및 검증(손실함수,옵티마이저,훈련함수,훈련및검증) -> 예측(예측함수작성)

In [4]:
import torch
import random
import numpy as np
import os

seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic=True
torch.backends.cudnn.benchmark=False
torch.backends.cudnn.enabled=False

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
# 이미지 변환기
from torchvision import transforms
transform_train = transforms.Compose([
    transforms.Resize((250,250)),
    transforms.CenterCrop(180),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomVerticalFlip(0.2),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
    transforms.Normalize( (0.485,0.456,0.406),(0.229,0.224,0.225))
])

In [7]:
from torchvision.datasets import ImageFolder
dataset_train = ImageFolder(root = train_path, transform=transform_train)
dataset_valid = ImageFolder(root = valid_path, transform=transform_train)

In [8]:
from torch.utils.data import DataLoader
batch_size  = 8
loader_train = DataLoader(dataset_train,batch_size = batch_size,shuffle = True)
loader_valid = DataLoader(dataset_valid,batch_size = batch_size,shuffle = False)

In [9]:
# EfficientNet 모델 생성
!pip install efficientnet-pytorch==0.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting efficientnet-pytorch==0.7.1
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: efficientnet-pytorch
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet-pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16444 sha256=28beaf0cf61c540ae48382fa58e3d36c77d290f3a72c44b55620bf9f85dbd878
  Stored in directory: /root/.cache/pip/wheels/29/16/24/752e89d88d333af39a288421e64d613b5f652918e39ef1f8e3
Successfully built efficientnet-pytorch
Installing collected packages: efficientnet-pytorch
Successfully installed efficientnet-pytorch-0.7.1


In [10]:
# b0모델을 불러오기
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=2)
model = model.to(device)

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b0-355c32eb.pth
100%|██████████| 20.4M/20.4M [00:00<00:00, 48.6MB/s]


Loaded pretrained weights for efficientnet-b0


In [None]:
# numel()  텐서 객체가 갖는 구성요서의 총 개수
sum([param.numel() for param in model.parameters()])
# y = ax + b  : 파라메터 a, b 2개

4010110

In [11]:
import torch.nn as nn
criterion = nn.CrossEntropyLoss()
opti = torch.optim.Adam(model.parameters(), lr=0.01)

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score # 재현율
from sklearn.metrics import f1_score
from tqdm import tqdm

In [13]:
def train(model, loader_train, loader_valid,criterion,opti,scheduler=None, epochs=5, save_file='model.pth'):
  valid_loss_min = np.inf
  for epoch in range(epochs):
    print(f'epoch [{epoch+1} / {epochs}]\n')
    model.train()
    epoch_train_loss = 0
    for images, labels in tqdm(loader_train):
      images = images.to(device)
      labels = labels.to(device)
      opti.zero_grad()
      outputs =  model(images)
      loss = criterion(outputs,labels)
      epoch_train_loss += loss.item()
      loss.backward()
      opti.step()
      if scheduler !=None:
        scheduler.step()
    print(f'loss : {epoch_train_loss/len(loader_train)}')
    # 검증
    model.eval()
    epoch_eval_loss = 0
    #  예측값 저장용 실제값 저장용
    pred_lists, true_lists = [],[]
    with torch.no_grad():
      for images, labels in loader_valid:
        images = images.to(device)
        labels = labels.to(device)
        # opti.zero_grad()
        outputs =  model(images)
        loss = criterion(outputs,labels)
        epoch_eval_loss += loss.item()
        preds = torch.max(outputs.cpu(), dim=1)[1].numpy()
        true = labels.cpu().numpy()
        pred_lists.extend(preds)
        true_lists.extend(true)
        # loss.backward()
        # opti.step()
        # if scheduler !=None:
        #   scheduler.step()
    val_acc = accuracy_score(true_lists,pred_lists)
    val_recall = recall_score(true_lists, pred_lists)
    val_f1_score = f1_score(true_lists, pred_lists)

    print(f'loss : {epoch_eval_loss/len(loader_valid)}')
    print(f'acc:{val_acc}  recall:{val_recall} val_f1:{val_f1_score}')
    
    if epoch_eval_loss <= valid_loss_min:
      valid_loss_min = epoch_eval_loss
      torch.save(model.state_dict(),save_file)
  return torch.load(save_file)

In [None]:
# train
model2 = train(model,epochs=1, loader_train=loader_train, loader_valid=loader_valid, criterion=criterion,opti=opti)
model.load_state_dict(model2)  # 가중치 불러오기

In [28]:
# 예측
transform_test = transforms.Compose([
    transforms.Resize((250,250)),    
    transforms.ToTensor(),
    transforms.Normalize( (0.485,0.456,0.406),(0.229,0.224,0.225))
])
dataset_test = ImageFolder('/content/chest_xray/test', transform=transform_test)

In [29]:
loader_test = DataLoader(dataset_test,batch_size = batch_size,shuffle = False)

In [25]:
def predict(model,loader_test):
  model.eval()
  epoch_eval_loss = 0
  #  예측값 저장용 실제값 저장용
  pred_lists, true_lists = [],[]
  with torch.no_grad():
    for images, labels in loader_valid:
      images = images.to(device)
      labels = labels.to(device)      
      outputs =  model(images)            
      preds = torch.max(outputs.cpu(), dim=1)[1].numpy()
      true = labels.cpu().numpy()
      pred_lists.extend(preds)
      true_lists.extend(true)    
  return pred_lists  ,true_lists

In [None]:
pred_lists  ,true_lists = predict(model, loader_test)
val_acc = accuracy_score(true_lists,pred_lists)
val_recall = recall_score(true_lists, pred_lists)
val_f1_score = f1_score(true_lists, pred_lists)
print(f'acc:{val_acc}  recall:{val_recall} val_f1:{val_f1_score}')

acc:0.875  recall:0.75 val_f1:0.8571428571428571


In [17]:
# 개선 - 앙상블기법
model_b1 = EfficientNet.from_pretrained('efficientnet-b1', num_classes=2)
model_b2 = EfficientNet.from_pretrained('efficientnet-b2', num_classes=2)
model_b3 = EfficientNet.from_pretrained('efficientnet-b3', num_classes=2)

model_b1 = model_b1.to(device)
model_b2 = model_b2.to(device)
model_b3 = model_b3.to(device)

model_lists = []
model_lists.append(model_b1)
model_lists.append(model_b2)
model_lists.append(model_b3)

Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b1-f1951068.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b1-f1951068.pth
100%|██████████| 30.1M/30.1M [00:00<00:00, 61.4MB/s]


Loaded pretrained weights for efficientnet-b1


Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b2-8bb594d6.pth
100%|██████████| 35.1M/35.1M [00:00<00:00, 87.6MB/s]


Loaded pretrained weights for efficientnet-b2


Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b3-5fb5a3c3.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b3-5fb5a3c3.pth
100%|██████████| 47.1M/47.1M [00:00<00:00, 55.5MB/s]


Loaded pretrained weights for efficientnet-b3


In [18]:
#  손실함수는 위에 모델 재사용
# 옵티마이져는 각각
opti_b1 = torch.optim.AdamW(model_b1.parameters(), lr=0.0001, weight_decay=0.001)
opti_b2 = torch.optim.AdamW(model_b2.parameters(), lr=0.0001, weight_decay=0.001)
opti_b3 = torch.optim.AdamW(model_b3.parameters(), lr=0.0001, weight_decay=0.001)

In [None]:
# 스케줄러 생성
!pip install transformers

In [19]:
from transformers import get_cosine_schedule_with_warmup
epochs = 1  # 30
sch_b1 = get_cosine_schedule_with_warmup(opti_b1, num_warmup_steps=len(loader_train)*3,
                                         num_training_steps=len(loader_train)*epochs )
sch_b2 = get_cosine_schedule_with_warmup(opti_b2, num_warmup_steps=len(loader_train)*3,
                                         num_training_steps=len(loader_train)*epochs )
sch_b3 = get_cosine_schedule_with_warmup(opti_b3, num_warmup_steps=len(loader_train)*3,
                                         num_training_steps=len(loader_train)*epochs )


In [21]:
# 첫번째 train
param = train(model_b1,epochs=1, loader_train=loader_train, loader_valid=loader_valid, 
              scheduler=sch_b1 ,criterion=criterion,opti=opti_b1)
model_b1.load_state_dict(param)  # 가중치 불러오기

epoch [1 / 1]



100%|██████████| 652/652 [02:54<00:00,  3.73it/s]


loss : 0.4962855108836494
loss : 0.7436743099242449
acc:0.6875  recall:1.0 val_f1:0.761904761904762


<All keys matched successfully>

In [22]:
# 두번째 train
param = train(model_b2,epochs=1, loader_train=loader_train, loader_valid=loader_valid, 
              scheduler=sch_b2 ,criterion=criterion,opti=opti_b2)
model_b2.load_state_dict(param)  # 가중치 불러오기

epoch [1 / 1]



100%|██████████| 652/652 [02:49<00:00,  3.85it/s]


loss : 0.47431092296778427
loss : 0.7612986322492361
acc:0.625  recall:1.0 val_f1:0.7272727272727273


<All keys matched successfully>

In [23]:
# 세번째 train
param = train(model_b3,epochs=1, loader_train=loader_train, loader_valid=loader_valid, 
              scheduler=sch_b3 ,criterion=criterion,opti=opti_b3)
model_b3.load_state_dict(param)  # 가중치 불러오기

epoch [1 / 1]



100%|██████████| 652/652 [03:01<00:00,  3.59it/s]


loss : 0.428489580819029
loss : 0.7084123007953167
acc:0.5625  recall:1.0 val_f1:0.6956521739130436


<All keys matched successfully>

In [30]:
# 평가 3
val_lists,val_recalls,val_f1s = [],[],[]
for m in model_lists:
  pred_lists  ,true_lists = predict(m, loader_test)
  val_acc = accuracy_score(true_lists,pred_lists)
  val_recall = recall_score(true_lists, pred_lists)
  val_f1_score = f1_score(true_lists, pred_lists)
  print(f'acc:{val_acc}  recall:{val_recall} val_f1:{val_f1_score}')
  val_lists.append(val_acc)
  val_recalls.append(val_recall)
  val_f1s.append(val_f1_score)

acc:0.6875  recall:1.0 val_f1:0.761904761904762
acc:0.625  recall:1.0 val_f1:0.7272727272727273
acc:0.6875  recall:1.0 val_f1:0.761904761904762
