In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


from sklearn.metrics import f1_score,roc_auc_score


import timm
from timm.models.efficientnet import *

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict


import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
# Run identifier; it is embedded in the checkpoint file names written and read further down.
job = 51

In [3]:
# Hyper-parameter table shared by the loaders, the optimizer and the training loop.
CONFIG = {
    "seed": 2022,            # NOTE(review): the seeding cell calls set_seed() without passing this value
    "epochs": 50,            # author note: 24
    "img_size": 256,         # author note: 512; the datasets/transforms shown here hard-code 256
    "train_batch_size": 8,   # scans per training batch (author note: 16)
    "valid_batch_size": 32,  # scans per validation batch
    "learning_rate": 0.0001,
    "weight_decay": 0.0005,
    "n_accumulate": 1,       # optimizer step every n batches (author note: 2)
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    "train_batch": 16,       # slices drawn from each scan by the datasets
}

In [4]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook draws from.

    Seeds Python's ``random`` (the datasets pick slices with ``random.sample`` /
    ``random.choice``), NumPy and PyTorch (CPU and all CUDA devices), and exports
    ``PYTHONHASHSEED``.

    Note: cuDNN is left in its autotuned mode (``benchmark=True``,
    ``deterministic=False``), so GPU runs are seeded but are not guaranteed to be
    bit-for-bit identical from run to run.
    '''
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN autotuning is kept on for speed; this trades away strict determinism
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True
    # Exported for child processes; the hash seed of the running interpreter is already fixed
    os.environ['PYTHONHASHSEED'] = str(seed)

# NOTE(review): called with the default seed (42); CONFIG["seed"] (2022) is not passed — confirm which is intended.
set_seed()

In [5]:
# Scan-level index tables; later cells read their "filename", "path" and "label" columns.
train_df=pd.read_csv("/home/fate/covid19_CT/input/df_train_pure.csv")
valid_df=pd.read_csv("/home/fate/covid19_CT/input/df_valid_pure.csv")

In [6]:
# Peek at the first validation scan directory (row label 0 of the "path" column).
valid_df.loc[0, "path"]

'/home/fate/covid19_CT/input/valid_pure/covid/ct_scan_152'

In [7]:
# Point every scan directory at its cropped counterpart.
# The negative look-ahead skips occurrences that are already followed by "_crop",
# so re-running this cell cannot turn "train_pure_crop" into "train_pure_crop_crop".
train_df["path"] = train_df["path"].str.replace(r"train_pure(?!_crop)", "train_pure_crop", regex=True)
valid_df["path"] = valid_df["path"].str.replace(r"valid_pure(?!_crop)", "valid_pure_crop", regex=True)

In [8]:
# valid_df

In [9]:
import pickle

In [10]:
# Slice windows for the training scans; the training dataset looks them up by scan directory path.
# NOTE: pickle.load can execute code embedded in the file — only load artifacts you trust.
with open('/home/fate/covid19_CT/output/train_dic1_05.pickle', 'rb') as train_dic_file:
    train_dic = pickle.load(train_dic_file)

In [11]:
# Slice windows for the validation scans; the validation datasets look them up by scan directory path.
# NOTE: pickle.load can execute code embedded in the file — only load artifacts you trust.
with open('/home/fate/covid19_CT/output/valid_dic1_05.pickle', 'rb') as valid_dic_file:
    valid_dic = pickle.load(valid_dic_file)

In [12]:
# valid_dic

In [13]:
class Covid19Dataset(Dataset):
    """Training dataset: one item is a fixed-size bundle of slices from one CT scan.

    Every row of ``df`` describes a scan. ``path`` is a directory whose entries are
    named ``<integer>.<ext>``; ``label`` is the scan-level class. The slice window of
    a scan is read from the module-level ``train_dic``, which maps the scan path to
    ``(start_idx, end_idx)`` — positions in the numerically sorted slice list, end
    exclusive.

    Args:
        df: DataFrame with ``filename``, ``path`` and ``label`` columns.
        train_batch: number of slices drawn per scan.
        transforms: callable used as ``transforms(image=rgb)['image']``; its result
            is written into a ``3 x 256 x 256`` slot.
    """

    def __init__(self, df,train_batch=10,transforms=None):
        self.df = df
        self.file_names = df['filename'].values
        self.path = df['path'].values
        self.labels = df['label'].values
        self.transforms = transforms
        self.img_batch=train_batch

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'image': (img_batch, 3, 256, 256) float, 'label': (img_batch, 1) long}``.

        Raises:
            FileNotFoundError: if a selected slice cannot be decoded by OpenCV.
        """
        label = self.labels[index]
        img_path = self.path[index]

        # A leading "._" is stripped from an entry name; the entry itself is kept.
        # NOTE(review): presumably macOS "._" side files — confirm that keeping them
        # (as a second reference to the same image) is intended.
        slice_files = [name[2:] if name.startswith("._") else name
                       for name in os.listdir(img_path)]
        slice_numbers = [int(name.split('.')[0]) for name in slice_files]
        # index_sort[k] is the position in slice_files of the k-th slice in numeric order
        index_sort = sorted(range(len(slice_numbers)), key=lambda k: slice_numbers[k])
        ct_len = len(slice_numbers)

        start_idx, end_idx = train_dic[img_path]

        if (end_idx - start_idx) > self.img_batch:
            # Window is large enough: distinct slices
            sample_idx = random.sample(range(start_idx, end_idx), self.img_batch)
        elif ct_len > 20:
            # Small window: draw from it with replacement
            sample_idx = [random.choice(range(start_idx, end_idx)) for _ in range(self.img_batch)]
        else:
            # Scans of at most 20 slices ignore the window and draw from the whole scan
            sample_idx = [random.choice(range(ct_len)) for _ in range(self.img_batch)]

        # The buffer size is fixed; it has to match the Resize used in the transforms
        img_sample = torch.zeros((self.img_batch, 3, 256, 256))
        for count, idx in enumerate(sample_idx):
            img_path_ = os.path.join(img_path, slice_files[index_sort[idx]])

            img = cv2.imread(img_path_)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising
                raise FileNotFoundError(f"Could not read image: {img_path_}")
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img_sample[count] = self.transforms(image=img)['image']

        # Every slice inherits the scan label; built directly as int64 so no
        # tensor-from-tensor copy (and its UserWarning) is needed
        label_sample = torch.full((self.img_batch, 1), int(label), dtype=torch.long)
        return {
            'image': img_sample,
            'label': label_sample
        }
        
        
        
        
        


In [14]:
class Covid19Dataset_valid(Dataset):
    """Validation dataset: one item is a fixed-size bundle of slices from one CT scan.

    Same layout as the training dataset, but slice windows come from the
    module-level ``valid_dic`` (scan path -> ``(start_idx, end_idx)``, positions in
    the numerically sorted slice list, end exclusive) and distinct-slice sampling is
    selected by scan length (more than 40 slices).

    Args:
        df: DataFrame with ``filename``, ``path`` and ``label`` columns.
        train_batch: number of slices drawn per scan.
        transforms: callable used as ``transforms(image=rgb)['image']``; its result
            is written into a ``3 x 256 x 256`` slot.
    """

    def __init__(self, df,train_batch=10,transforms=None):
        self.df = df
        self.file_names = df['filename'].values
        self.path = df['path'].values
        self.labels = df['label'].values
        self.transforms = transforms
        self.img_batch=train_batch

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'image': (img_batch, 3, 256, 256) float, 'label': (img_batch, 1) long}``.

        Raises:
            FileNotFoundError: if a selected slice cannot be decoded by OpenCV.
        """
        label = self.labels[index]
        img_path = self.path[index]

        # A leading "._" is stripped from an entry name; the entry itself is kept.
        # NOTE(review): presumably macOS "._" side files — confirm that keeping them
        # (as a second reference to the same image) is intended.
        slice_files = [name[2:] if name.startswith("._") else name
                       for name in os.listdir(img_path)]
        slice_numbers = [int(name.split('.')[0]) for name in slice_files]
        # index_sort[k] is the position in slice_files of the k-th slice in numeric order
        index_sort = sorted(range(len(slice_numbers)), key=lambda k: slice_numbers[k])
        ct_len = len(slice_numbers)

        start_idx, end_idx = valid_dic[img_path]
        window = range(start_idx, end_idx)

        if ct_len > 40 and len(window) >= self.img_batch:
            # Distinct slices. The window-size guard matters: a long scan can still
            # have a window shorter than img_batch, and random.sample would raise.
            sample_idx = random.sample(window, self.img_batch)
        elif ct_len > 20:
            # Draw from the window with replacement
            sample_idx = [random.choice(window) for _ in range(self.img_batch)]
        else:
            # Scans of at most 20 slices ignore the window and draw from the whole scan
            sample_idx = [random.choice(range(ct_len)) for _ in range(self.img_batch)]

        # The buffer size is fixed; it has to match the Resize used in the transforms
        img_sample = torch.zeros((self.img_batch, 3, 256, 256))
        for count, idx in enumerate(sample_idx):
            img_path_ = os.path.join(img_path, slice_files[index_sort[idx]])

            img = cv2.imread(img_path_)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising
                raise FileNotFoundError(f"Could not read image: {img_path_}")
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img_sample[count] = self.transforms(image=img)['image']

        # Every slice inherits the scan label; built directly as int64 so no
        # tensor-from-tensor copy (and its UserWarning) is needed
        label_sample = torch.full((self.img_batch, 1), int(label), dtype=torch.long)
        return {
            'image': img_sample,
            'label': label_sample
        }
        
        
        
        
        


In [15]:
class Covid19Dataset_valid1(Dataset):
    """Validation dataset variant that chooses its sampling mode by window size.

    Slice windows come from the module-level ``valid_dic`` (scan path ->
    ``(start_idx, end_idx)``, positions in the numerically sorted slice list, end
    exclusive). Distinct slices are drawn only when the window holds more than
    ``train_batch`` slices. ``prepare_loaders`` in this notebook does not use this
    class.

    Args:
        df: DataFrame with ``filename``, ``path`` and ``label`` columns.
        train_batch: number of slices drawn per scan.
        transforms: callable used as ``transforms(image=rgb)['image']``; its result
            is written into a ``3 x 256 x 256`` slot.
    """

    def __init__(self, df,train_batch=10,transforms=None):
        self.df = df
        self.file_names = df['filename'].values
        self.path = df['path'].values
        self.labels = df['label'].values
        self.transforms = transforms
        self.img_batch=train_batch

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'image': (img_batch, 3, 256, 256) float, 'label': (img_batch, 1) long}``.

        Raises:
            FileNotFoundError: if a selected slice cannot be decoded by OpenCV.
        """
        label = self.labels[index]
        img_path = self.path[index]

        # A leading "._" is stripped from an entry name; the entry itself is kept.
        # NOTE(review): presumably macOS "._" side files — confirm that keeping them
        # (as a second reference to the same image) is intended.
        slice_files = [name[2:] if name.startswith("._") else name
                       for name in os.listdir(img_path)]
        slice_numbers = [int(name.split('.')[0]) for name in slice_files]
        # index_sort[k] is the position in slice_files of the k-th slice in numeric order
        index_sort = sorted(range(len(slice_numbers)), key=lambda k: slice_numbers[k])
        ct_len = len(slice_numbers)

        start_idx, end_idx = valid_dic[img_path]

        if (end_idx - start_idx) > self.img_batch:
            # Window is large enough: distinct slices
            sample_idx = random.sample(range(start_idx, end_idx), self.img_batch)
        elif ct_len > 20:
            # Small window: draw from it with replacement
            sample_idx = [random.choice(range(start_idx, end_idx)) for _ in range(self.img_batch)]
        else:
            # Scans of at most 20 slices ignore the window and draw from the whole scan
            sample_idx = [random.choice(range(ct_len)) for _ in range(self.img_batch)]

        # The buffer size is fixed; it has to match the Resize used in the transforms
        img_sample = torch.zeros((self.img_batch, 3, 256, 256))
        for count, idx in enumerate(sample_idx):
            img_path_ = os.path.join(img_path, slice_files[index_sort[idx]])

            img = cv2.imread(img_path_)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising
                raise FileNotFoundError(f"Could not read image: {img_path_}")
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            img_sample[count] = self.transforms(image=img)['image']

        # Every slice inherits the scan label; built directly as int64 so no
        # tensor-from-tensor copy (and its UserWarning) is needed
        label_sample = torch.full((self.img_batch, 1), int(label), dtype=torch.long)
        return {
            'image': img_sample,
            'label': label_sample
        }
        
        
        
        
        


In [16]:
# Albumentations pipelines keyed by split name. Both end with Resize(256, 256) ->
# Normalize -> ToTensorV2, which is what the fixed 3 x 256 x 256 dataset buffers expect.
data_transforms = {
    "train": A.Compose(
        [
            A.HorizontalFlip(p=0.5),
            A.Resize(256, 256),
            A.ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=30, p=0.5),
            # NOTE(review): limits of 0.2 look very small for 8-bit images — presumably
            # this step is close to a no-op; confirm against the albumentations docs.
            A.HueSaturationValue(
                hue_shift_limit=0.2,
                sat_shift_limit=0.2,
                val_shift_limit=0.2,
                p=0.5,
            ),
            A.RandomBrightnessContrast(
                brightness_limit=(-0.2, 0.2),  # author note: 0.2
                contrast_limit=(-0.2, 0.2),    # author note: 0.2
                p=0.5,
            ),
            A.Normalize(),
            ToTensorV2(),
        ],
        p=1.,
    ),
    "valid": A.Compose(
        [
            A.Resize(256, 256),
            A.Normalize(),
            ToTensorV2(),
        ],
        p=1.,
    ),
}

In [17]:
def prepare_loaders():
    """Build the training and validation DataLoaders from the module-level tables.

    Reads ``train_df`` / ``valid_df``, ``data_transforms`` and ``CONFIG``. Each
    dataset item is a bundle of ``CONFIG['train_batch']`` slices from one scan.

    Returns:
        tuple: ``(train_loader, valid_loader)``.
    """
    slices_per_scan = CONFIG['train_batch']

    train_dataset = Covid19Dataset(
        train_df, slices_per_scan, transforms=data_transforms["train"]
    )
    valid_dataset = Covid19Dataset_valid(
        valid_df, slices_per_scan, transforms=data_transforms["valid"]
    )

    # Training: shuffled, incomplete final batch dropped
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG["train_batch_size"],
        num_workers=8,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
    )
    # Validation: fixed scan order, every scan kept
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CONFIG["valid_batch_size"],
        num_workers=8,
        shuffle=False,
        pin_memory=True,
    )

    return train_loader, valid_loader

In [18]:
class Net(nn.Module):
    """EfficientNet-B3a feature extractor followed by two linear layers giving one logit per image.

    The attribute names ``b0`` .. ``b8``, ``emb`` and ``logit`` determine the
    ``state_dict`` keys of the checkpoints this notebook saves and reloads.
    """

    def __init__(self):
        super().__init__()

        # NOTE(review): drop_rate presumably only applies inside timm's own classifier
        # head, which is not used here — confirm against the timm version in use.
        backbone = efficientnet_b3a(pretrained=True, drop_rate=0.3, drop_path_rate=0.2)

        # Stem
        self.b0 = nn.Sequential(backbone.conv_stem, backbone.bn1, backbone.act1)
        # The seven block groups, registered as b1 .. b7
        for group_no in range(7):
            setattr(self, f"b{group_no + 1}", backbone.blocks[group_no])
        # Head convolution (original note: 384, 1536)
        self.b8 = nn.Sequential(backbone.conv_head, backbone.bn2, backbone.act2)

        self.emb = nn.Linear(1536, 224)
        self.logit = nn.Linear(224, 1)

    def forward(self, image):
        """Map a batch of images ``(N, C, H, W)`` to raw logits ``(N, 1)``."""
        # NOTE(review): the loaders already apply A.Normalize(); this extra affine
        # rescale is what the saved checkpoints were trained with — confirm intent.
        x = 2 * image - 1

        stages = (self.b0, self.b1, self.b2, self.b3, self.b4,
                  self.b5, self.b6, self.b7, self.b8)
        for stage in stages:
            x = stage(x)

        # Global average pooling, flattened to one feature vector per image
        x = F.adaptive_avg_pool2d(x, 1).reshape(len(image), -1)

        return self.logit(self.emb(x))



In [19]:
def criterion(outputs, labels):
    """Binary cross-entropy computed on raw logits, averaged over all elements."""
    return F.binary_cross_entropy_with_logits(outputs, labels)

In [20]:
model = Net()

# Use the configured device rather than a hard .cuda(), so the cell also runs on CPU-only hosts
model = model.to(CONFIG["device"])
# Gradient scaling is only meaningful for CUDA mixed precision; disable it otherwise
scaler = amp.GradScaler(enabled=torch.cuda.is_available())

In [21]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Run one training pass and return the mean per-image loss.

    Each batch holds ``data['image']`` of shape ``(scans, slices, C, H, W)`` and
    ``data['label']`` with one entry per slice; both are flattened so that every
    slice is an independent sample. Uses the module-level ``CONFIG['n_accumulate']``,
    ``scaler`` and ``criterion``.

    Args:
        model: network mapping ``(N, C, H, W)`` images to ``(N, 1)`` logits.
        optimizer: optimizer stepped every ``n_accumulate`` batches.
        scheduler: optional LR scheduler, stepped together with the optimizer.
        dataloader: iterable of batches as described above.
        device: device the tensors are moved to.
        epoch: epoch number, shown in the progress bar only.

    Returns:
        float: sample-weighted mean of the un-scaled batch losses (``0.0`` when the
        loader yields no batches).
    """
    model.train()
    accumulate = CONFIG['n_accumulate']

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = 0.0  # stays 0.0 if the loader is empty

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        _, _, c, h, w = data['image'].size()
        images = data['image'].reshape(-1, c, h, w).to(device, dtype=torch.float)
        labels = data['label'].reshape(-1, 1).to(device, dtype=torch.float)

        batch_size = images.size(0)

        with amp.autocast(enabled = True):
            outputs = model(images)
            batch_loss = criterion(outputs, labels)
            # Only the value that is back-propagated is divided for accumulation
            loss = batch_loss / accumulate

        scaler.scale(loss).backward()

        if (step + 1) % accumulate == 0:
            scaler.unscale_(optimizer)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        # Track the un-divided loss so the reported value does not shrink with n_accumulate
        running_loss += (batch_loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

In [22]:
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate ``model`` once over ``dataloader`` and report scan-level metrics.

    Each batch holds ``data['image']`` of shape ``(scans, slices, C, H, W)``; slices
    are scored independently and their sigmoid probabilities are then averaged per
    scan. Uses the module-level ``criterion``.

    Args:
        model: network mapping ``(N, C, H, W)`` images to ``(N, 1)`` logits.
        dataloader: iterable of batches as described above.
        device: device the tensors are moved to.
        epoch: epoch number, shown in the progress bar only.
        optimizer: optional; when given, its learning rate is shown in the progress
            bar. The function no longer reads a module-level ``optimizer``, so it
            also works in a session where none has been created.

    Returns:
        tuple: ``(epoch_loss, acc_f1, auc_roc)`` — mean per-slice loss, macro F1 of
        the rounded scan probabilities, and ROC AUC of the scan probabilities.
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    true_y=[]
    pred_y=[]
    for step, data in bar:
        _, img_b, c, h, w = data['image'].size()
        images = data['image'].reshape(-1, c, h, w).to(device, dtype=torch.float)
        labels = data['label'].reshape(-1, 1).to(device, dtype=torch.float)

        batch_size = images.size(0)

        outputs = model(images)
        loss = criterion(outputs, labels)

        true_y.append(labels.cpu().numpy())
        pred_y.append(torch.sigmoid(outputs).cpu().numpy())

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = dict(Epoch=epoch, Valid_Loss=epoch_loss)
        if optimizer is not None:
            postfix['LR'] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()

    # Rows were flattened scan by scan, so each run of img_b values belongs to one scan
    true_y = np.concatenate(true_y).reshape(-1, img_b).mean(axis=1)
    pred_y = np.concatenate(pred_y).reshape(-1, img_b).mean(axis=1)

    acc_f1=f1_score(true_y,np.round(pred_y),average='macro')
    auc_roc=roc_auc_score(true_y,pred_y)
    print("acc_f1(mean) : ",round(acc_f1,4),"  auc_roc(mean) : ",round(auc_roc,4))

    return epoch_loss,acc_f1,auc_roc

In [23]:

def _save_checkpoint(model, directory):
    """Write ``model``'s state_dict to ``<directory>job_<job>_effnetb3a.bin`` and announce it."""
    PATH = directory+f"job_{job}_effnetb3a"+".bin"
    torch.save(model.state_dict(), PATH)
    print(f"Model Saved")


def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train for ``num_epochs`` epochs and checkpoint the best model per metric.

    After every epoch the model is validated; whenever the validation loss, ROC AUC
    or macro F1 matches or beats its best value so far, the current weights are
    written to the corresponding checkpoint directory. Uses the module-level
    ``train_loader``, ``valid_loader`` and ``job``.

    Args:
        model: network to train (updated in place).
        optimizer: optimizer passed to ``train_one_epoch``.
        scheduler: optional LR scheduler passed to ``train_one_epoch``.
        device: device used for both training and validation.
        num_epochs: number of epochs to run.

    Returns:
        tuple: ``(model, history)``. ``model`` carries the weights of the last epoch
        (the best ones live in the checkpoint files); ``history`` maps
        ``'Train Loss'``, ``'Valid Loss'``, ``'Valid F1'`` and ``'Valid AUROC'`` to
        per-epoch lists.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_epoch_loss = np.inf
    best_epoch_auc = 0
    best_epoch_f1 = 0
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss,acc_f1,auc_roc= valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        history['Valid F1'].append(acc_f1)
        history['Valid AUROC'].append(auc_roc)

        # Ties count as an improvement, so the most recent of equally good epochs is kept
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            _save_checkpoint(model, '/home/fate/covid19_CT/model/loss/')

        if auc_roc >= best_epoch_auc:
            print(f"Validation Auc Improved ({best_epoch_auc} ---> {auc_roc})")
            best_epoch_auc = auc_roc
            _save_checkpoint(model, '/home/fate/covid19_CT/model/auc_roc/')

        if acc_f1 >= best_epoch_f1:
            print(f"Validation f1 Improved ({best_epoch_f1} ---> {acc_f1})")
            best_epoch_f1 = acc_f1
            _save_checkpoint(model, '/home/fate/covid19_CT/model/f1/')

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    return model, history

In [24]:
# Module-level loaders; run_training and the prediction cells below read these names directly.
train_loader, valid_loader = prepare_loaders()

In [25]:
# Adam over every model parameter; learning rate and weight decay come from CONFIG.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
)


In [None]:
# No scheduler is passed, so the learning rate stays at its initial value for the whole run.
model, history = run_training(
    model,
    optimizer,
    scheduler=None,
    device=CONFIG['device'],
    num_epochs=CONFIG['epochs'],
)

In [28]:
# Checkpoint written by run_training whenever the validation macro F1 matched or beat its best value.
pred_path='/home/fate/covid19_CT/model/f1/'+f"job_{job}_effnetb3a"+".bin"

In [29]:
# map_location makes a checkpoint saved from GPU tensors loadable on whichever device CONFIG selected
model.load_state_dict(torch.load(pred_path, map_location=CONFIG['device']))
model.to(CONFIG['device']);

In [30]:
@torch.inference_mode()
def pred_one(model, dataloader, device):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    true_y=[]
    pred_y=[]
    for step, data in bar:
        ct_b, img_b, c, h, w = data['image'].size()
        data_img = data['image'].reshape(-1, c, h, w)
        data_label = data['label'].reshape(-1,1)
        
        images = data_img.to(device, dtype=torch.float)
        labels = data_label.to(device, dtype=torch.float)
        
        batch_size = images.size(0)

        outputs = model(images)
        loss = criterion(outputs, labels)

        
        true_y.append(labels.cpu().numpy())
        pred_y.append(torch.sigmoid(outputs).cpu().numpy())
        

    

    true_y=np.concatenate(true_y)
    pred_y=np.concatenate(pred_y)
    
    
    
   
    gc.collect()
    
    true_y=np.array(true_y).reshape(-1,1)
    true_y=np.array(true_y).reshape(-1,img_b)
    true_y=true_y.mean(axis=1)
  
  
    pred_y=np.array(pred_y).reshape(-1,1)
    pred_y=np.array(pred_y).reshape(-1,img_b)

    pred_y=pred_y.mean(axis=1)
    
    return true_y,pred_y
    

In [35]:
# Ten prediction passes over the validation loader. The validation dataset draws its
# slices at random on every access, so each pass scores a different slice sample per scan.
total_pred=[]
for i in range(10):
    true_y,pred_y=pred_one(model, valid_loader, device=CONFIG['device'])
    total_pred.append(pred_y)

100%|██████████| 16/16 [00:11<00:00,  1.41it/s]
100%|██████████| 16/16 [00:09<00:00,  1.67it/s]
100%|██████████| 16/16 [00:10<00:00,  1.59it/s]
100%|██████████| 16/16 [00:10<00:00,  1.58it/s]
100%|██████████| 16/16 [00:10<00:00,  1.53it/s]
100%|██████████| 16/16 [00:10<00:00,  1.50it/s]
100%|██████████| 16/16 [00:10<00:00,  1.57it/s]
100%|██████████| 16/16 [00:10<00:00,  1.53it/s]
100%|██████████| 16/16 [00:10<00:00,  1.56it/s]
100%|██████████| 16/16 [00:09<00:00,  1.66it/s]


In [36]:
# Macro F1 of each individual pass (probabilities rounded to 0/1)
for pass_pred in total_pred:
    print(f1_score(np.array(true_y), np.round(pass_pred), average='macro'))

0.932707681612791
0.9347740087205638
0.9260955871125363
0.932707681612791
0.9368421052631579
0.9347740087205638
0.9347740087205638
0.9263577517812817
0.9431902656598459
0.9155897382235476


In [37]:
# Macro F1 after averaging the scan probabilities of all passes
ensemble_pred = np.mean(total_pred, axis=0)
print(f1_score(np.array(true_y), np.round(ensemble_pred), average='macro'))

0.9368421052631579
