In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


from sklearn.metrics import f1_score,roc_auc_score


import timm
from timm.models.efficientnet import *

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict


import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
import glob

In [2]:
# Job number; interpolated into the checkpoint file name built in `weights_path`.
job=51

In [3]:
# Sub-directory of the model folder that holds the checkpoint (see `weights_path`).
# NOTE(review): presumably the metric used to select the checkpoint - confirm.
tc="f1"

In [4]:
# Every file three directory levels below the cropped training root.
# glob.glob already returns a list, in arbitrary order; sorting makes the
# file order (and therefore the DataLoader order) deterministic between runs.
train_ct_all_list = sorted(glob.glob(r"/home/fate/covid19_CT/input/train_pure_crop/*/*/*"))

In [5]:
# Sanity check: number of training files matched by the glob.
len(train_ct_all_list)

433432

In [6]:
# Every file three directory levels below the cropped validation root.
# glob.glob already returns a list, in arbitrary order; sorting makes the
# file order deterministic between runs.
valid_ct_all_list = sorted(glob.glob(r"/home/fate/covid19_CT/input/valid_pure_crop/*/*/*"))

In [7]:
# Training paths followed by validation paths, in one list.
all_ct_all_list=train_ct_all_list+valid_ct_all_list

In [8]:
# One row per image file; the single `path` column is what Covid19Dataset reads.
df = pd.DataFrame(all_ct_all_list, columns = ['path'])

In [9]:
# Checkpoint location, derived from `tc` (sub-directory) and `job` (file name).
weights_path=f"/home/fate/covid19_CT/model/{tc}/job_{job}_effnetb3a.bin"

In [10]:
# Display the resolved checkpoint path.
weights_path

'/home/fate/covid19_CT/model/f1/job_51_effnetb3a.bin'

In [11]:
# Notebook-wide settings. The trailing notes reproduce the alternative values
# the original author left as inline comments next to each entry.
CONFIG = dict(
    seed=2022,
    epochs=100,             # original inline note: 24
    img_size=256,           # original inline note: 512
    train_batch_size=8,     # original inline note: 16
    valid_batch_size=32,
    learning_rate=0.0001,
    scheduler='onecycleLR',
    min_lr=1e-6,
    weight_decay=0.0005,    # original inline note: 1e-6
    n_accumulate=1,         # original inline note: 2
    # First CUDA device when one is available, otherwise the CPU.
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # Settings grouped under the 'onecycleLR' note in the original.
    pct_start=0.1,
    max_lr=0.000025,
    train_batch=4,
)

In [12]:
def set_seed(seed=42):
    '''Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus every CUDA device), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        cuDNN is deliberately left in auto-tuning mode (``benchmark=True``,
        ``deterministic=False``), which favours speed. GPU results can
        therefore still differ slightly between runs even with a fixed seed.
    '''
    # The `random` module is imported by the notebook and must be seeded too,
    # otherwise anything drawing from it stays non-reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers all GPUs, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # Speed over bit-exact determinism: let cuDNN pick the fastest kernels.
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True
    # Only affects Python processes started after this point (e.g. workers);
    # the running interpreter fixed its hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Use the seed declared in CONFIG instead of silently falling back to the default.
set_seed(CONFIG["seed"])

In [13]:
# Inspect the entries of the raw training folder (the class sub-directories).
os.listdir("/home/fate/covid19_CT/input/train")

['covid', 'non_covid']

In [14]:
class Covid19Dataset(Dataset):
    """Dataset that loads images from the file paths listed in a DataFrame.

    Args:
        df: DataFrame with a ``path`` column holding one image path per row.
        transforms: Optional callable invoked as ``transforms(image=img)`` and
            expected to return a mapping with an ``'image'`` entry. When
            ``None``, the RGB array is returned untouched.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.path = df['path'].values
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows (images) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Load one image and return it together with its path.

        Returns:
            dict with ``'image'`` (the transformed image, or the raw RGB array
            when no transform was given) and ``'id'`` (the image path).

        Raises:
            FileNotFoundError: if the file is missing or cannot be decoded.
        """
        img_path_ = self.path[index]

        # cv2.imread signals failure by returning None rather than raising,
        # so fail here with the offending path instead of deep inside the
        # colour conversion or the transform pipeline.
        img = cv2.imread(img_path_)
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path_}")

        # OpenCV decodes to BGR; convert to RGB channel order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transforms is not None:
            img = self.transforms(image=img)['image']

        return {
            'image': img,
            'id': img_path_,
        }
            
        

        
        
        
        
        


In [15]:
# Inference-time preprocessing: resize, normalise with the library defaults,
# then convert to a tensor. The target size comes from CONFIG so the
# notebook has a single source of truth for the image resolution.
data_transforms = {
    "valid": A.Compose([
        A.Resize(CONFIG["img_size"], CONFIG["img_size"]),
        A.Normalize(),
        ToTensorV2()], p=1.)
}

In [16]:
def prepare_loaders(batch_size=128, num_workers=16):
    """Build the DataLoader used for embedding extraction.

    Wraps the notebook-level ``df`` (every collected image path) in a
    ``Covid19Dataset`` with the ``"valid"`` transform pipeline.

    Args:
        batch_size: Number of images per batch.
        num_workers: Number of DataLoader worker processes.

    Returns:
        A non-shuffling DataLoader over all rows of ``df``; batches come out
        in row order.
    """
    dataset = Covid19Dataset(df, transforms=data_transforms["valid"])

    loader = DataLoader(dataset, batch_size=batch_size,
                        num_workers=num_workers, shuffle=False, pin_memory=True)

    return loader

In [17]:
class Net(nn.Module):
    """EfficientNet-B3a backbone followed by a 224-d embedding layer.

    ``forward`` returns the 224-d embedding. The ``logit`` layer is not used
    by ``forward`` but is kept as a module so that checkpoints containing its
    weights still load with ``load_state_dict``.

    Args:
        pretrained: Forwarded to ``efficientnet_b3a``. Pass ``False`` to skip
            loading the backbone's pretrained weights, e.g. when a full
            checkpoint is loaded straight afterwards.
    """

    def __init__(self, pretrained=True):
        super(Net, self).__init__()

        e = efficientnet_b3a(pretrained=pretrained, drop_rate=0.3, drop_path_rate=0.2)
        # Stem: first convolution, batch norm, activation.
        self.b0 = nn.Sequential(
            e.conv_stem,
            e.bn1,
            e.act1,
        )
        # The seven backbone stages, kept as separate attributes.
        self.b1 = e.blocks[0]
        self.b2 = e.blocks[1]
        self.b3 = e.blocks[2]
        self.b4 = e.blocks[3]
        self.b5 = e.blocks[4]
        self.b6 = e.blocks[5]
        self.b7 = e.blocks[6]
        # Head: final convolution, batch norm, activation.
        self.b8 = nn.Sequential(
            e.conv_head,
            e.bn2,
            e.act2,
        )

        self.emb = nn.Linear(1536, 224)
        # Unused in forward(); retained for checkpoint compatibility.
        self.logit = nn.Linear(224, 1)

    def forward(self, image):
        """Return the embedding of shape (batch, 224) for a batch of images."""
        # NOTE(review): rescales as if the input were in [0, 1], although the
        # notebook's loader already applies A.Normalize - presumably this
        # matches how the checkpoint was trained; confirm before changing.
        x = 2 * image - 1

        for stage in (self.b0, self.b1, self.b2, self.b3, self.b4,
                      self.b5, self.b6, self.b7, self.b8):
            x = stage(x)

        # Global average pool, then flatten to (batch, channels).
        x = F.adaptive_avg_pool2d(x, 1).flatten(1)

        # The embedding is the output; the classification logit is not
        # computed because callers of this notebook never consume it.
        return self.emb(x)



In [18]:
# Build the network and restore the checkpoint selected by `weights_path`.
model = Net()

# map_location lets a checkpoint saved on a GPU be restored on whichever
# device CONFIG selected, including the CPU fallback.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(weights_path, map_location=CONFIG["device"]))

# Place the model on the same device the inference loop sends images to,
# instead of unconditionally requiring CUDA.
model = model.to(CONFIG["device"])


In [19]:
@torch.inference_mode()
def get_embeddings(model, dataloader, device):
    """Run the model over a dataloader and collect its outputs per batch.

    Args:
        model: Module called as ``model(images)``; switched to eval mode here.
        dataloader: Iterable of dict batches with ``'image'`` and ``'id'``.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        ``(EMBEDS, IDS)``: two lists with one entry per batch, in dataloader
        order - the model output as a NumPy array and the batch's ``'id'``
        values respectively.
    """
    model.eval()

    EMBEDS = []
    IDS = []

    for data in tqdm(dataloader, total=len(dataloader)):
        images = data['image'].to(device, dtype=torch.float)
        outputs = model(images)

        # Move to host memory so results do not accumulate on the device.
        EMBEDS.append(outputs.cpu().numpy())
        IDS.append(data['id'])

    return EMBEDS, IDS

In [20]:
# Loader over every collected image (train and valid paths), despite the name.
train_loader = prepare_loaders()

In [21]:
# Per-batch lists: `embed` holds the model outputs, `name` the matching image paths.
embed,name=get_embeddings(model,train_loader,CONFIG['device'])

100%|██████████| 4210/4210 [07:59<00:00,  8.78it/s]


In [22]:
# Stack the per-batch output arrays row-wise into a single array.
embed = np.vstack(embed)

In [23]:
# Join the per-batch path lists into one array, in the same order as `embed`.
name = np.concatenate(name)

In [24]:
# Map each image path to its embedding row (a repeated path would keep only its last row).
dict_all=dict(zip(name, embed))

In [25]:
# One row per image path; the `embed` column stores that path's embedding array.
df_224=pd.DataFrame(list(dict_all.items()),
                   columns=['path', 'embed'])

In [26]:
# Paths were globbed as <split_dir>/<label>/<ct>/<slice>, so the split
# directory is the 4th component from the end. Indexing from the end keeps
# this correct if the data root is moved to a different directory depth.
df_224["split"] = df_224["path"].str.split("/").str[-4]

In [27]:
# Keep only the text before the first underscore (e.g. "train_pure_crop" -> "train").
df_224["split"]=df_224["split"].str.split("_").str[0]

In [28]:
# Paths were globbed as <split_dir>/<label>/<ct>/<slice>, so the class folder
# is the 3rd component from the end. Indexing from the end keeps this correct
# if the data root is moved to a different directory depth.
df_224["label"] = df_224["path"].str.split("/").str[-3]
# Encode the class folder name as an integer target; any other folder name
# maps to NaN.
label_map = {"covid": 1, "non_covid": 0}
df_224["label"] = df_224["label"].map(label_map)

In [29]:
# Split every path once on "/" and derive both columns from the parts.
_path_parts = df_224["path"].str.split("/")
# Directory of the scan: every component except the file name, re-joined.
df_224["ct_path"] = _path_parts.str[:-1].str.join("/")
# Slice number: the file name up to its first ".", parsed as an integer.
df_224["ct_slice"] = _path_parts.str[-1].str.split(".").str[0].map(int)

In [30]:
# Number of slices per scan: count of rows sharing the same `ct_path`, repeated on each row.
df_224["ct_len"]=df_224.groupby(["ct_path"])["ct_slice"].transform('count')

In [31]:
# Order rows scan by scan, and by slice number within each scan.
df_224 = df_224.sort_values(by=['ct_path', 'ct_slice'])

In [32]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
df_224=df_224.reset_index(drop=True)

In [33]:
# Persist the table without the index column.
# NOTE(review): `embed` holds NumPy arrays, which a CSV can only store as their
# text representation - confirm the downstream reader parses that format.
df_224.to_csv("/home/fate/covid19_CT/output/df_224_v2.csv",index=False)