In [1]:
import gc
import os
import random
#import wandb
import math
import copy

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import AdamW
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import f1_score,roc_auc_score

In [2]:
def set_seed(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
set_seed(seed=42)

In [4]:

df=pd.read_csv("/home/fate/covid19_CT/output/df_224_v2.csv")

In [5]:
df.head()

Unnamed: 0,path,embed,split,label,ct_path,ct_slice,ct_len
0,/home/fate/covid19_CT/input/train_pure_crop/co...,[ 1.14589073e-02 -9.61555656e-07 6.51998341e-...,train,1,/home/fate/covid19_CT/input/train_pure_crop/co...,0,247
1,/home/fate/covid19_CT/input/train_pure_crop/co...,[ 1.2717491e-02 -9.9315776e-07 7.3297793e-01 ...,train,1,/home/fate/covid19_CT/input/train_pure_crop/co...,1,247
2,/home/fate/covid19_CT/input/train_pure_crop/co...,[ 1.23448148e-02 -9.46207706e-07 7.25499094e-...,train,1,/home/fate/covid19_CT/input/train_pure_crop/co...,2,247
3,/home/fate/covid19_CT/input/train_pure_crop/co...,[ 1.05626313e-02 -7.19927925e-07 6.25274718e-...,train,1,/home/fate/covid19_CT/input/train_pure_crop/co...,3,247
4,/home/fate/covid19_CT/input/train_pure_crop/co...,[ 1.07890451e-02 -7.35935487e-07 6.57803416e-...,train,1,/home/fate/covid19_CT/input/train_pure_crop/co...,4,247


In [7]:
def array_convert(string):

    A = string.replace('[', '').replace(']', '').replace('\n', '').split(' ')
    new_array = np.array([float(x) for x in A if len(x) > 0])

    return new_array

In [8]:
df['embed'] = df['embed'].apply(array_convert)

In [10]:
ct_name=df["ct_path"].unique()

In [11]:
label_dic={}
for name in ct_name:
    tmp_label=df[df["ct_path"]==name].label.values[0]
    label_dic[name]=tmp_label

In [12]:
len(label_dic)

2474

In [13]:
class config:

    SEED = 0
    
    LR = 3e-5 
    N_EPOCHS = 50
    ct_len=120

    HIDDEN_SIZE = 128
    BS = 32
    WEIGHT_DECAY = 1e-3

In [14]:
class Covid19Dataset(torch.utils.data.Dataset):
    
    def __init__(self, df,label_dic):
        self.df=df
        self.dfs = [_df for _, _df in self.df.groupby("ct_path")]
        self.label_dic = label_dic

    def __getitem__(self, index):


        tmp_df = self.dfs[index]
        ct_name=tmp_df.ct_path.values[0]
        label=label_dic[ct_name]

        random.seed(4019)
        if len(tmp_df) >= config.ct_len:
            
            temp_index = [x for x in range(len(tmp_df))]
            target_index = random.sample(temp_index, k = config.ct_len)

        elif len(tmp_df) < config.ct_len:
            target_index = [x for x in range(len(tmp_df))]
            temp = random.choices(target_index, k = config.ct_len - len(target_index))
            target_index += temp
            
        target_index_sort=sorted(target_index)
        
        embed = tmp_df.iloc[target_index_sort, 1].values
        img = np.array([])
        img_list=[]

        for i_embed in embed:
            img_list.append(i_embed.reshape(1,-1))
            
        img=np.concatenate(img_list,axis=0)
        if img.shape!=(config.ct_len,224):
            print(img.shape)

        
        d = {
            "X": torch.tensor(img).float(),
            "y" : torch.tensor(label).long(),

        }
        
        return d

    def __len__(self):
        return len(self.dfs)

In [15]:
def loss_fn(outputs, labels):
    return nn.BCEWithLogitsLoss()(outputs, labels)

In [16]:
class Covid19Model(nn.Module):
    
    def __init__(self):
        super(Covid19Model, self).__init__()

        
        self.lstm = nn.LSTM(224, config.HIDDEN_SIZE, batch_first=True, bidirectional=True, dropout=0.0, num_layers=4)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.15)
        self.head = nn.Sequential(
            nn.Linear(config.HIDDEN_SIZE * 2, config.HIDDEN_SIZE//2 ),
            nn.LayerNorm(config.HIDDEN_SIZE//2 ),
            nn.ReLU(),
        
        )
        self.head2=nn.Linear(config.HIDDEN_SIZE//2 , 1)


    def forward(self, X, y):


        _,(hidden_state,_) = self.lstm(X, None) 
        hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
        out = self.dropout1(hidden)
        logits = self.head(out)
        logits=self.dropout2(logits)
        logits=self.head2(logits)


            
        return logits
    

In [17]:
def train_loop(model, optimizer, scheduler, loader):
    losses, lrs = [], []
    model.train()
    optimizer.zero_grad()
    for d in loader:
        out = model(d['X'].to(device),d['y'].to(device))

        loss = loss_fn(out.view(-1),d['y'].to(device,dtype=torch.float))
        
        losses.append(loss.item())
        step_lr = np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean()
        lrs.append(step_lr)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    return np.array(losses).mean(), np.array(lrs).mean()


def valid_loop(model, loader):
    losses, predicts = [], []
    true_y=[]
    pred_y=[]
    model.eval()
    for d in loader:
        with torch.no_grad():
            images = d['X'].to(device, dtype=torch.float)
            labels = d['y'].to(device, dtype=torch.float)
            out = model(images, labels)
            loss = loss_fn(out.view(-1),labels)
        losses.append(loss.item())
        predicts.append(out.cpu())
        true_y.append(labels.cpu().numpy())
        pred_y.append(torch.sigmoid(out).cpu().numpy())
        
    true_y=np.concatenate(true_y)
    pred_y=np.concatenate(pred_y)
    
    gc.collect()
    
    true_y=np.array(true_y).reshape(-1,1)
    pred_y=np.array(pred_y).reshape(-1,1)
   
    acc_f1=f1_score(np.array(true_y),np.round(pred_y),average='macro')

    auc_roc=roc_auc_score(np.array(true_y),np.array(pred_y))
    print("acc_f1 : ",round(acc_f1,4),"  auc_roc : ",round(auc_roc,4))
    
    return np.array(losses).mean(),acc_f1,auc_roc



In [18]:
train_df=df[df["split"]=="train"].reset_index(drop=True)
valid_df=df[df["split"]=="valid"].reset_index(drop=True)


train_dset = Covid19Dataset(train_df, label_dic)
valid_dset = Covid19Dataset(valid_df, label_dic)

set_seed()
train_loader = DataLoader(train_dset, batch_size=config.BS,
                          pin_memory=True, shuffle=True, drop_last=True, num_workers=8,
                          worker_init_fn=lambda x: set_seed())
valid_loader = DataLoader(valid_dset, batch_size=config.BS,
                          pin_memory=True, shuffle=False, drop_last=False, num_workers=8)

In [19]:

device = torch.device("cuda")

In [20]:
model = Covid19Model()
model.to(device)

Covid19Model(
  (lstm): LSTM(224, 128, num_layers=4, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.15, inplace=False)
  (head): Sequential(
    (0): Linear(in_features=256, out_features=64, bias=True)
    (1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (2): ReLU()
  )
  (head2): Linear(in_features=64, out_features=1, bias=True)
)

In [21]:
def main():
    



    optimizer = AdamW(model.parameters(), lr=config.LR, weight_decay=config.WEIGHT_DECAY)
    num_train_steps = int(len(train_loader) * config.N_EPOCHS)
    num_warmup_steps = int(num_train_steps / 10)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)



    model_path1 = "/home/fate/covid19_CT/model/lstm/f1_best_model.bin"

    best_epoch_f1 = 0
    for epoch in tqdm(range(config.N_EPOCHS)):
        train_loss, lrs = train_loop(model, optimizer, scheduler, train_loader)
        valid_loss,acc_f1,auc_roc = valid_loop(model, valid_loader)

        if acc_f1 > best_epoch_f1:
            print(f"Validation f1 Improved ({best_epoch_f1} ---> {acc_f1})")
            best_epoch_f1 = acc_f1
            #run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = model_path1
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved")
    torch.cuda.empty_cache()
    gc.collect()
    print(best_epoch_f1)

In [None]:
if __name__ == "__main__":
    main()

In [23]:
def valid_one(model, loader):
    losses, predicts = [], []
    true_y=[]
    pred_y=[]
    model.eval()
    for d in loader:
        with torch.no_grad():
            images = d['X'].to(device, dtype=torch.float)
            labels = d['y'].to(device, dtype=torch.float)
            out = model(images, labels)
            loss = loss_fn(out.view(-1),labels)
        losses.append(loss.item())
        predicts.append(out.cpu())
        true_y.append(labels.cpu().numpy())
        pred_y.append(torch.sigmoid(out).cpu().numpy())
        
    true_y=np.concatenate(true_y)
    pred_y=np.concatenate(pred_y)
    
    gc.collect()
    
    true_y=np.array(true_y).reshape(-1,1)
    pred_y=np.array(pred_y).reshape(-1,1)
   

    return true_y,pred_y

In [31]:
valid_loader = DataLoader(valid_dset, batch_size=config.BS,
                          pin_memory=True, shuffle=False, drop_last=False, num_workers=8)


In [32]:

pred_path="/home/fate/covid19_CT/model/lstm/f1_best_model.bin"
model.load_state_dict(torch.load(pred_path))
model.to(device);

In [33]:
true_y,pred_y=valid_one(model,valid_loader)

In [34]:
acc_f1=f1_score(np.array(true_y),np.round(pred_y),average='macro')
print(acc_f1)

0.9326315789473684


In [35]:
from sklearn.metrics import accuracy_score
print("pos:",accuracy_score(np.array(true_y[:210]),np.round(pred_y[:210])))
print("neg:",accuracy_score(np.array(true_y[210:]),np.round(pred_y[210:])))

pos: 0.9
neg: 0.9598540145985401
