## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
torch.cuda.is_available()

False

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [4]:
CFG = {
    'NUM_WORKERS':4,
    'ANTIGEN_WINDOW':128,
    'ANTIGEN_MAX_LEN':128, # ANTIGEN_WINDOW와 ANTIGEN_MAX_LEN은 같아야합니다.
    'EPITOPE_MAX_LEN':256,
    'EPOCHS':20,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':2048,
    'THRESHOLD':0.5,
    'SEED':41
}

## Fixed RandomSeed

In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-processing

In [64]:
def get_preprocessing(data_type, new_df):
    alpha_map = {
                'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5,
                'G':6, 'H':7, 'I':8, 'J':9, 'K':10, 'L':11,
                'M':12, 'N':13, 'O':14, 'P':15, 'Q':16, 'R':17,
                'S':18, 'T':19, 'U':20, 'V':21, 'W':22, 'X':23,
                'Y':24, 'Z':25, '<PAD>':26,
            }

    epitope_list = []
    left_antigen_list = []
    right_antigen_list = []
    
    for epitope, antigen, s_p, e_p in tqdm(zip(new_df['epitope_seq'], new_df['antigen_seq'], new_df['start_position'], new_df['end_position'])):
        epitope_pad = [26 for _ in range(CFG['EPITOPE_MAX_LEN'])] # 길이 맞춰주기 위한 패딩. 모두 같은 feature를 가져야 함
        left_antigen_pad = [26 for _ in range(CFG['ANTIGEN_MAX_LEN'])]
        right_antigen_pad = [26 for _ in range(CFG['ANTIGEN_MAX_LEN'])]
        
        epitope = [alpha_map[x] for x in epitope]
        
        # Left antigen : [start_position-WINDOW : start_position]
        # Right antigen : [end_position : end_position+WINDOW]

        start_position = s_p-CFG['ANTIGEN_WINDOW']-1
        end_position = e_p+CFG['ANTIGEN_WINDOW']
        if start_position < 0:
            start_position = 0
        if end_position > len(antigen):
            end_position = len(antigen)
        
        # left / right antigen sequence 추출
        left_antigen = antigen[int(start_position) : int(s_p)-1]
        left_antigen = [alpha_map[x] for x in left_antigen]
        
        right_antigen = antigen[int(e_p) : int(end_position)]
        right_antigen = [alpha_map[x] for x in right_antigen]

        if CFG['EPITOPE_MAX_LEN']<len(epitope):
            epitope_pad[:len(epitope)] = epitope[:CFG['EPITOPE_MAX_LEN']]
        else:
            epitope_pad[:len(epitope)] = epitope[:]

        left_antigen_pad[:len(left_antigen)] = left_antigen[:]
        right_antigen_pad[:len(right_antigen)] = right_antigen[:]
        
        epitope_list.append(epitope_pad)
        left_antigen_list.append(left_antigen_pad)
        right_antigen_list.append(right_antigen_pad)
    
    label_list = None
    if data_type != 'test':
        label_list = []
        for label in new_df['label']:
            label_list.append(label)
    print(f'{data_type} dataframe preprocessing was done.')
    return epitope_list, left_antigen_list, right_antigen_list, label_list

In [65]:
all_df = pd.read_csv('./data/open/train.csv')
# Split Train : Validation = 0.8 : 0.2
train_len = int(len(all_df)*0.8)
train_df = all_df.iloc[:train_len]
val_df = all_df.iloc[train_len:]

all_df

Unnamed: 0,id,epitope_seq,antigen_seq,antigen_code,start_position,end_position,number_of_tested,number_of_responses,assay_method_technique,assay_group,disease_type,disease_state,reference_date,reference_journal,reference_title,reference_IRI,qualitative_label,label
0,200001,KGILSN,AFKGILSNADIKAAEAACFKEGSFDEDGFYAKVGLDAFSADELKKL...,P02622.1,3.0,8.0,,,antigen inhibition,qualitative binding,Occurrence of allergy,allergic disease,1976,Int Arch Allergy Appl Immunol,The allergenic structure of allergen M from co...,http://www.iedb.org/reference/1005599,Positive,1
1,200002,SNADIK,AFKGILSNADIKAAEAACFKEGSFDEDGFYAKVGLDAFSADELKKL...,P02622.1,7.0,12.0,,,antigen inhibition,qualitative binding,Occurrence of allergy,allergic disease,1976,Int Arch Allergy Appl Immunol,The allergenic structure of allergen M from co...,http://www.iedb.org/reference/1005599,Positive,1
2,200003,EGSFDEDGFYAKVGLDAFSADELK,AFKGILSNADIKAAEAACFKEGSFDEDGFYAKVGLDAFSADELKKL...,P02622.1,21.0,44.0,,,antigen inhibition,qualitative binding,Occurrence of allergy,allergic disease,1976,Int Arch Allergy Appl Immunol,The allergenic structure of allergen M from co...,http://www.iedb.org/reference/1005599,Positive,1
3,200004,SFDEDGFY,AFKGILSNADIKAAEAACFKEGSFDEDGFYAKVGLDAFSADELKKL...,P02622.1,23.0,30.0,,,antigen inhibition,qualitative binding,Occurrence of allergy,allergic disease,1976,Int Arch Allergy Appl Immunol,The allergenic structure of allergen M from co...,http://www.iedb.org/reference/1005599,Positive,1
4,200005,DEDGFY,AFKGILSNADIKAAEAACFKEGSFDEDGFYAKVGLDAFSADELKKL...,P02622.1,25.0,30.0,,,antigen inhibition,qualitative binding,Occurrence of allergy,allergic disease,1976,Int Arch Allergy Appl Immunol,The allergenic structure of allergen M from co...,http://www.iedb.org/reference/1005599,Positive,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190806,390807,QQFKRELRNLPQQCGLRAPQRCDLEVESGGRDRY,MAKLTILVALALFLLAAHASARQQWELQGDRRCQSQLERANLRPCE...,Q6PSU2.2,139.0,172.0,21.0,0.0,ELISA,qualitative binding,Occurrence of allergy,peanut allergy,2022,J Allergy Clin Immunol,Immunodominant conformational and linear IgE e...,http://www.iedb.org/reference/1040321,Negative,0
190807,390808,RCMCEALQQIMENQSDRLQGRQQE,MAKLTILVALALFLLAAHASARQQWELQGDRRCQSQLERANLRPCE...,Q6PSU2.2,115.0,138.0,,,antigen inhibition,qualitative binding,Occurrence of allergy,peanut allergy,2022,J Allergy Clin Immunol,Immunodominant conformational and linear IgE e...,http://www.iedb.org/reference/1040321,Positive,1
190808,390809,QRDEDSYGRDPYSPSQDPYSPSPYDRRGAGSSQHQERCCNELNEFENNQ,MAKLTILVALALFLLAAHASARQQWELQGDRRCQSQLERANLRPCE...,QHN95793.1,54.0,102.0,,,antigen inhibition,qualitative binding,Occurrence of allergy,peanut allergy,2022,J Allergy Clin Immunol,Immunodominant conformational and linear IgE e...,http://www.iedb.org/reference/1040321,Positive,1
190809,390810,RQQWELQGDRRCQSQLERANLRPCEQHLMQKI,MAKLTILVALALFLLAAHASARQQWELQGDRRCQSQLERANLRPCE...,Q6PSU2.2,22.0,53.0,,,antigen inhibition,qualitative binding,Occurrence of allergy,peanut allergy,2022,J Allergy Clin Immunol,Immunodominant conformational and linear IgE e...,http://www.iedb.org/reference/1040321,Positive,1


In [66]:
train_epitope_list, train_left_antigen_list, train_right_antigen_list, train_label_list = get_preprocessing('train', train_df)
val_epitope_list, val_left_antigen_list, val_right_antigen_list, val_label_list = get_preprocessing('val', val_df)

152648it [00:05, 29204.30it/s]


train dataframe preprocessing was done.


38163it [00:02, 17658.28it/s]

val dataframe preprocessing was done.





## CustomDataset

In [67]:
class CustomDataset(Dataset):
    def __init__(self, epitope_list, left_antigen_list, right_antigen_list, label_list):
        self.epitope_list = epitope_list
        self.left_antigen_list = left_antigen_list
        self.right_antigen_list = right_antigen_list
        self.label_list = label_list
        
    def __getitem__(self, index):
        self.epitope = self.epitope_list[index]
        self.left_antigen = self.left_antigen_list[index]
        self.right_antigen = self.right_antigen_list[index]
        
        if self.label_list is not None:
            self.label = self.label_list[index]
            return torch.tensor(self.epitope), torch.tensor(self.left_antigen), torch.tensor(self.right_antigen), self.label
        else:
            return torch.tensor(self.epitope), torch.tensor(self.left_antigen), torch.tensor(self.right_antigen)
        
    def __len__(self):
        return len(self.epitope_list)

In [68]:
train_dataset = CustomDataset(train_epitope_list, train_left_antigen_list, train_right_antigen_list, train_label_list)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True)

val_dataset = CustomDataset(val_epitope_list, val_left_antigen_list, val_right_antigen_list, val_label_list)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [69]:
emb_node = 15
hidden_dim = 64

## Model Define

In [70]:
class BaseModel(nn.Module):
    def __init__(self,
                 epitope_length=CFG['EPITOPE_MAX_LEN'],
                 epitope_emb_node=emb_node,
                 epitope_hidden_dim=hidden_dim,
                 left_antigen_length=CFG['ANTIGEN_MAX_LEN'],
                 left_antigen_emb_node=emb_node,
                 left_antigen_hidden_dim=hidden_dim,
                 right_antigen_length=CFG['ANTIGEN_MAX_LEN'],
                 right_antigen_emb_node=emb_node,
                 right_antigen_hidden_dim=hidden_dim,
                 lstm_bidirect=True
                ):
        super(BaseModel, self).__init__()
        # Embedding Layer
        self.epitope_embed = nn.Embedding(num_embeddings=27, # 0 ~ 26 까지 숫자로 맵핑했으므로
                                          embedding_dim=epitope_emb_node, 
                                          padding_idx=26
                                         )
        self.left_antigen_embed = nn.Embedding(num_embeddings=27,
                                          embedding_dim=left_antigen_emb_node, 
                                          padding_idx=26
                                         )
        self.right_antigen_embed = nn.Embedding(num_embeddings=27,
                                          embedding_dim=right_antigen_emb_node, 
                                          padding_idx=26
                                         )
        # LSTM
        self.epitope_lstm = nn.LSTM(input_size=epitope_emb_node, 
                                    hidden_size=epitope_hidden_dim, 
                                    batch_first=True, 
                                    bidirectional=lstm_bidirect
                                   )
        self.left_antigen_lstm = nn.LSTM(input_size=left_antigen_emb_node, 
                                    hidden_size=left_antigen_hidden_dim, 
                                    batch_first=True, 
                                    bidirectional=lstm_bidirect
                                   )
        self.right_antigen_lstm = nn.LSTM(input_size=right_antigen_emb_node, 
                                    hidden_size=right_antigen_hidden_dim, 
                                    batch_first=True, 
                                    bidirectional=lstm_bidirect
                                   )

        # Classifier
        if lstm_bidirect:
            in_channels = 2*(epitope_hidden_dim+left_antigen_hidden_dim+right_antigen_hidden_dim)
        else:
            in_channels = epitope_hidden_dim+left_antigen_hidden_dim+right_antigen_hidden_dim
        
        # dropout = torch.nn.Dropout(p=0.2)
        # self.classifier = nn.Sequential(
        #     nn.LeakyReLU(True),
        #     nn.BatchNorm1d(in_channels),
        #     dropout,
        #     nn.Linear(in_channels, in_channels//4),
        #     nn.LeakyReLU(True),
        #     nn.BatchNorm1d(in_channels//4),
        #     dropout,
        #     nn.Linear(in_channels//4, 1)
        # )
        
    def forward(self, epitope_x, left_antigen_x, right_antigen_x):
        BATCH_SIZE = epitope_x.size(0)
        # Get Embedding Vector
        epitope_x = self.epitope_embed(epitope_x)
        left_antigen_x = self.left_antigen_embed(left_antigen_x)
        right_antigen_x = self.right_antigen_embed(right_antigen_x)
        
        # LSTM
        epitope_hidden, _ = self.epitope_lstm(epitope_x)
        epitope_hidden = epitope_hidden[:,-1,:] # output dimension은 (batch, time_step, hidden dimension) 순이다. 양방향일 경우 hidden_size*2

        left_antigen_hidden, _ = self.left_antigen_lstm(left_antigen_x)
        left_antigen_hidden = left_antigen_hidden[:,-1,:]
        
        right_antigen_hidden, _ = self.right_antigen_lstm(right_antigen_x)
        right_antigen_hidden = right_antigen_hidden[:,-1,:]
        
        # Feature Concat -> Binary Classifier
        x = torch.cat([epitope_hidden, left_antigen_hidden, right_antigen_hidden], axis=-1)
        # x = self.classifier(x).view(-1)
        return x

In [71]:
def classifier(output, is_train):
    in_channels = 2 * (hidden_dim * 3)
    dropout = torch.nn.Dropout(p=0.2)
    train_classifier = nn.Sequential(
        nn.LeakyReLU(True),
        nn.BatchNorm1d(in_channels),
        dropout,
        nn.Linear(in_channels, in_channels//4),
        nn.LeakyReLU(True),
        nn.BatchNorm1d(in_channels//4),
        dropout,
        nn.Linear(in_channels//4, 1)
    )

    val_classifier = nn.Sequential(
        nn.LeakyReLU(True),
        nn.BatchNorm1d(in_channels),
        nn.Linear(in_channels, in_channels//4),
        nn.LeakyReLU(True),
        nn.BatchNorm1d(in_channels//4),
        nn.Linear(in_channels//4, 1)
    )

    if (is_train):
        output = train_classifier(output).view(-1)
    else :
        output = val_classifier(output).view(-1)
    return output

## Train

In [72]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device) 
    
    best_val_f1 = 0
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for epitope_seq, left_antigen_seq, right_antigen_seq, label in tqdm(iter(train_loader)):
            epitope_seq = epitope_seq.to(device)
            left_antigen_seq = left_antigen_seq.to(device)
            right_antigen_seq = right_antigen_seq.to(device)
            label = label.float().to(device)
            
            optimizer.zero_grad()
            
            output = model(epitope_seq, left_antigen_seq, right_antigen_seq)
            output = classifier(output, is_train=True)
            loss = criterion(output, label)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
            
            if scheduler is not None:
                scheduler.step()
                    
        val_loss, val_f1 = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] Val F1 : [{val_f1:.5f}]')
        
        if best_val_f1 < val_f1:
            best_val_f1 = val_f1
            torch.save(model.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
            print('Model Saved.')
    return best_val_f1

In [73]:
def validation(model, val_loader, criterion, device):
    model.eval()
    pred_proba_label = []
    true_label = []
    val_loss = []
    with torch.no_grad():
        for epitope_seq, left_antigen_seq, right_antigen_seq, label in tqdm(iter(val_loader)):
            epitope_seq = epitope_seq.to(device)
            left_antigen_seq = left_antigen_seq.to(device)
            right_antigen_seq = right_antigen_seq.to(device)
            label = label.float().to(device)
            
            model_pred = model(epitope_seq, left_antigen_seq, right_antigen_seq)
            model_pred = classifier(model_pred, is_train=False)
            loss = criterion(model_pred, label)
            model_pred = torch.sigmoid(model_pred).to('cpu')
            
            pred_proba_label += model_pred.tolist()
            true_label += label.to('cpu').tolist()
            
            val_loss.append(loss.item())
            
    pred_label = np.where(np.array(pred_proba_label)>CFG['THRESHOLD'], 1, 0)
    val_f1 = f1_score(true_label, pred_label, average='macro')
    return np.mean(val_loss), val_f1

## Run!!

In [74]:
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_loader)*CFG['EPOCHS'], eta_min=0)

best_score = train(model, optimizer, train_loader, val_loader, scheduler, device)
print(f'Best Validation F1 Score : [{best_score:.5f}]')

100%|██████████| 75/75 [12:48<00:00, 10.25s/it]
100%|██████████| 19/19 [01:46<00:00,  5.58s/it]


Epoch : [1] Train Loss : [0.74082] Val Loss : [0.72378] Val F1 : [0.44084]
Model Saved.


 28%|██▊       | 21/75 [04:16<10:58, 12.19s/it]


KeyboardInterrupt: 

## Inference

In [52]:
test_df = pd.read_csv('./data/open/test.csv')
test_epitope_list, test_left_antigen_list, test_right_antigen_list, test_label_list = get_preprocessing('test', test_df)

120944it [00:04, 26991.22it/s]

test dataframe preprocessing was done.





In [53]:
test_dataset = CustomDataset(test_epitope_list, test_left_antigen_list, test_right_antigen_list, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [54]:
model = BaseModel()
best_checkpoint = torch.load('./best_model.pth')
model.load_state_dict(best_checkpoint)
model.eval()
model.to(device)

BaseModel(
  (epitope_embed): Embedding(27, 15, padding_idx=26)
  (left_antigen_embed): Embedding(27, 15, padding_idx=26)
  (right_antigen_embed): Embedding(27, 15, padding_idx=26)
  (epitope_lstm): LSTM(15, 64, batch_first=True, bidirectional=True)
  (left_antigen_lstm): LSTM(15, 64, batch_first=True, bidirectional=True)
  (right_antigen_lstm): LSTM(15, 64, batch_first=True, bidirectional=True)
  (classifier): Sequential(
    (0): LeakyReLU(negative_slope=True)
    (1): BatchNorm1d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Linear(in_features=384, out_features=96, bias=True)
    (3): LeakyReLU(negative_slope=True)
    (4): BatchNorm1d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): Linear(in_features=96, out_features=1, bias=True)
  )
)

In [55]:
def inference(model, test_loader, device):
    model.eval()
    pred_proba_label = []
    with torch.no_grad():
        for epitope_seq, left_antigen_seq, right_antigen_seq in tqdm(iter(test_loader)):
            epitope_seq = epitope_seq.to(device)
            left_antigen_seq = left_antigen_seq.to(device)
            right_antigen_seq = right_antigen_seq.to(device)
            
            model_pred = model(epitope_seq, left_antigen_seq, right_antigen_seq)
            model_pred = torch.sigmoid(model_pred).to('cpu')
            
            pred_proba_label += model_pred.tolist()
    
    pred_label = np.where(np.array(pred_proba_label)>CFG['THRESHOLD'], 1, 0)
    return pred_label

In [56]:
preds = inference(model, test_loader, device)

100%|██████████| 60/60 [05:15<00:00,  5.26s/it]


## Submission

In [57]:
submit = pd.read_csv('./data/open/sample_submission.csv')
submit['label'] = preds

In [58]:
submit.to_csv('./result/submit.csv', index=False)
print('Done.')

Done.
