In [1]:
from optimization import *
from configuration_bert import *
from tokenization_bert import *
from modeling_bert import *
from tokenization_dna import *
import pathlib
import pandas as pd
import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple
from copy import deepcopy
from multiprocessing import Pool

import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from preparation import *
from helper_functions import *
# Release any cached CUDA memory held by this process before loading the model.
torch.cuda.empty_cache()

# Worker count forwarded to load_and_cache_examples in the embedding cells.
n_process = 1

# Not referenced in the visible cells; presumably consumed by the star-imported
# helpers — TODO confirm before removing.
MASK_LIST = [-1, 1, 2]

modeldir = "Diem_pretrained_model"
datadir = "dataset"

outdir = "."

# Output layout: <outdir>/outputdir/{01_output,02_output}. Only the main and
# 02_output directories are created here; 01_output is just a path.
path_to_main_output = os.path.join(outdir, "outputdir")
# os.makedirs replaces the former `mkdir -p` shell call: same semantics, but it
# works on any platform, handles paths containing spaces, and raises on failure.
os.makedirs(path_to_main_output, exist_ok=True)

path_to_02_output = os.path.join(path_to_main_output, "02_output")
os.makedirs(path_to_02_output, exist_ok=True)
path_to_01_output = os.path.join(path_to_main_output, "01_output")

# Load the pretrained BERT checkpoint and move it to the GPU when one is available.
model = BertModel.from_pretrained(os.path.join(modeldir, "23082023_checkpoints", "checkpoint-9900"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


  from .autonotebook import tqdm as notebook_tqdm
INFO:loading configuration file /mnt/WORKDIR/hieunguyen/outdir/TCR_antigen_binding_transformers/pytorch/model_files/models/bert-large-cased-config.json
INFO:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_beams": 1,
  "num_hidden_layers": 24,
  "num_labels": 2,
  "num_return_sequences": 1,
  "num_rnn_layer": 1,
  "output_at

In [2]:
def _pair_to_model_string(cdr3b, epitope):
    """Format one (CDR3b, epitope) pair as the text line written for the model.

    Each sequence is split into single characters joined by spaces, and the two
    halves are separated by a ``[SEP]`` token, e.g. ``"C A S [SEP] G I L"``.
    """
    return "{} [SEP] {}".format(" ".join(cdr3b), " ".join(epitope))


# The label file is the last artefact written below, so its presence is used
# as the marker that the whole export has already been done.
if not os.path.exists(os.path.join(path_to_02_output, "universal_testset_new.stringOnly.label.txt")):
    df_full = pd.read_csv(os.path.join(datadir, "train_fullpeps.csv"))
    df_test = pd.read_csv(os.path.join(datadir, "universal_testset_new.csv"))

    # Columns are read by name: positional `row[0]` / `row[1]` on a
    # label-indexed Series is deprecated in recent pandas releases.
    df_full["string"] = [
        _pair_to_model_string(cdr3b, epitope)
        for cdr3b, epitope in zip(df_full["CDR3b"], df_full["epitope"])
    ]
    # Split the training set by the `binder` label (1 = binding, 0 = non-binding).
    df_binding = df_full[df_full["binder"] == 1]
    df_nonbinding = df_full[df_full["binder"] == 0]

    df_binding["string"].to_csv(os.path.join(path_to_02_output, "train_fullpeps.binding.stringOnly.txt"), sep = "\t", header = False, index = False)
    df_nonbinding["string"].to_csv(os.path.join(path_to_02_output, "train_fullpeps.nonbinding.stringOnly.txt"), sep = "\t", header = False, index = False)

    df_test["string"] = [
        _pair_to_model_string(cdr3b, epitope)
        for cdr3b, epitope in zip(df_test["CDR3b"], df_test["epitope"])
    ]

    # Test strings and their labels go to two parallel files (same row order).
    df_test["string"].to_csv(os.path.join(path_to_02_output, "universal_testset_new.stringOnly.txt"), sep = "\t", header = False, index = False)
    df_test["binder"].to_csv(os.path.join(path_to_02_output, "universal_testset_new.stringOnly.label.txt"), sep = "\t", header = False, index = False)
    

In [10]:
#####----------------------------------------------------------------#####
##### GENERATE .CSV FILES OF [CLS] EMBEDDING VECTORS
##### (non-binding set first, then binding set)
#####----------------------------------------------------------------#####
def _read_finished_steps(path_to_progress_file):
    """Return the set of batch indices already recorded in the progress file.

    The file holds one integer per line. A missing file means nothing has been
    processed yet, and non-numeric lines (for example a hand-written ``step``
    header) are ignored, so a first run and a resumed run both work.
    """
    finished_steps = set()
    if os.path.exists(path_to_progress_file):
        with open(path_to_progress_file) as handle:
            for line in handle:
                entry = line.strip()
                if entry.isdigit():
                    finished_steps.add(int(entry))
    return finished_steps


def generate_cls_embeddings(input_filename, tag, batch_size = 32):
    """Embed every line of ``input_filename`` and append the vectors to a CSV.

    Parameters
    ----------
    input_filename : str
        Name of the string-only dataset inside ``path_to_02_output``.
    tag : str
        Suffix used in the output file names (``"nonBinding"`` / ``"Binding"``).
    batch_size : int
        Number of sequences per forward pass.

    Files written inside ``path_to_02_output``:
      * ``cls_embeddings_<tag>.csv`` - position-0 hidden state of the model's
        first output, appended batch by batch (no header; the per-batch row
        index is kept as the first column, as before).
      * ``finished_batch_cls_embeddings_<tag>.txt`` - one completed batch index
        per line, used to skip work when the cell is re-run after a crash.
      * ``finished_generating_cls_embeddings_<tag>.txt`` - empty marker created
        once the loop has finished; if present the function returns at once.
    """
    path_to_done_marker = os.path.join(path_to_02_output, "finished_generating_cls_embeddings_{}.txt".format(tag))
    if os.path.exists(path_to_done_marker):
        return

    path_to_input_dataset = os.path.join(path_to_02_output, input_filename)
    path_to_progress_file = os.path.join(path_to_02_output, "finished_batch_cls_embeddings_{}.txt".format(tag))
    embedding_outputfile = os.path.join(path_to_02_output, "cls_embeddings_{}.csv".format(tag))

    # NOTE(review): `tokenizer` is not defined in the cells shown here; it is
    # presumably provided by one of the star imports - confirm.
    input_dataset = load_and_cache_examples(path_to_input_dataset, tokenizer, n_process = n_process)
    transform_dataloader = DataLoader(input_dataset, batch_size=batch_size, collate_fn=collate)

    # A set gives O(1) membership tests instead of rebuilding a list per step.
    finished_steps = _read_finished_steps(path_to_progress_file)

    model.eval()
    # Inference only: no_grad avoids building autograd graphs for every batch.
    with torch.no_grad():
        for step, batch in enumerate(tqdm(transform_dataloader, desc="Iteration")):
            if step in finished_steps:
                continue
            outputs = model(batch.to(device))
            last_hidden_states = outputs[0]
            # Keep only the vector at sequence position 0 for each item.
            tmp_cls_embeddings = last_hidden_states[:, 0, :].cpu().numpy()
            pd.DataFrame(tmp_cls_embeddings).to_csv(embedding_outputfile, mode = "a", header = False)
            # Record the batch only after its rows have been written.
            with open(path_to_progress_file, "a") as handle:
                handle.write("{}\n".format(step))

    # Equivalent of `touch`: create the completion marker.
    open(path_to_done_marker, "a").close()


batch_size = 32
generate_cls_embeddings("train_fullpeps.nonbinding.stringOnly.txt", "nonBinding", batch_size = batch_size)
generate_cls_embeddings("train_fullpeps.binding.stringOnly.txt", "Binding", batch_size = batch_size)


In [11]:
class cls(nn.Module):
    """Fully connected network producing one sigmoid-squashed value per input row.

    Layer widths are 1024 -> 512 -> 256 -> 128 -> 1, with ReLU after each of
    the three hidden linear layers and a sigmoid after the final one, so every
    output lies in (0, 1).
    """

    def __init__(self):
        super().__init__()
        # Hidden stages are registered as layer1/act1 ... layer3/act3, in that
        # order, so parameter names and initialisation order stay stable.
        widths = (1024, 512, 256, 128)
        for idx, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, "layer{}".format(idx), nn.Linear(n_in, n_out))
            setattr(self, "act{}".format(idx), nn.ReLU())
        self.output = nn.Linear(widths[-1], 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` (last dimension 1024) through every stage; returns shape (..., 1)."""
        stages = (
            (self.layer1, self.act1),
            (self.layer2, self.act2),
            (self.layer3, self.act3),
            (self.output, self.sigmoid),
        )
        hidden = x
        for linear, activation in stages:
            hidden = activation(linear(hidden))
        return hidden