# Project Baselines

## DNABERT

In [None]:
!pip install einops transformers peft omegaconf torch evaluate accelerate

In [54]:
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)

Some weights of the model checkpoint at zhihan1996/DNA_bert_6 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
import torch
print(torch.cuda.is_available())

True


In [11]:
dna = "ACGTAGCATCGGATCTATCTATCGACACTTGGTTATCGATCTACGAGCATCTCGTTAGC"
inputs = tokenizer(dna, return_tensors = 'pt')["input_ids"]
print(inputs)
model = model.to("cuda")
hidden_states = model(inputs.to("cuda"))[0] # [1, sequence_length, 768]

# embedding with mean pooling
embedding_mean = torch.mean(hidden_states[0], dim=0)
print(embedding_mean.shape) # expect to be 768

# embedding with max pooling
embedding_max = torch.max(hidden_states[0], dim=0)[0]
print(embedding_max.shape) # expect to be 768

tensor([[2, 1, 3]])
torch.Size([768])
torch.Size([768])


## Nucleotide-Transformer from InstaDeep

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Import the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")

# Create a dummy dna sequence and tokenize it
sequences = ['ATTCTG' * 9]
tokens_ids = tokenizer.batch_encode_plus(sequences, return_tensors="pt")["input_ids"]

# Compute the embeddings
attention_mask = tokens_ids != tokenizer.pad_token_id
torch_outs = model(
    tokens_ids,
    attention_mask=attention_mask,
    encoder_attention_mask=attention_mask,
    output_hidden_states=True
)

# Compute sequences embeddings
embeddings = torch_outs['hidden_states'][-1].detach().numpy()
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embeddings per token: {embeddings}")

# Compute mean embeddings per sequence
mean_sequence_embeddings = torch.sum(attention_mask.unsqueeze(-1)*embeddings, axis=-2)/torch.sum(attention_mask, axis=-1)
print(f"Mean sequence embeddings: {mean_sequence_embeddings}")


Embeddings shape: (1, 10, 1280)
Embeddings per token: [[[ 0.09300065  0.34371722  0.3723031  ... -0.40012842 -0.15076053
    0.07454759]
  [-0.6912435   1.262262   -0.16280103 ... -0.8053381  -0.19705848
   -2.1087687 ]
  [-0.5719655   1.1624311  -0.3457358  ... -0.717227   -0.26854762
   -2.157653  ]
  ...
  [-0.67676777  1.4332684  -0.44941118 ... -0.9559051  -0.67712104
   -2.0696611 ]
  [-0.62161577  1.1120433  -0.14762029 ... -0.9116353  -0.41632363
   -1.9834615 ]
  [-0.902306    1.1265255  -0.30772838 ... -0.97984993 -0.25661114
   -2.2153628 ]]]
Mean sequence embeddings: tensor([[-0.5551,  1.1327, -0.1687,  ..., -0.7917, -0.3625, -1.8504]])


In [4]:
import numpy as np

a = mean_sequence_embeddings[0]
b = mean_sequence_embeddings[0]
cos_sim = np.dot(a, b)/(np.linalg.norm(a) * np.linalg.norm(b))

In [5]:
cos_sim

1.0

In [6]:
from scipy.stats import entropy

p_a_or_t = 0.41
p_g_or_c = 1 - p_a_or_t

probs = []
for n in sequences[0]:
    if n.lower() == "a" or n.lower() == "t":
        probs.append(p_a_or_t)
    else:
        probs.append(p_g_or_c)

print(probs)
print(entropy(probs))

[0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001, 0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001, 0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001, 0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001, 0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001, 0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001, 0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001, 0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001, 0.41, 0.41, 0.41, 0.5900000000000001, 0.41, 0.5900000000000001]
3.973262012386456


In [7]:
dna_seq = "ATTATGCATCATTGATTT"
dnap_seq = "ATTMHH$TTT"

In [8]:
def entropy_of_sequence(seq: str, p_a_or_t: float = 0.41, p_g_or_c: float = 0.59):
    probs = []
    for n in seq:
        if n.lower() == "a" or n.lower() == "t":
            probs.append(p_a_or_t)
        else:
            probs.append(p_g_or_c)
    return entropy(probs)

In [9]:
entropy_of_sequence(sequences[0])

3.973262012386456

In [10]:
import pandas as pd

probability_table = {
    "A": 0.0777,
    "C": 0.0157,
    "D": 0.0530,
    "E": 0.0656,
    "F": 0.0405,
    "G": 0.0691,
    "H": 0.0227,
    "I": 0.0591,
    "K": 0.0595,
    "L": 0.0960,
    "M": 0.0238,
    "N": 0.0427,
    "P": 0.0469,
    "Q": 0.0393,
    "R": 0.0526,
    "S": 0.0694,
    "T": 0.0550,
    "V": 0.0667,
    "W": 0.0118,
    "Y": 0.0311,
    "M": 0.0001,  # smallest non-zero probability at this level of precision
    "$": 0.0001
}

def entropy_of_dnap_sequence(seq: str, probability_table: dict):
    probs = []
    for n in seq:
        prob = probability_table[n.upper()]
        probs.append(prob)
    return entropy(probs)

In [11]:
sum(probability_table.values())

0.9745999999999999

In [12]:
entropy_of_dnap_sequence(dnap_seq, probability_table)

2.016509206004858

In [13]:
from Bio.Seq import Seq

def dnap_tokenize(seq: str):
    # find boundaries of the coding sequence
    start = seq.lower().find("atg")
    stop = seq.lower().find("tga")
    protein_seq = str(Seq(seq[start:stop]).translate())
    return seq[0:start-1] + protein_seq + "$" + seq[stop+3:]

In [14]:
dnap_tokenize(dna_seq)

'ATMHH$TTT'

In [15]:
df_train = pd.read_csv("./promoter_detection/train.csv", header=0)
df_val = pd.read_csv("./promoter_detection/dev.csv", header=0)
df_test = pd.read_csv("./promoter_detection/test.csv", header=0)

In [16]:
df_train.head()

Unnamed: 0,sequence,label
0,TATAATAATAACGAAGATGAGACGACAGTCGACAAGAAAAGCACCA...,0
1,AAAGCCCGAGCGGCGGCCACGCCTCGGTGGCGATTTTATTAGCGCT...,1
2,AGTCCGCGATATTCTGAGGGGACTTTCGACACAAAAAAGTTGACAC...,0
3,ACCCCCCGGCCCCGCCCCACAGACCCCTCCAGTGGTCCCCCGGCCA...,1
4,AGGTCTTTGGTCCCCCAACCCTGTGCTCTTTCCACTTAAATCCCGA...,1


In [17]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Import the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
model = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")

tokens_ids = tokenizer.batch_encode_plus(df_train["sequence"][0:100], return_tensors="pt")["input_ids"]

# Compute the embeddings
attention_mask = tokens_ids != tokenizer.pad_token_id
torch_outs = model(
    tokens_ids,
    attention_mask=attention_mask,
    encoder_attention_mask=attention_mask,
    output_hidden_states=True
)

# Compute sequences embeddings
embeddings = torch_outs['hidden_states'][-1].detach().numpy()
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embeddings per token: {embeddings}")

Embeddings shape: (100, 51, 1280)
Embeddings per token: [[[ 7.05086812e-02  2.90539056e-01  3.26251954e-01 ... -5.54491937e-01
   -2.04697158e-02 -1.46248303e-02]
  [-4.15493064e-02  4.68324691e-01 -4.76777941e-01 ... -1.13392413e+00
    1.00057375e+00 -8.60289812e-01]
  [-1.06958918e-01 -5.47006130e-01  5.48749149e-01 ... -1.94588110e-01
   -4.50046688e-01 -1.61181211e+00]
  ...
  [ 6.28693774e-02  7.46243656e-01 -2.83405870e-01 ... -2.47954354e-01
   -2.94601798e-01  1.76721886e-01]
  [-6.48806468e-02  8.48259211e-01 -3.03161860e-01 ... -2.40503669e-01
   -1.08828627e-01 -8.01098585e-01]
  [-5.76559663e-01  6.70304894e-01 -4.45232362e-01 ...  1.14153378e-01
    3.67443502e-01 -2.37486780e-01]]

 [[ 1.15056746e-01  2.23919272e-01  5.36945701e-01 ... -6.04434431e-01
   -2.54429616e-02  1.77233116e-04]
  [-1.32668555e-01  7.65201211e-01 -6.59714341e-01 ... -6.14052534e-01
    9.20189738e-01 -1.16649196e-02]
  [ 2.50105560e-01  6.90061927e-01 -6.83128417e-01 ...  4.29264992e-01
    9.431

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=10, learning_rate=1.0)
clf = clf.fit(embeddings.reshape((100, 51 * 1280)), df_train['label'][0:100])

In [22]:
tokens_ids = tokenizer.batch_encode_plus(df_test["sequence"][0:100], return_tensors="pt")["input_ids"]

# Compute the embeddings
attention_mask = tokens_ids != tokenizer.pad_token_id
torch_outs = model(
    tokens_ids,
    attention_mask=attention_mask,
    encoder_attention_mask=attention_mask,
    output_hidden_states=True
)

# Compute sequences embeddings
embeddings = torch_outs['hidden_states'][-1].detach().numpy()

In [24]:
clf.score(embeddings.reshape((100, 51 * 1280)), df_test['label'][0:100])

0.52

In [42]:
embeddings.shape

(100, 51, 1280)

In [110]:
from torch import nn
from torch.nn import functional as F


class ClassifierHead(nn.Module):
    
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.fc1 = nn.Linear(1280, 51)
        self.fc2 = nn.Linear(2601, 1)

    def forward(self, x: torch.Tensor):
        x = self.fc1(x)
        x = F.relu(x).flatten(start_dim=1)
        x = self.fc2(x)
        return F.sigmoid(x)

clf = ClassifierHead()

In [125]:
from typing import Callable
from torch.utils.data import DataLoader


def train_closure(model: nn.Module,
                  optimizer: torch.optim.Optimizer,
                  loss_fn: Callable,
                  train_data_loader: DataLoader,
                  labels: torch.Tensor):

    def train(epoch: int):
        batch_offset_idx = 0
        for i, data in enumerate(train_data_loader):
            optimizer.zero_grad()  # zero gradients for the current batch
            print(data.shape)
            outputs = model(data)
            print(outputs.shape)
            batch_right_offset = batch_offset_idx + train_data_loader.batch_size
            if i == 10:
                break
            loss = loss_fn(outputs, labels[batch_offset_idx: batch_right_offset])
            batch_offset_idx = batch_right_offset
            optimizer.step()  # triggers weight update
        print(f"Epoch: {epoch}, Loss: {loss}")

    return train


In [126]:
from torch.utils.data import Dataset

class SequenceDataLoader(Dataset):

    def __init__(self, sequences: pd.Series, batch_size: int = 10) -> None:
        self.x = torch.tensor(sequences)
        self.batch_size = batch_size
        super().__init__()

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        return self.x[index: index + self.batch_size]

In [127]:
train_fn = train_closure(clf,
                         torch.optim.SGD(clf.parameters(), lr=0.1),
                         nn.BCELoss(),
                         SequenceDataLoader(embeddings),
                         torch.tensor(df_train["label"][0:100]).unsqueeze(-1).float())

In [128]:
epochs = 5

for epoch in range(epochs):
    train_fn(epoch)

torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
Epoch: 0, Loss: 0.7219427824020386
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280])
torch.Size([10, 1])
torch.Size([10, 51, 1280]

: 