## Install the necessary libraries

In [None]:
!pip install transformers
!pip install tensorflow
!pip install biopython

## Load the models and move to GPU if available

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Tokenizer is taken from the 'zhihan1996/DNA_bert_6' checkpoint; the
# classifier is loaded from 'xhorvat9/LTR_BERT_0_350_noTSD' with two labels.
tokenizer = BertTokenizer.from_pretrained('zhihan1996/DNA_bert_6')
model = BertForSequenceClassification.from_pretrained(
    'xhorvat9/LTR_BERT_0_350_noTSD',
    num_labels=2,
)

# Prefer CUDA when it is present; otherwise fall back to the CPU and say so.
use_gpu = torch.cuda.is_available()
if not use_gpu:
    print('No GPU available, using the CPU instead.')
device = torch.device("cuda" if use_gpu else "cpu")

# Move the model's parameters onto the selected device.
model.to(device)


## Create the k-mer helper functions and a custom Dataset class

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel

def Kmers_funct(seq, size=6):
    """Return every overlapping k-mer of ``seq``, upper-cased.

    Args:
        seq: Input string to split.
        size: Length of each k-mer (default 6).

    Returns:
        A list of ``len(seq) - size + 1`` substrings of length ``size``, taken
        at consecutive start positions; an empty list when ``seq`` is shorter
        than ``size``.
    """
    kmers = []
    last_start = len(seq) - size
    # range() is empty when last_start is negative, so short inputs yield [].
    for start in range(last_start + 1):
        kmers.append(seq[start:start + size].upper())
    return kmers
def tok_func(x):
    """Turn a sequence into one string of space-separated k-mers.

    The k-mers come from ``Kmers_funct`` with its default size.
    """
    kmers = Kmers_funct(x)
    return " ".join(kmers)


class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping whose ``.items()`` yields ``(field, values)`` pairs,
            where ``values[idx]`` is the entry for example ``idx``. It must
            contain the key ``"input_ids"``, which defines the dataset length.
        labels: Optional indexable collection with one label per example
            (list, NumPy array, tensor, ...). ``None`` or an empty collection
            means the items carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when a non-empty label collection was supplied."""
        # A plain truthiness test (`if self.labels:`) raises ValueError for
        # NumPy arrays and tensors with more than one element, so check for
        # None and length explicitly instead.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

## Load the sequences and compute BERT predictions over sliding windows

In [None]:
# Mount Google Drive at /content/drive so the files under
# /content/drive/MyDrive/... read by later cells are reachable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import Bio.SeqIO as SeqIO
from sklearn.model_selection import train_test_split

# Read both FASTA files and keep only the raw sequence of each record as a str.
# NOTE(review): the "700+" in the file names presumably means sequences of at
# least 700 bp - confirm against the data preparation step.
sequences = [str(rec.seq) for rec in SeqIO.parse("/content/drive/MyDrive/sequences/LTRs_700+.fasta","fasta")]
non_LTR_sequences = [str(rec.seq) for rec in SeqIO.parse("/content/drive/MyDrive/sequences/non_LTRs_700+.fasta","fasta")]
# LTR records first, then non-LTR records; `labels` below follows this order.
long_sequences = sequences + non_LTR_sequences
# 1 for every record from the LTR file, 0 for every record from the non-LTR file.
# Relies on `np` imported in an earlier cell.
labels = np.array([1] * len(sequences) + [0]* len(non_LTR_sequences))

# 70/30 train/test split; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(long_sequences, labels, random_state=42, test_size=0.3)

In [None]:
from transformers import Trainer

# Sliding-window configuration for cutting long sequences into model inputs.
window_size = 350
stride = 116 # ~ 1/3 of window size

# One list of windows per training sequence. A window starts every `stride`
# characters; slicing clamps at the end of the sequence, so the last windows
# of a sequence are shorter than `window_size`.
sequences = [
    [seq[start:start + window_size] for start in range(0, len(seq), stride)]
    for seq in X_train
]

# The Trainer is only used for inference and does not depend on the sequence,
# so build it once instead of once per sequence.
test_trainer = Trainer(model)

# outputs[k] holds the predictions returned for the windows of X_train[k].
outputs = []
for counter, s in enumerate(sequences):
    # enumerate() advances the counter, so this progress message actually fires
    # (the counter was previously never incremented).
    if counter % 500 == 0 and counter != 0:
        print(f"processing sequence {counter}")
    # Convert each window to space-separated k-mers and tokenize them as a batch.
    X_test_tokenized = tokenizer([tok_func(x) for x in s], padding=True, truncation=True, max_length=350)
    # Wrap the encodings in a torch dataset (no labels: prediction only).
    test_dataset = Dataset(X_test_tokenized)
    # predict() returns (predictions, label_ids, metrics); keep the predictions.
    output, _, _ = test_trainer.predict(test_dataset)
    outputs.append(output)

## Train the CNN on the per-window BERT classifier outputs

In [None]:
import tensorflow as tf
import keras
# Small 1-D CNN over the per-window predictions: the input is 45 positions
# with 2 values each, and the output is a single sigmoid unit.
nn = keras.models.Sequential([
    keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(45, 2)),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(units=1, activation='sigmoid'),
])
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Bring every sequence's list of window predictions to a fixed length of 45,
# padding at the front, so the result matches the CNN's input shape.
inp = tf.keras.preprocessing.sequence.pad_sequences(
    outputs,
    maxlen=45,
    dtype='float32',
    padding="pre",
)

nn.fit(inp, y_train, epochs=15, batch_size=32)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f7b8a205df0>

### Run on test set

In [None]:
# Same sliding-window configuration as used for the training sequences.
window_size = 350
stride = 116 # ~ 1/3 of window size

# One list of windows per test sequence. A window starts every `stride`
# characters; slicing clamps at the end of the sequence, so the last windows
# of a sequence are shorter than `window_size`.
sequences = [
    [seq[start:start + window_size] for start in range(0, len(seq), stride)]
    for seq in X_test
]

# The Trainer is only used for inference and does not depend on the sequence,
# so build it once instead of once per sequence.
test_trainer = Trainer(model)

# outputs[k] holds the predictions returned for the windows of X_test[k].
# This rebinds `outputs`, replacing the training-set predictions.
outputs = []
for counter, s in enumerate(sequences):
    # enumerate() advances the counter, so this progress message actually fires
    # (the counter was previously never incremented).
    if counter % 500 == 0 and counter != 0:
        print(f"processing sequence {counter}")
    # Convert each window to space-separated k-mers and tokenize them as a batch.
    X_test_tokenized = tokenizer([tok_func(x) for x in s], padding=True, truncation=True, max_length=350)
    # Wrap the encodings in a torch dataset (no labels: prediction only).
    test_dataset = Dataset(X_test_tokenized)
    # predict() returns (predictions, label_ids, metrics); keep the predictions.
    output, _, _ = test_trainer.predict(test_dataset)
    outputs.append(output)

In [None]:
# Pad the test-set window predictions exactly like the training input
# (front padding, fixed length 45) before feeding them to the CNN.
i = tf.keras.preprocessing.sequence.pad_sequences(
    outputs,
    maxlen=45,
    dtype='float32',
    padding="pre",
)
# evaluate() returns the loss followed by the compiled metric(s).
score = nn.evaluate(i, y_test, verbose=0)

print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.4219169020652771
Test accuracy: 0.8043588399887085


In [None]:
# Persist the trained CNN to Google Drive; the visualisation section below
# reloads it from this same path.
nn.save('/content/drive/MyDrive/sequences/BERT_pooling_CNN/TF_CNN_BERT_pool_model')



### Visualize filters to check for influential regions


In [None]:
import tensorflow as tf
# Reload the CNN from the path it was saved to earlier in the notebook.
new_model = tf.keras.models.load_model('/content/drive/MyDrive/sequences/BERT_pooling_CNN/TF_CNN_BERT_pool_model')

# Weights of the model's first layer, unpacked as (filters, biases).
# NOTE(review): this is the Conv1D layer only if the saved model is the
# Sequential CNN built above - confirm before interpreting the plot.
filters, biases = new_model.layers[0].get_weights()

In [None]:
import matplotlib.pyplot as plt

# Plot magnitudes only, so strong negative and strong positive weights both
# show up as bright cells.
f = abs(filters)
n_filters = filters.shape[-1]

# squeeze=False makes subplots() always return a 2-D array of Axes; without it
# a single filter would yield a bare Axes object and indexing would fail.
# ravel() restores the flat, one-Axes-per-filter array used below.
fig, axs = plt.subplots(1, n_filters, figsize=(25, 10), squeeze=False)
axs = axs.ravel()
for i in range(n_filters):
    # One panel per filter: the 2-D slice of weights belonging to filter i.
    axs[i].imshow(f[:, :, i], cmap="gray")
    axs[i].axis('off')
fig.suptitle("Absolute first-layer filter weights (one panel per filter)")
plt.show()