In [None]:
!pip install transformers
!pip install tensorflow
!pip install biopython

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
# Load the tokenizer and a 2-label sequence-classification model from the same checkpoint.
MODEL_NAME = 'zhihan1996/DNA_bert_6'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Use the GPU when torch can see one; otherwise report it and fall back to the CPU.
use_cuda = torch.cuda.is_available()
if not use_cuda:
    print('No GPU available, using the CPU instead.')
device = torch.device("cuda" if use_cuda else "cpu")

# Move the model parameters to the selected device.
model.to(device)


In [None]:
 # the kmer splitting function
def Kmers_funct(seq, size=6):
    """Split a sequence into its overlapping, upper-cased k-mers.

    Args:
        seq: Sequence string to split.
        size: Length of each k-mer; must be at least 1.

    Returns:
        A list with the ``len(seq) - size + 1`` substrings of length ``size``
        taken at every offset (stride 1), upper-cased. The list is empty when
        ``seq`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError(f"k-mer size must be a positive integer, got {size}")
    # Upper-case once up front instead of once per k-mer.
    seq = seq.upper()
    return [seq[start:start + size] for start in range(len(seq) - size + 1)]

### Load the training data

In [None]:
import Bio.SeqIO as SeqIO
# Input FASTA files for the two classes.
LTR_FASTA = "/content/drive/MyDrive/sequences/LTRs_350_700.fasta"
NON_LTR_FASTA = "/content/drive/MyDrive/sequences/non_LTRs_350_700.fasta"

# LTR records have their first and last three characters sliced off;
# non-LTR records are used as they are.
LTR_sequences = [
    str(record.seq)[3:-3]
    for record in SeqIO.parse(LTR_FASTA, "fasta")
]
non_LTR_sequences = [
    str(record.seq)
    for record in SeqIO.parse(NON_LTR_FASTA, "fasta")
]

# Inputs are the sequences (LTRs first); labels follow the same order:
# 1 for every LTR sequence, 0 for every non-LTR sequence.
X = LTR_sequences + non_LTR_sequences
labels = [1] * len(LTR_sequences) + [0] * len(non_LTR_sequences)
y = labels

In [None]:
def tok_func(x):
    """Return the k-mers of sequence ``x`` joined into one space-separated string."""
    kmers = Kmers_funct(x)
    return " ".join(kmers)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42

# First split: hold out 40% of the data (X_rem / y_rem); the other 60% is the training set.
X_train, X_rem, y_train, y_rem = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=SPLIT_SEED,
)

# Second split of the held-out part. test_size applies to the second returned pair,
# so X_val / y_val receive 30% of it and X_test / y_test the remaining 70%.
X_test, X_val, y_test, y_val = train_test_split(
    X_rem,
    y_rem,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

In [None]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of true class indices; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)


In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub interactively.
# SECURITY: an access token used to be pasted here as a comment and has been removed.
# Treat that token as compromised and revoke it in the Hugging Face account settings;
# never store tokens in the notebook itself.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer output and optional labels.

    Args:
        encodings: Mapping from field name to a per-example sequence. It must
            contain an ``"input_ids"`` entry, which defines the dataset length.
        labels: Optional per-example labels (for example a list or a NumPy
            array). When ``None`` or empty, items carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` when labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check: truth-testing a multi-element NumPy array
        # or tensor raises, so `if self.labels:` only worked for plain lists.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])


In [None]:
import wandb

# Log in to Weights & Biases (wandb) before any run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33m456438[0m ([33mdiplomovka[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
def _encode_split(sequences, targets):
    """Tokenize the k-mer form of ``sequences`` and pair the result with ``targets``."""
    kmer_sentences = [tok_func(x) for x in sequences]
    encodings = tokenizer(kmer_sentences, padding=True, truncation=True, max_length=512)
    return Dataset(encodings, targets)


# Training and validation sets share the same tokenization settings.
train_dataset = _encode_split(X_train, y_train)
val_dataset = _encode_split(X_val, y_val)

In [None]:
# Start the wandb run. The project name is passed by keyword: a bare positional
# string is bound to the first parameter of wandb.init, which is not the project.
wandb.init(project="Training_BERT_0_350")

https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
from transformers import Trainer, TrainingArguments
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Pair that unpacks into ``(predictions, labels)``; the predicted
            class is the arg-max of ``predictions`` along axis 1.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # One scikit-learn scorer per reported metric, all called the same way.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        metric: scorer(y_true=labels, y_pred=predicted)
        for metric, scorer in scorers.items()
    }

# Training configuration: evaluate every 500 steps, reload the best checkpoint
# when training ends, and push the result to the Hugging Face Hub.
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
    push_to_hub=True,
    # Name the target Hub repository explicitly; without it the repository
    # name is derived from output_dir ("output").
    hub_model_id="xhorvat9/LTR_BERT_350_700_noTSD",
)


# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, arguments, datasets and metric function into the Trainer, then fine-tune.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    callbacks=[early_stopping],
)
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub.
# Trainer.push_to_hub takes a commit message as its first positional argument,
# not a repository id, so the repository name must not be passed here. The target
# repository comes from the TrainingArguments given to the trainer
# (intended repository: xhorvat9/LTR_BERT_350_700_noTSD, set via hub_model_id).
trainer.push_to_hub()

In [None]:
# Finish the current wandb run now that training is done.
wandb.finish()

In [None]:
# Tokenize the test sequences with the same settings as the training data.
X_test_tokenized = tokenizer([tok_func(x) for x in X_test], padding=True, truncation=True, max_length=512)
# Trainer.predict needs a dataset that yields one example per index, so wrap the
# tokenizer output (without labels) instead of passing the raw encodings.
test_dataset = Dataset(X_test_tokenized)
test_trainer = Trainer(model)
# Run inference; the first element of the result holds the raw model outputs.
raw_pred, _, _ = test_trainer.predict(test_dataset)
# Turn the raw outputs into class predictions by taking the highest-scoring class.
y_pred = np.argmax(raw_pred, axis=1)

In [None]:
# Count the test examples whose predicted class equals the true label.
count = sum(int(truth == guess) for guess, truth in zip(y_pred, y_test))
print("Accuracy:", count / len(y_test))

Accuracy: 0.8733110202057767


In [None]:
# Save the trainer's model to the directory given below.
trainer.save_model("/content/drive/MyDrive/sequences/model")