In [4]:
!pip install biopython

from Bio import Entrez, SeqIO
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments,Trainer
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import torch
from torch.utils.data import Dataset



In [2]:
Entrez.email = "jacquelinekgrimm@gmail.com"

# Get gene sequences from NCBI using accession number
def get_genes(accession, max_genes):
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
    record = SeqIO.read(handle, "genbank")
    genes = {}
    genes_count = 0
    for feature in record.features:
        if genes_count >= max_genes:
            break
        if feature.type == "CDS":
            gene_name = ''
            if 'gene' in feature.qualifiers:
                gene_name = feature.qualifiers['gene'][0]
            elif 'locus_tag' in feature.qualifiers:
                gene_name = feature.qualifiers['locus_tag'][0]
            else:
                gene_name = f"Unknown_gene_{len(genes)+1}"
            genes[gene_name] = str(feature.location.extract(record).seq)
            genes_count += 1
    return genes

# Get Salmonella and Bacillus genes
salmonella_genes = get_genes("AL513382", max_genes=500)
bacillus_genes = get_genes("AE016877", max_genes=500)

In [3]:
# Function to create k-mers
def make_kmers(seq, size):
    return [seq[x:x + size].lower() for x in range(len(seq) - size + 1)]

# Function to join k-mer words into sentences
def sentences(genes_dict, kmer_size):
    gene_sentences = {}
    for gene_name, sequence in genes_dict.items():
        words = make_kmers(sequence, size=kmer_size)
        joined_sentence = ' '.join(words)
        gene_sentences[gene_name] = joined_sentence
    return gene_sentences

# Creating sentences
salmonella_sentences = sentences(salmonella_genes, kmer_size=6)
bacillus_sentences = sentences(bacillus_genes, kmer_size=6)

In [5]:
# Create DataFrames
salmonella_df = pd.DataFrame({'Sentences': list(salmonella_sentences.values()), 'Species': 0})
bacillus_df = pd.DataFrame({'Sentences': list(bacillus_sentences.values()), 'Species': 1})

# Concatenate DataFrames
df = pd.concat([salmonella_df, bacillus_df], ignore_index=True)

In [6]:
# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['Sentences'].tolist(),
                                                                    df['Species'].tolist(),
                                                                    test_size=0.2,
                                                                    random_state=42)

In [8]:
# Load the model and tokenizet
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Convert labels to tensors
train_labels = train_labels.clone().detach()
val_labels = val_labels.clone().detach()

In [15]:
# Convert tokenized encodings to a dataset
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

In [16]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
)

# Define the Trainer object
compute_metrics = lambda eval_pred: {'accuracy': (torch.tensor(eval_pred.predictions).argmax(-1) == torch.tensor(eval_pred.label_ids)).float().mean().item()}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [17]:
# Train the model
trainer.train()

  item['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.245062,0.89
2,No log,0.236481,0.91
3,No log,0.182131,0.92


Checkpoint destination directory ./results/checkpoint-25 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  item['labels'] = torch.tensor(self.labels[idx])
Checkpoint destination directory ./results/checkpoint-50 already exists and is non-empty.Saving will proceed but saved results may be invalid.
  item['labels'] = torch.tensor(self.labels[idx])
Checkpoint destination directory ./results/checkpoint-75 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=75, training_loss=0.24059819539388022, metrics={'train_runtime': 144.4706, 'train_samples_per_second': 16.55, 'train_steps_per_second': 0.519, 'total_flos': 629098533365760.0, 'train_loss': 0.24059819539388022, 'epoch': 3.0})