`Viru`, `Temp`, `Bact` 

In [None]:
from Bio import SeqIO

from Bio import SeqIO

def parse_labeled_fasta(fasta_path):
    """Read a FASTA file and label each record by the prefix of its ID.

    A record is kept when its ``id`` starts with one of ``"Viru"``,
    ``"Temp"`` or ``"Bact"``; that prefix becomes the record's label.
    Records whose ID matches none of the prefixes are dropped.

    Args:
        fasta_path: Path (or handle) passed straight to ``SeqIO.parse``.

    Returns:
        A list of ``{"sequence": <str>, "label": <prefix>}`` dicts, in file
        order.
    """
    known_prefixes = ("Viru", "Temp", "Bact")
    labeled = []
    for rec in SeqIO.parse(fasta_path, "fasta"):
        sequence = str(rec.seq)
        # First (and, since the prefixes are disjoint, only) matching prefix.
        matched = next((p for p in known_prefixes if rec.id.startswith(p)), None)
        if matched is not None:
            labeled.append({"sequence": sequence, "label": matched})
    return labeled




In [None]:
import torch
from evo import Evo

# Pick the device here: `device` was previously only assigned in a later
# cell, so `model.to(device)` raised NameError when cells ran top to bottom.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained Evo checkpoint and expose its model/tokenizer.
# NOTE(review): the training cell further down rebinds `model` and
# `tokenizer`; confirm whether both loading paths are really needed.
evo_model = Evo('evo-1-131k-base')
model, tokenizer = evo_model.model, evo_model.tokenizer
model.to(device)
model.eval()  # inference mode

In [None]:
from torch.utils.data import Dataset

class GenomicDataset(Dataset):
    """Map-style dataset that tokenizes labeled sequences on access.

    Args:
        data: Sequence of dicts with ``"sequence"`` and ``"label"`` keys
            (the shape produced by ``parse_labeled_fasta``).
        tokenizer: Callable invoked as
            ``tokenizer(seq, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`` and expected to return a
            mapping with ``"input_ids"`` and ``"attention_mask"`` entries
            that support ``.squeeze(0)``.
        label2id: Mapping from label string to integer class id.
        max_length: Length every sequence is truncated/padded to. Defaults
            to 512, the value that used to be hard-coded.
    """

    def __init__(self, data, tokenizer, label2id, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sequence and integer label for record ``idx``.

        Raises:
            KeyError: If the record's label is missing from ``label2id``.
        """
        record = self.data[idx]
        label = self.label2id[record["label"]]
        tokens = self.tokenizer(
            record["sequence"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # Drop the leading batch axis added by return_tensors="pt" so the
            # DataLoader can stack items itself.
            "input_ids": tokens["input_ids"].squeeze(0),
            "attention_mask": tokens["attention_mask"].squeeze(0),
            "labels": label,
        }

In [None]:
class EvoClassifier(nn.Module):
    """Pretrained encoder followed by a linear classification head.

    The encoder's per-token hidden states are mean-pooled over the
    non-padding positions before classification. The previous version took
    position 0 as a "CLS token", but Evo is an autoregressive (causal) model
    with no such token: position 0 only ever sees the first nucleotide, so
    it carries no information about the rest of the sequence.

    Args:
        base_model_name: Checkpoint name passed to
            ``AutoModel.from_pretrained``.
        num_classes: Number of output classes (default 3).
    """

    def __init__(self, base_model_name, num_classes=3):
        super().__init__()
        # NOTE(review): Evo checkpoints on the Hugging Face Hub ship custom
        # model code; loading may require `trust_remote_code=True` and a
        # fully qualified repo id — confirm against the checkpoint in use.
        self.encoder = AutoModel.from_pretrained(base_model_name)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_classes)

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average ``hidden_states`` over positions where the mask is non-zero.

        Args:
            hidden_states: Tensor indexed as (batch, position, feature).
            attention_mask: Tensor indexed as (batch, position); non-zero
                marks real tokens, zero marks padding.

        Returns:
            Tensor indexed as (batch, feature). Rows whose mask is all zero
            come back as zeros rather than NaN.
        """
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        # Clamp so a fully padded row divides by 1 instead of 0.
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, input_ids, attention_mask=None):
        """Return class logits for a batch of token ids.

        Args:
            input_ids: Token ids, (batch, position).
            attention_mask: Optional padding mask with the same leading
                shape; when omitted every position is averaged.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        if attention_mask is None:
            pooled = hidden_states.mean(dim=1)
        else:
            pooled = self._masked_mean(hidden_states, attention_mask)
        return self.classifier(pooled)


In [None]:
def train(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` entries. The older key
            ``"label"`` is still accepted when ``"labels"`` is absent.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of batch losses divided by the number of batches, or ``0.0``
        when the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # GenomicDataset emits "labels"; reading "label" here raised KeyError
        # on the first batch.
        labels = batch["labels"] if "labels" in batch else batch["label"]
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Count batches instead of using len(dataloader): this avoids a
    # ZeroDivisionError on an empty loader and works for loaders without
    # a length.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches


In [None]:
# Load Evo tokenizer and model
# NOTE(review): on the Hugging Face Hub the checkpoint is published under an
# organisation prefix and uses custom code; `from_pretrained` may need the
# full repo id and `trust_remote_code=True` — confirm.
model_name = "evo-1-131k-base"  # Update if needed
# Resolve the device once, before anything is moved onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Class ids for the three labels produced by parse_labeled_fasta.
label2id = {"Viru": 0, "Temp": 1, "Bact": 2}
model = EvoClassifier(base_model_name=model_name, num_classes=len(label2id)).to(device)

# Load and split data (80% train / 20% validation)
all_data = parse_labeled_fasta("datasets/fusion_sequences.fasta")
random.shuffle(all_data)
split_idx = int(0.8 * len(all_data))
train_data = all_data[:split_idx]
val_data = all_data[split_idx:]  # held out; not used in this cell

# GenomicDataset requires the label mapping; omitting it raised TypeError.
train_dataset = GenomicDataset(train_data, tokenizer, label2id)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(3):
    loss = train(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")
