# Sequence Generation and Alignment Analysis with Evo2
This notebook demonstrates how to generate biological sequences with the Evo2 model API and how to analyze them using Biopython pairwise alignments.


In [1]:
import os
import argparse
import csv
from pathlib import Path
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.nn.functional as F
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Seq import Seq
import requests

# Use one shared seed value for every torch seeding call (reproducibility)
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)





## Model Initialization
Let's set up access to the hosted Evo2 API and define a small helper for calling it. We'll use the 7B-parameter version as the default model.

In [2]:
# The API key is read from the environment; NVCF_RUN_KEY takes precedence
# over NVIDIA_API_KEY when both are set.
NVIDIA_API_KEY = os.getenv("NVCF_RUN_KEY") or os.getenv("NVIDIA_API_KEY")
BASE_URL = "https://health.api.nvidia.com/v1/biology"

# Use a truthiness check rather than `is None`: a variable that is set but
# empty would otherwise slip through and be sent as an empty bearer token.
if not NVIDIA_API_KEY:
    raise RuntimeError(
        "Missing NVIDIA API key. Set env var NVCF_RUN_KEY or NVIDIA_API_KEY."
    )

# Headers attached to every request made by evo2_generate().
HEADERS = {
    "Authorization": f"Bearer {NVIDIA_API_KEY}",
    "Content-Type": "application/json",
}

def evo2_generate(
    sequence,
    num_tokens=100,
    temperature=0.7,
    top_k=1,
    top_p=0.0,
    seed=None,
    enable_sampled_probs=False,
    enable_logits=False,
    model="evo2-7b"
):
    """
    Send a prompt to the Evo2 ``generate`` endpoint and return the reply.

    Args:
        sequence: Prompt passed through unchanged as the ``sequence`` field.
        num_tokens: Sent as ``num_tokens`` after coercion to ``int``.
        temperature: Sent as ``temperature`` after coercion to ``float``.
        top_k: Sent as ``top_k`` after coercion to ``int``.
        top_p: Sent as ``top_p`` after coercion to ``float``.
        seed: When not ``None``, sent as ``random_seed`` (coerced to ``int``);
            otherwise the field is left out of the request.
        enable_sampled_probs: Sent as ``enable_sampled_probs`` (as ``bool``).
        enable_logits: Sent as ``enable_logits`` (as ``bool``).
        model: Model name inserted into the endpoint URL.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: Via ``raise_for_status()`` on an error status.
    """
    request_body = dict(
        sequence=sequence,
        num_tokens=int(num_tokens),
        temperature=float(temperature),
        top_k=int(top_k),
        top_p=float(top_p),
        enable_sampled_probs=bool(enable_sampled_probs),
        enable_elapsed_ms_per_token=False,
        enable_logits=bool(enable_logits),
    )
    # The seed is optional: only include it when the caller supplied one.
    if seed is not None:
        request_body["random_seed"] = int(seed)

    response = requests.post(
        f"{BASE_URL}/arc/{model}/generate",
        headers=HEADERS,
        json=request_body,
        timeout=120,
    )
    response.raise_for_status()
    return response.json()

## Data Loading
Next, we'll define a function to load our example sequences and use it to read the prompts.


In [None]:
def read_sequences(input_file: Path) -> Tuple[List[str], List[str]]:
    """
    Read sequences and their optional names from a CSV file.

    The first row is treated as a header and skipped. For every remaining
    non-blank row, column 0 is collected as a sequence and, when the row has
    a second column, column 1 is collected as a name.

    Expected CSV layout (the header text itself is not inspected):
    <sequence column>,<name column>
    ACGTACGT,example_1
    ...

    Args:
        input_file: Path of the CSV file. It is opened as UTF-8 and a leading
            byte-order mark is ignored ('utf-8-sig').

    Returns:
        A ``(sequences, names)`` tuple of lists. ``names`` only receives an
        entry for rows that have a second column, so it can be shorter than
        ``sequences``.
    """
    input_seqs: List[str] = []
    names: List[str] = []

    with open(input_file, encoding='utf-8-sig', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            # csv.reader yields an empty list for blank lines; indexing
            # row[0] on those would raise IndexError.
            if not row:
                continue
            input_seqs.append(row[0])
            if len(row) > 1:
                names.append(row[1])

    return input_seqs, names

# Load example data
PROMPT_LENGTH = 500  # leading bases of each sequence used as the model prompt
TARGET_LENGTH = 500  # bases right after the prompt, kept as the reference

sequences, names = read_sequences(Path('prompts.csv'))

# For 'autocomplete', we split the data into input and target sequences:
# the prompt is the first PROMPT_LENGTH bases and the target is the
# TARGET_LENGTH bases that follow it.
input_seqs = [seq[:PROMPT_LENGTH] for seq in sequences]
target_seqs = [seq[PROMPT_LENGTH:PROMPT_LENGTH + TARGET_LENGTH] for seq in sequences]

print(f"Loaded {len(sequences)} sequence pairs")

Loaded 4 sequence pairs


In [4]:
# Preview the first 100 characters of every loaded sequence. A plain loop
# avoids building a throwaway list of None values that the notebook would
# otherwise echo as the cell result.
for seq in sequences:
    print(seq[:100])

GAATAGGAACAGCTCCGGTCTACAGCTCCCAGCGTGAGCGACGCAGAAGACGGTGATTTCTGCATTTCCATCTGAGGTACCGGGTTCATCTCACTAGGGA
GACACCATCGAATGGCGCAAAACCTTTCGCGGTATGGCATGATAGCGCCCGGAAGAGAGTCAATTCAGGGTGGTGAATGTGAAACCAGTAACGTTATACG
GTTAATGTAGCTTAAAACAAAAGCAAGGTACTGAAAATACCTAGACGAGTATATCCAACTCCATAAACAACAAAGGTTTGGTCCCGGCCTTCTTATTGGT
GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTG


[None, None, None, None]

### Now it's time to generate!

In [5]:
# Ask the model for a 500-token continuation of every prompt and collect the
# returned sequences in prompt order.
generated_seqs = []
for prompt in input_seqs:
    response = evo2_generate(
        prompt,
        model="evo2-7b",
        num_tokens=500,
        seed=123,
        enable_sampled_probs=False,
        enable_logits=False,
    )
    print("Generated sequence:", response["sequence"])
    generated_seqs.append(response["sequence"])

Generated sequence: AGGCCTGCCTGCCTCTGTAGGCTCCACCTCTGGGGGCAGGGCACAGACAAACAAAAAGACAGCAGTAACCTCTGCAGACTTAAATGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCCAGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACCCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGGTCCTGTCTGTTAGAAGGAAAACTAACAAACAGAAAGGACATCCACACCAAAAACCCATCTGTACATCACCATCATCAAAGACCAAAAGTAGATAAAACCACAAAGATGGGGAAAAAACAGAACAGAAAAACTGGAAACTCTAAAAAGCAGAGCGCCTCTCCTCCTCCAAAGGAACGCAGTTCCTCACCAGCAACGGAACAAAGCTGGATGGAGAATGACTT
Generated sequence: TAATGTTCCGGCGTTATTTCTTGATGTCTCTGACCAGACACCCATCAACAGTATTATTTTCTCCCATGAAGACGGTACGCGACTGGGCGTGGAGCATCTGGTCGCATTGGGTCACCAGCAAATCGCGCTGTTAGCGGGGCCATTAAGTTCTGTCTCGGCGCGTCTGCGTCTGGCGGGCTGGCATAAATATCTCACTCGCAATCAAATTCAGCCGATAGCGGTACGGGAAGGCGACTGGAGTGCCATGTCCGGTTATCAACAAACCATGGAAATGCTGAATAAGGGCATCGTTCCTTCTGCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGCAATTGAAGAGTCCGGGCTGCGCGTTCCGGAGGATATCTCGGTGGTGGGATATGACGATACCGAAGACAGCTCGTGTTTTATTCCGCCGTTAACCACCGTCAAACAGGATTTTCG

In [6]:
def analyze_alignments(generated_seqs: List[str],
                       target_seqs: List[str],
                       names: Optional[List[str]] = None
                      ) -> List[dict]:
    """
    Globally align each generated sequence to its target and report metrics.

    Sequences are paired positionally with ``zip``, so surplus entries in the
    longer list are ignored. For every pair the best alignment is printed,
    followed by its similarity and score.

    Args:
        generated_seqs: List of generated sequences
        target_seqs: List of target sequences
        names: Optional list of sequence names; ``names[i]`` labels pair ``i``
            when it exists

    Returns:
        One dict per pair with the keys:
            'similarity': identical aligned positions as a percentage of the
                target length (0.0 when the target is empty)
            'score': score of the best alignment
            'length': length of the target sequence
            'gaps': number of gap characters across both aligned sequences
            'name': the pair's name, only present when one was supplied
        When no alignment is returned for a pair, similarity, score and gaps
        are all reported as zero.
    """
    metrics = []
    print("\nSequence Alignments:")

    for i, (gen_seq, target_seq) in enumerate(zip(generated_seqs, target_seqs)):
        has_name = bool(names) and i < len(names)
        label = f" ({names[i]})" if has_name else ""
        print(f"\nAlignment {i+1}{label}:")

        # Only the first alignment is used below, so ask for a single one
        # instead of enumerating every co-optimal alignment.
        alignments = pairwise2.align.globalms(
            Seq(gen_seq), Seq(target_seq),
            match=2,
            mismatch=-1,
            open=-0.5,
            extend=-0.1,
            one_alignment_only=True
        )

        if alignments:
            best_alignment = alignments[0]
            print(format_alignment(*best_alignment))

            aligned_gen, aligned_target = best_alignment[0], best_alignment[1]
            # Count identical columns, ignoring any column that holds a gap.
            matches = sum(a == b for a, b in zip(aligned_gen, aligned_target)
                          if a != '-' and b != '-')
            # Guard the division: an empty target has nothing to match.
            similarity = (matches / len(target_seq)) * 100 if target_seq else 0.0
            score = best_alignment[2]
            gaps = aligned_gen.count('-') + aligned_target.count('-')
        else:
            # NOTE(review): presumably happens when one of the sequences is
            # empty; report zeros instead of failing on alignments[0].
            print("No alignment could be computed for this pair.")
            similarity, score, gaps = 0.0, 0.0, 0

        seq_metrics = {
            'similarity': similarity,
            'score': score,
            'length': len(target_seq),
            'gaps': gaps
        }

        if has_name:
            seq_metrics['name'] = names[i]

        metrics.append(seq_metrics)

        print(f"Sequence similarity: {similarity:.2f}%")
        print(f"Alignment score: {score:.2f}")

    return metrics

# Compare every generated continuation with its held-out target sequence
alignment_metrics = analyze_alignments(
    generated_seqs=generated_seqs,
    target_seqs=target_seqs,
    names=names,
)


Sequence Alignments:

Alignment 1 (L1RE2):
AGGCCTGCCTGCCTCTGTAGGCTCCACCTCTGGGGGCAGGGCACAGACAAACAAAAAGA-CAGCAGTAACCTCTGCAGACTTAAA-TGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCCAGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACCCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGGTCCTGTCTGTTAGAAGGAAAACTAACAAAC-AGAAAGGACATCC-ACACCA-AAAACCCATCTGTACATCACCATCATCAAAGACCAAAAGTAGATAAAACCACAAAGATGGGGAAAAAACAGAACAGAAAAACTGGAAACTCTAAAAA-GCAGAGCGCCTCTCCTCCTCCAAAGGAACGCAGTTCCTCACCAGCAACG-GAACAAAGCTGGATGGAGAATGACTT-
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||  |||||||||||||||||||||| || ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||| ||||||||||| | |||||  |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| |||| ||||||||||||||||

## Generate with species prompt

In [7]:
# copy and pasted from utils.py
def make_phylotag_from_gbif(
        species_name: str,
) -> str:
    """
    Build an upper-case phylogenetic tag for a species from GBIF taxonomy.

    The species name is looked up through the GBIF species-match endpoint and
    the returned ranks are joined into one string of the form
    ``|D__<KINGDOM>;P__<PHYLUM>;C__<CLASS>;O__<ORDER>;F__<FAMILY>;G__<GENUS>;S__<SPECIES>|``.
    A rank that GBIF omits is rendered as ``NONE`` as long as at least one
    other rank is present.

    Args:
        species_name: Name to look up, e.g. ``'Phascolarctos cinereus'``.

    Returns:
        The tag string, upper-cased and wrapped in ``|`` characters.

    Raises:
        ValueError: If GBIF does not answer with HTTP 200, or answers without
            any of the taxonomic ranks (no match for the name).
    """
    # Imported locally so the helper stays self-contained when it is copied
    # between files.
    import requests

    ranks = ("kingdom", "phylum", "class", "order", "family", "genus", "species")
    prefixes = ("d", "p", "c", "o", "f", "g", "s")

    # Passing the name through `params` lets requests URL-encode it (species
    # names contain spaces); the timeout keeps a stalled connection from
    # hanging the notebook.
    response = requests.get(
        "https://api.gbif.org/v1/species/match",
        params={"name": species_name},
        timeout=30,
    )
    if response.status_code != 200:
        raise ValueError(f"Could not find taxonomy for {species_name}")

    data = response.json()
    taxonomy = {rank: data.get(rank) for rank in ranks}
    # A response carrying none of the ranks would otherwise be formatted into
    # a meaningless all-'NONE' tag.
    if not any(taxonomy.values()):
        raise ValueError(f"Could not find taxonomy for {species_name}")

    phylo_tag = ';'.join(
        f'{prefix}__{taxonomy[rank]}' for prefix, rank in zip(prefixes, ranks)
    )
    return f'|{phylo_tag}|'.upper()



In [8]:
# Organism whose taxonomy is turned into the prompt
species = 'Phascolarctos cinereus'  # Koala

# Build the phylogenetic tag that serves as the generation prompt
species_tag_prompt = make_phylotag_from_gbif(species)

# Inspect the printed tag to confirm that the GBIF API returned a valid one
print(f"Species tag prompt: {species_tag_prompt}")

# Generate a sequence from the tag prompt alone
koala_sequence = evo2_generate(
    species_tag_prompt,
    model="evo2-40b",
    num_tokens=500,
    temperature=1,
    seed=123,
    enable_sampled_probs=False,
    enable_logits=False,
)

print("Generated koala sequence:", koala_sequence["sequence"])

Species tag prompt: |D__ANIMALIA;P__CHORDATA;C__MAMMALIA;O__DIPROTODONTIA;F__PHASCOLARCTIDAE;G__PHASCOLARCTOS;S__PHASCOLARCTOS CINEREUS|
Generated koala sequence: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
