# EVO2 Predictions on Genome FASTA

This notebook reads FASTA file(s) in the `data/` directory, prepares sequences with lengths from 128 up to 1,048,576 (doubling each step), sends each sequence to the EVO2 model to generate 128 characters, and compares the generated 128 characters to the next 128 characters from the FASTA genome.

Notes:
- Set the `NVCF_RUN_KEY` environment variable in your container or paste it when prompted.
- The example uses the NIM HTTP endpoint; be mindful of rate limits and cost.
- If the FASTA genome is shorter than a required size, it is repeated (wrapped around) to reach the length needed for that test.

In [7]:
# Imports and configuration
import csv
import getpass
import math
import os
import time
from pathlib import Path

import requests
from Bio import SeqIO

# API configuration
# Prefer the key from the environment. When prompting, use getpass so the
# secret is not echoed into the cell output (and saved with the notebook).
# strip() drops whitespace/newlines that commonly come along with a paste.
KEY = (os.getenv("NVCF_RUN_KEY") or getpass.getpass("Paste the Run Key (NVCF_RUN_KEY): ")).strip()
if not KEY:
    raise ValueError("NVCF_RUN_KEY is empty; set the environment variable or paste a key when prompted.")
URL = os.getenv("URL", "https://health.api.nvidia.com/v1/biology/arc/evo2-40b/generate")
HEADERS = {"Authorization": f"Bearer {KEY}"}

# Paths: use /workspace when it exists (container layout), otherwise the
# current working directory.
WORKSPACE = Path('/workspace') if Path('/workspace').exists() else Path('.')
DATA_DIR = WORKSPACE / 'data'
RESULTS_DIR = WORKSPACE / 'results'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Print to confirm: show the value before and after forcing it to 600.
# NOTE(review): call_evo2 uses its own `timeout` argument and does not read
# this variable — confirm which component is expected to consume it.
print("NIM_EVO2_TIMEOUT_S in notebook:", os.environ.get("NIM_EVO2_TIMEOUT_S"))
os.environ["NIM_EVO2_TIMEOUT_S"] = "600"
print("NIM_EVO2_TIMEOUT_S in notebook:", os.environ.get("NIM_EVO2_TIMEOUT_S"))


NIM_EVO2_TIMEOUT_S in notebook: 600
NIM_EVO2_TIMEOUT_S in notebook: 600


In [8]:
# Load FASTA(s) from data directory and concatenate all sequences into a single genome string.
# The paths are sorted because glob() yields entries in filesystem order; a
# stable order keeps `genome` (and every slice taken from it) identical
# between runs when more than one file is present.
fasta_paths = sorted(list(DATA_DIR.glob('*.fasta')) + list(DATA_DIR.glob('*.fa')))
if not fasta_paths:
    raise FileNotFoundError(f'No FASTA files found in {DATA_DIR.resolve()}')

# One string per FASTA record, in file order then record order.
records = [
    str(rec.seq)
    for p in fasta_paths
    for rec in SeqIO.parse(str(p), 'fasta')
]

genome = ''.join(records).upper()
if not genome:
    # Later cells divide by len(genome) when wrapping short genomes, so an
    # empty genome must be rejected here with a clear message.
    raise ValueError(f'FASTA file(s) in {DATA_DIR.resolve()} contain no sequence data')
print(f'Loaded genome length: {len(genome)} from {len(fasta_paths)} fasta file(s)')

Loaded genome length: 119668634 from 1 fasta file(s)


In [9]:
# Helper: call EVO2 model via NIM HTTP endpoint
import json

def call_evo2(sequence, num_tokens=128, top_k=1, enable_sampled_probs=True, url=URL, headers=HEADERS, timeout=120):
    """POST a generation request to the EVO2 NIM HTTP endpoint.

    The request body carries `sequence`, `num_tokens`, `top_k` and
    `enable_sampled_probs`. A one-line summary is printed before every call,
    and for a status outside 2xx the first 2000 characters of the response
    body are printed as well.

    Returns:
        The decoded JSON body; ``{'text': <raw body>}`` when the body is not
        valid JSON; ``None`` when `requests` raises a ``RequestException``
        (this includes the HTTP error raised for 4xx/5xx statuses).
    """
    payload = dict(
        sequence=sequence,
        num_tokens=num_tokens,
        top_k=top_k,
        enable_sampled_probs=enable_sampled_probs,
    )
    try:
        print(f"[call_evo2] seq_len={len(sequence)}, num_tokens={num_tokens}, payload_keys={list(payload.keys())}")
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = response.status_code
        if status < 200 or status >= 300:
            # Dump (a prefix of) the body first so the failure can be diagnosed,
            # then let requests raise for 4xx/5xx; the outer handler reports it.
            raw_body = response.text
            shown = raw_body[:2000] if isinstance(raw_body, str) else str(raw_body)
            print(f"[call_evo2] HTTP {status} response body (truncated 2000 chars):\n{shown}")
            response.raise_for_status()
        try:
            parsed = response.json()
        except ValueError:
            # Body is not JSON: hand the raw text back under a key so the
            # caller can still inspect it.
            print('[call_evo2] Response not JSON; returning text in dict')
            return {'text': response.text}
        return parsed
    except requests.exceptions.RequestException as err:
        print('Request error:', err)
        return None

# Helper: compare predicted string to truth string
def compare_sequences(pred, truth):
    """Score position-by-position agreement between `pred` and `truth`.

    Both inputs are treated as '' when falsy, stripped of surrounding
    whitespace, and have newline characters removed. Only the overlapping
    prefix (the shorter of the two lengths) is compared.

    Returns a dict with:
        'length'   - number of positions compared (or len(pred) after
                     cleaning when there is no overlap at all),
        'matches'  - count of identical positions,
        'accuracy' - matches / length compared (0.0 when nothing overlaps).
    """
    newline = chr(10)  # literal newline, spelled via chr() as in the rest of the notebook
    cleaned_pred = (pred or '').strip().replace(newline, '')
    cleaned_truth = (truth or '').strip().replace(newline, '')

    overlap = min(len(cleaned_pred), len(cleaned_truth))
    if not overlap:
        return {'length': len(cleaned_pred), 'matches': 0, 'accuracy': 0.0}

    # zip() stops at the shorter string, i.e. exactly `overlap` positions.
    hits = sum(p == t for p, t in zip(cleaned_pred, cleaned_truth))
    return {'length': overlap, 'matches': hits, 'accuracy': hits / overlap}


In [10]:
# Chunking helper
from typing import Iterator

def chunk_sequence(seq: str, chunk_size: int) -> Iterator[str]:
    """Lazily split `seq` into consecutive pieces of at most `chunk_size` items.

    Every piece except possibly the last has exactly `chunk_size` items; an
    empty `seq` yields nothing. Because this is a generator, the ValueError
    for a non-positive `chunk_size` is raised on first iteration, not at call
    time.

    Example:
        list(chunk_sequence('ABCDEFG', 3)) -> ['ABC', 'DEF', 'G']
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be > 0')
    starts = range(0, len(seq), chunk_size)
    yield from (seq[begin:begin + chunk_size] for begin in starts)

# Quick non-network test helper
def _debug_show_chunks(seq: str, chunk_size: int):
    """Split `seq` with chunk_sequence, print the chunk count and the length
    of each chunk, and return the chunks as a list (no network access)."""
    pieces = [piece for piece in chunk_sequence(seq, chunk_size)]
    print('num_chunks=', len(pieces))
    print('chunk lengths=', list(map(len, pieces)))
    return pieces


In [11]:
# Encapsulated runner with explicit chunking and retries
import itertools

def _find_first_string(node):
    """Return the first non-empty string found depth-first in nested dicts/lists, else None."""
    if isinstance(node, str):
        return node
    if isinstance(node, dict):
        children = node.values()
    elif isinstance(node, list):
        children = node
    else:
        return None
    for child in children:
        found = _find_first_string(child)
        if found:
            return found
    return None


def _extract_generated(resp):
    """Pull the generated text out of a call_evo2 response ('' when none is found)."""
    if not isinstance(resp, dict):
        return ''
    generated = ''
    # Try the well-known top-level keys first ...
    for key in ('sequence', 'generated_sequence', 'generated', 'text'):
        value = resp.get(key)
        if isinstance(value, str):
            generated = value
            break
    # ... then fall back to the first non-empty string anywhere in the payload.
    return generated or _find_first_string(resp) or ''


def run_evo2_tests(max_chunk=2048, sizes=None, start_index=0, out_csv_path=None, sleep_between=1, max_retries=2):
    """Run EVO2 next-128-base predictions for several context sizes and log them to CSV.

    For each size, `size` bases starting at `start_index` are taken from the
    global `genome` as context and the following 128 bases are the truth. The
    model's first 128 generated characters are scored with compare_sequences
    and one row is appended to the CSV (the file is rewritten with a fresh
    header at the start of every run).

    - max_chunk: maximum number of context bases sent in one request. When a
      test size exceeds this value, only the LAST `max_chunk` bases (those
      immediately preceding the truth region) are sent, and the CSV note is
      'context_truncated_to_<max_chunk>'. Sending earlier chunks separately
      would produce continuations of other genome positions, which cannot be
      compared with the truth region.
    - sizes: list of sizes to test (if None, uses powers of two from 128..1,048,576).
    - start_index: offset into the genome to start each test (must be >= 0).
    - out_csv_path: Path object or string for results CSV (default uses RESULTS_DIR).
    - sleep_between: seconds to sleep after each test's request (default 1s).
    - max_retries: number of tries per request before giving up (at least one
      attempt is always made).

    Failed requests are recorded with zeroed metrics and note 'chunk_failed'
    (truncated context) or 'request_failed' (full context); the run then
    continues with the next size.

    Raises:
        ValueError: if max_chunk <= 0, start_index < 0, or the genome is empty.
    """
    if max_chunk <= 0:
        raise ValueError('max_chunk must be > 0')
    if start_index < 0:
        raise ValueError('start_index must be >= 0')
    if not genome:
        raise ValueError('genome is empty')

    gen_len = 128  # bases requested from the model and compared against the genome

    if sizes is None:
        sizes = [2 ** exp for exp in range(7, 21)]  # 128 .. 1,048,576
    print(f"Running tests for sizes: {sizes}")

    if out_csv_path is None:
        out_csv_path = RESULTS_DIR / 'evo2_predictions_results.csv'
    else:
        out_csv_path = Path(out_csv_path)

    fieldnames = ['size_sent', 'pred_len', 'matches', 'accuracy', 'note']

    # prepare CSV (header only); rows are appended one at a time so partial
    # results survive an interrupted run
    with open(out_csv_path, 'w', newline='') as csvfile:
        csv.DictWriter(csvfile, fieldnames=fieldnames).writeheader()

    def append_row(size, pred_len, matches, accuracy, note):
        """Append a single result row to the CSV."""
        with open(out_csv_path, 'a', newline='') as csvfile:
            csv.DictWriter(csvfile, fieldnames=fieldnames).writerow({
                'size_sent': size,
                'pred_len': pred_len,
                'matches': matches,
                'accuracy': accuracy,
                'note': note,
            })

    attempts = max(1, max_retries)  # never end up with zero attempts

    def safe_call(seq, num_tokens=gen_len):
        """Call EVO2 with retries; raise the last error when every attempt fails."""
        last_exc = None
        for attempt in range(1, attempts + 1):
            try:
                resp = call_evo2(seq, num_tokens=num_tokens)
                if resp is None:
                    raise RuntimeError('Empty/None response')
                return resp
            except Exception as e:
                print(f"  Attempt {attempt}/{attempts} failed: {e}")
                last_exc = e
                if attempt < attempts:
                    # linear backoff; pointless after the final attempt
                    time.sleep(0.5 * attempt)
        print("  All retries failed for this chunk.")
        raise last_exc

    for size in sizes:
        context_end = start_index + size
        # The context AND the truth region must both fit, counted from
        # start_index; otherwise wrap (repeat) the genome.
        needed = context_end + gen_len
        if len(genome) < needed:
            repeats = (needed // len(genome)) + 1
            biggenome = genome * repeats
        else:
            biggenome = genome

        seq_to_send = biggenome[start_index:context_end]
        truth_seq = biggenome[context_end:needed]

        print(f'--- Size {size}: sending sequence length {len(seq_to_send)} to model')

        truncated = len(seq_to_send) > max_chunk
        if truncated:
            # Keep the window that directly precedes the truth region so the
            # model's continuation lines up with truth_seq.
            context = seq_to_send[-max_chunk:]
            note = f'context_truncated_to_{max_chunk}'
            print(f'Chunking enabled. Sending only the last {len(context)} bases (max_chunk={max_chunk})')
        else:
            context = seq_to_send
            note = ''
            print('No chunking needed for this size')

        try:
            resp = safe_call(context, num_tokens=gen_len)
        except Exception as e:
            fail_note = 'chunk_failed' if truncated else 'request_failed'
            print('Request failed:', e)
            append_row(size, 0, 0, 0.0, fail_note)
            continue

        pred = _extract_generated(resp).replace(chr(10), '')[:gen_len]

        comp = compare_sequences(pred, truth_seq)
        print("Size {}: pred_len={}, matches={}, accuracy={:.4f}".format(size, comp['length'], comp['matches'], comp['accuracy']))

        append_row(size, comp['length'], comp['matches'], comp['accuracy'], note)

        # Gentle delay between requests on every path, as documented.
        time.sleep(sleep_between)

    print('\nAll tests complete. Results saved to', out_csv_path)

# To run the tests, call: run_evo2_tests(max_chunk=2048)


In [12]:
# Prepare sizes: start at 128 and double until 1,048,576 (2^20)
# Prepare sizes: start at 256 and double until 1,048,576 (2^20)
sizes = []
L = 256
while L <= 1048576:
    sizes.append(L)
    L *= 2
print('Sizes to test:', sizes)

# Main loop: for each size, take `size` bases starting at start_index, ask model
# to generate 128 bases, compare to the next 128 bases
start_index = 0  # you can modify this to sample different positions
OUT_CSV = RESULTS_DIR / 'evo2_predictions_results.csv'
FIELDNAMES = ['size_sent', 'pred_len', 'matches', 'accuracy', 'note']


def find_string(d):
    """Return the first non-empty string found depth-first in nested dicts/lists, else None."""
    if isinstance(d, str):
        return d
    if isinstance(d, dict):
        for v in d.values():
            s = find_string(v)
            if s:
                return s
    if isinstance(d, list):
        for v in d:
            s = find_string(v)
            if s:
                return s
    return None


# CSV header (the file is rewritten on every run; rows are appended below so
# partial results survive an interrupted run)
with open(OUT_CSV, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
    writer.writeheader()

for size in sizes:
    # We need the sequence and the following 128 bases to compare against,
    # both counted from start_index (not from 0).
    needed = start_index + size + 128
    if len(genome) < needed:
        # repeat (wrap) the genome to make enough length
        repeats = (needed // len(genome)) + 2
        biggenome = (genome * repeats)
    else:
        biggenome = genome

    seq_to_send = biggenome[start_index:start_index + size]
    truth_seq = biggenome[start_index + size:start_index + size + 128]

    print(f'--- Size {size}: sending sequence length {len(seq_to_send)} to model')

    # Call EVO2 - be mindful: large payloads can be slow or restricted by the service
    resp = call_evo2(seq_to_send, num_tokens=128)
    if resp is None:
        # Record the failure and stop: larger sizes are not attempted.
        note = 'request_failed'
        print('Request failed, stopping further tests.')
        with open(OUT_CSV, 'a', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
            writer.writerow({'size_sent': size, 'pred_len': 0, 'matches': 0, 'accuracy': 0.0, 'note': note})
        break

    # The response format may vary. Typical key could be 'sequence' or similar. Handle common variants.
    generated = ''
    if isinstance(resp, dict):
        # try common keys
        for key in ('sequence', 'generated_sequence', 'generated', 'text'):
            if key in resp and isinstance(resp[key], str):
                generated = resp[key]
                break
        # If the API returns nested structures, fall back to the first
        # non-empty string anywhere in the JSON response
        if not generated:
            found = find_string(resp)
            if found:
                generated = found

    generated = (generated or '').replace(chr(10), '')
    pred = generated[:128]  # only compare the first 128 generated characters

    comp = compare_sequences(pred, truth_seq)
    print("Size {}: pred_len={}, matches={}, accuracy={:.4f}".format(size, comp['length'], comp['matches'], comp['accuracy']))

    with open(OUT_CSV, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        writer.writerow({'size_sent': size, 'pred_len': comp['length'], 'matches': comp['matches'], 'accuracy': comp['accuracy'], 'note': ''})

    # Gentle delay to avoid rate limits (adjust as needed)
    time.sleep(1)

print('\nAll tests complete. Results saved to', OUT_CSV)

# Quick non-network sanity check of the chunking helper
test_seq = genome[:4096]
chunks = list(chunk_sequence(test_seq, 2048))
print('num_chunks=', len(chunks))
print('chunk lengths=', [len(c) for c in chunks])

Sizes to test: [256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576]
--- Size 256: sending sequence length 256 to model
[call_evo2] seq_len=256, num_tokens=128, payload_keys=['sequence', 'num_tokens', 'top_k', 'enable_sampled_probs']
Size 256: pred_len=128, matches=56, accuracy=0.4375
--- Size 512: sending sequence length 512 to model
[call_evo2] seq_len=512, num_tokens=128, payload_keys=['sequence', 'num_tokens', 'top_k', 'enable_sampled_probs']
Size 512: pred_len=128, matches=47, accuracy=0.3672
--- Size 1024: sending sequence length 1024 to model
[call_evo2] seq_len=1024, num_tokens=128, payload_keys=['sequence', 'num_tokens', 'top_k', 'enable_sampled_probs']
Size 1024: pred_len=128, matches=43, accuracy=0.3359
--- Size 2048: sending sequence length 2048 to model
[call_evo2] seq_len=2048, num_tokens=128, payload_keys=['sequence', 'num_tokens', 'top_k', 'enable_sampled_probs']
Size 2048: pred_len=128, matches=62, accuracy=0.4844
--- Size 4096: sendi

### Notes & Next steps
- The service may enforce limits on request size; if you get errors for large `size`, consider smaller maximum sizes or chunking strategies.
- You can randomize `start_index` to sample different genome locations.
- For production scale tests, add robust retry/backoff and monitor API quotas/costs.