In [2]:
from pathlib import Path
import pandas as pd

virus = 'Arena'

# Define the folder and file name
folder = Path("/Users/ishaharris/Projects/ribolings/data/virus/codes/rna")  # adjust to your folder
file = f"{virus}_codes.csv"

# Build the full path using /
file_path = folder / file

# Read CSV into a pandas DataFrame
df = pd.read_csv(file_path)

# Show first few rows
print(df.head())


KeyboardInterrupt: 

In [None]:
print(file_path)

Users/ishaharris/Projects/ribolings/data/virus/codes/rna/Arena_codes.csv


In [None]:
# === Retrieve GenBank CDS FASTAs by accession (hard-coded API key + robust skip) ===
from __future__ import annotations

import time
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd
from Bio import Entrez

# --------------- CONFIG (edit these) ---------------
# REQUIRED: set your real email and API key (hard-coded here)
ENTREZ_EMAIL   = "inah2@cam.ac.uk"
ENTREZ_API_KEY = None
#ENTREZ_API_KEY = "fe0657db7c3e1518b265167f026e11288a07"   # <-- put your key here

# Input directory with CSVs like "Sedereo_codes.csv" containing a column named "Codes"
INPUT_DIR  = Path("/Users/ishaharris/Projects/ribolings/data/virus/codes/dna")

# Output directory for combined FASTAs + failed accession logs
OUTPUT_DIR = Path("/Users/ishaharris/Projects/ribolings/data/virus/dna_cds_fasta/")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Set to a list (e.g., ["Sedereo","Birna"]) to restrict; or leave as None to auto-discover all *_codes.csv
VIRUS_FAMILIES: Optional[List[str]] = None

# Gentle rate-limiting between NCBI calls (seconds). With API key, you can go to ~0.1s safely.
SLEEP = 0.2
# ------------- /CONFIG ----------------

# Apply Entrez credentials
Entrez.email = ENTREZ_EMAIL
Entrez.api_key = ENTREZ_API_KEY

# ---------- Helpers ----------
def fasta_record_count(path: Path) -> int:
    """Count records in an existing FASTA (lines starting with '>'). Returns 0 if file missing or empty."""
    if not path.exists() or path.stat().st_size == 0:
        return 0
    try:
        cnt = 0
        with path.open("r") as f:
            for line in f:
                if line.startswith(">"):
                    cnt += 1
        return cnt
    except Exception:
        return 0

def entrez_search_nuccore(accession: str) -> List[str]:
    """Return a list of nuccore IDs for the accession (may be empty). Uses XML (default) so Entrez.read works."""
    query = f"{accession}[accession]"
    with Entrez.esearch(db="nuccore", term=query) as handle:   # XML by default
        data = Entrez.read(handle)                              # parse XML
    return data.get("IdList", []) or []

def entrez_fetch_fasta_cds_na(nuccore_id: str) -> str:
    """Fetch FASTA (CDS nucleotides) text for a single nuccore id."""
    with Entrez.efetch(db="nuccore", id=nuccore_id, rettype="fasta_cds_na", retmode="text") as handle:
        return handle.read()

def retrieve_cds_sequences(accession: str) -> Optional[List[str]]:
    """
    Search by accession; fetch rettype=fasta_cds_na for the first hit.
    Returns list of 'header\\nsequence' chunks (without leading '>'), or None if not found.
    """
    ids = entrez_search_nuccore(accession)
    time.sleep(SLEEP)
    if not ids:
        return None
    blob = entrez_fetch_fasta_cds_na(ids[0])
    time.sleep(SLEEP)
    # Split on '>' and remove internal newlines (parity with R's gsub)
    parts = [p.replace("\n", "") for p in blob.split(">") if p.strip()]
    return parts or None

def process_fasta_chunks(fasta_chunks: List[str]) -> Tuple[List[str], List[str]]:
    """
    Each chunk looks like: "<header fields...>[gbkey=CDS]<sequence>"
    We split on "[gbkey=CDS]" and reconstruct:
      metadata = "<header fields...>[gbkey=CDS]"
      nucleotides = "<sequence>"
    """
    metas, seqs = [], []
    for chunk in fasta_chunks:
        parts = chunk.split("[gbkey=CDS]")
        if len(parts) < 2:
            continue
        meta = (parts[0] + "[gbkey=CDS]").strip()
        seq  = parts[1].strip().replace(" ", "")
        if seq:
            metas.append(meta)
            seqs.append(seq)
    return metas, seqs

def write_fasta(headers: List[str], seqs: List[str], out_path: Path, line_width: int = 80) -> None:
    """Write a multi-record FASTA; each header goes after '>'."""
    assert len(headers) == len(seqs)
    def wrap(s: str, w: int) -> List[str]:
        return [s[i:i+w] for i in range(0, len(s), w)]
    with out_path.open("w") as f:
        for h, s in zip(headers, seqs):
            f.write(f">{h}\n")
            for line in wrap(s.upper(), line_width):
                f.write(line + "\n")

def discover_families(input_dir: Path) -> List[str]:
    """Find all CSVs matching *_codes.csv and return family names."""
    fams = []
    for p in input_dir.glob("*_codes.csv"):
        name = p.stem
        if name.endswith("_codes"):
            fams.append(name[:-6])  # strip suffix "_codes"
    return sorted(set(fams))

def process_family(family: str) -> None:
    print(f"\n=== {family} ===")

    # Robust skip: only skip if FASTA exists AND has at least 1 record
    out_fa = OUTPUT_DIR / f"{family}.fasta"
    n_existing = fasta_record_count(out_fa)
    if n_existing > 0:
        print(f"[SKIP] {family} already processed: {out_fa} ({n_existing} records).")
        return
    elif out_fa.exists():
        print(f"[INFO] {out_fa} exists but contains 0 records — reprocessing.")

    csv_path = INPUT_DIR / f"{family}_codes.csv"
    if not csv_path.exists():
        print(f"[WARN] Missing: {csv_path}")
        return

    df = pd.read_csv(csv_path)
    # Be tolerant to slight column naming differences (e.g. 'codes')
    col = None
    for cand in ["Codes", "codes", "ACCESSION", "accession", "Accession"]:
        if cand in df.columns:
            col = cand
            break
    if col is None:
        raise ValueError(f"No accession column found in {csv_path}. Columns present: {list(df.columns)}")

    accessions = (
        df[col]
        .dropna()
        .astype(str)
        .str.strip()
        .replace("", pd.NA)
        .dropna()
        .unique()
    )
    print(f"Found {len(accessions)} unique accessions in {csv_path.name}")

    if len(accessions) == 0:
        print("[WARN] No accessions after cleaning; nothing to do.")
        return

    all_chunks: List[str] = []
    failed: List[str] = []

    for i, acc in enumerate(accessions, 1):
        try:
            chunks = retrieve_cds_sequences(acc)
            if chunks:
                all_chunks.extend(chunks)
            else:
                failed.append(acc)
        except Exception as e:
            print(f"[ERR] {acc}: {e}")
            failed.append(acc)

        if i % 25 == 0 or i == len(accessions):
            print(f"  …processed {i}/{len(accessions)}")

    if not all_chunks:
        print("[INFO] No CDS records returned for this family.")
    else:
        metas, seqs = process_fasta_chunks(all_chunks)
        print(f"Writing {len(seqs)} CDS entries to FASTA")
        write_fasta(metas, seqs, out_fa)
        final_cnt = fasta_record_count(out_fa)
        print(f"[OK] Wrote {out_fa} with {final_cnt} records.")

    # Log failures (if any)
    fail_path = OUTPUT_DIR / f"{family}_failed_accessions.txt"
    if failed:
        with fail_path.open("w") as f:
            f.write("\n".join(failed) + "\n")
        print(f"[WARN] {len(failed)} accessions failed. Logged to {fail_path}")
    else:
        # If a previous fail file exists but now there are none, remove to keep tidy
        if fail_path.exists():
            try:
                fail_path.unlink()
            except Exception:
                pass
        print("[OK] No failed accessions.")

# ----------------- Run in Jupyter -----------------
if VIRUS_FAMILIES is None:
    families = discover_families(INPUT_DIR)
    print(f"Discovered {len(families)} families from {INPUT_DIR}:", families)
else:
    families = VIRUS_FAMILIES
    print("Using specified families:", families)

for fam in families:
    process_family(fam)

print("\nDone.")



KeyboardInterrupt



tried to fix the metadata, also check for api key

In [None]:
# === Retrieve GenBank CDS FASTAs by accession (robust FASTA parsing + checks) ===
from __future__ import annotations

import time, re
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd
from Bio import Entrez
from urllib.error import HTTPError

# --------------- CONFIG (edit these) ---------------
ENTREZ_EMAIL   = "inah2@cam.ac.uk"
ENTREZ_API_KEY = None  # or paste your key string here e.g. "fe0657db7c3e1518b265167f026e11288a07"

# Input CSVs like "Sedereo_codes.csv" with a column of accessions (e.g. "Codes")
INPUT_DIR  = Path("/Users/ishaharris/Projects/ribolings/data/virus/codes/rna")

# Output folder for combined FASTAs + logs
OUTPUT_DIR = Path("/Users/ishaharris/Projects/ribolings/data/virus/cds_fasta/")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Families to process (None = auto-discover all *_codes.csv in INPUT_DIR)
VIRUS_FAMILIES: Optional[List[str]] = None

# Throttle between requests (0.2s is fine without a key; 0.1–0.2s with a key)
SLEEP = 0.2

# Progress cadence and skip rule
PROGRESS_EVERY = 25
MIN_RECORDS_TO_SKIP = 1
# ------------- /CONFIG ----------------

# Apply Entrez credentials + validate API key
Entrez.email = ENTREZ_EMAIL
_key = (ENTREZ_API_KEY or "")
_key = _key.strip() if isinstance(_key, str) else ""
if _key and re.fullmatch(r"[A-Za-z0-9\-]{20,80}", _key):
    Entrez.api_key = _key
    print(f"[INFO] Using NCBI API key (len={len(_key)}).")
else:
    Entrez.api_key = None
    if _key:
        print(f"[WARN] API key looks invalid (len={len(_key)}). Not using it.")
    else:
        print("[INFO] No API key set; using default quota (~3 req/s).")

# ---------- Helpers ----------
def fasta_record_count(path: Path) -> int:
    """Count FASTA records (lines starting with '>')."""
    try:
        if not path.exists() or path.stat().st_size == 0:
            return 0
        return sum(1 for line in path.open() if line.startswith(">"))
    except Exception:
        return 0

def entrez_search_nuccore(accession: str) -> List[str]:
    """Search nuccore by accession; return a list of IDs (may be empty)."""
    query = f"{accession}[accession]"
    with Entrez.esearch(db="nuccore", term=query) as handle:  # XML default
        data = Entrez.read(handle)
    return data.get("IdList", []) or []

def efetch_cds(nuccore_id: str) -> Optional[str]:
    """Fetch CDS nucleotide FASTA text for a single nuccore id; return None if not available."""
    try:
        with Entrez.efetch(db="nuccore", id=nuccore_id, rettype="fasta_cds_na", retmode="text") as h:
            return h.read()
    except HTTPError as e:
        # 400 typically means no CDS view for this record (e.g., wrong type / no CDS features)
        if e.code == 400:
            return None
        raise

def retrieve_cds_records(accession: str) -> Optional[List[str]]:
    """
    Search by accession; fetch rettype=fasta_cds_na for the first hit.
    Return a list of raw FASTA records (each starting with '>'), or None if none.
    """
    ids = entrez_search_nuccore(accession)
    time.sleep(SLEEP)
    if not ids:
        return None

    blob = efetch_cds(ids[0])
    time.sleep(SLEEP)
    if not blob:
        return None

    # Split into individual FASTA records; keep leading '>'
    records = [">" + chunk for chunk in blob.split("\n>") if chunk.strip()]
    return records or None

def parse_fasta_records(fasta_records: List[str]) -> Tuple[List[str], List[str]]:
    """
    Robust FASTA parsing:
      - header = first line after '>'
      - sequence = concat of remaining lines with whitespace removed
    Returns (headers, sequences).
    """
    headers, seqs = [], []
    for rec in fasta_records:
        rec = rec.lstrip()
        if not rec.startswith(">"):
            continue
        lines = rec.splitlines()
        header = lines[0][1:].strip()  # drop '>'
        seq = "".join(lines[1:]).upper()
        # remove whitespace and carriage returns
        seq = re.sub(r"\s+", "", seq)
        if seq:
            headers.append(header)
            seqs.append(seq)
    return headers, seqs

def write_fasta(headers: List[str], seqs: List[str], out_path: Path, line_width: int = 80) -> None:
    """Write a multi-record FASTA; each header goes after '>'."""
    assert len(headers) == len(seqs)
    def wrap(s: str, w: int) -> List[str]:
        return [s[i:i+w] for i in range(0, len(s), w)]
    with out_path.open("w") as f:
        for h, s in zip(headers, seqs):
            f.write(f">{h}\n")
            for line in wrap(s, line_width):
                f.write(line + "\n")

def discover_families(input_dir: Path) -> List[str]:
    """Auto-discover all families with *_codes.csv files."""
    fams = []
    for p in input_dir.glob("*_codes.csv"):
        name = p.stem
        if name.endswith("_codes"):
            fams.append(name[:-6])
    return sorted(set(fams))

def process_family(family: str) -> None:
    print(f"\n=== {family} ===")
    out_fa = OUTPUT_DIR / f"{family}.fasta"

    # Skip if we already have a non-empty FASTA
    n_existing = fasta_record_count(out_fa)
    if n_existing >= MIN_RECORDS_TO_SKIP:
        print(f"[SKIP] {family} already processed: {out_fa} ({n_existing} records).")
        return
    elif out_fa.exists():
        print(f"[INFO] {out_fa} exists but has {n_existing} records — reprocessing.")

    csv_path = INPUT_DIR / f"{family}_codes.csv"
    if not csv_path.exists():
        print(f"[WARN] Missing: {csv_path}")
        return

    df = pd.read_csv(csv_path)
    col = next((c for c in ["Codes","codes","ACCESSION","accession","Accession"] if c in df.columns), None)
    if col is None:
        raise ValueError(f"No accession column found in {csv_path}. Columns present: {list(df.columns)}")

    accessions = (
        df[col]
        .dropna()
        .astype(str)
        .str.strip()
        .replace("", pd.NA)
        .dropna()
        .unique()
    )
    total = len(accessions)
    print(f"Found {total} unique accessions in {csv_path.name}")
    if total == 0:
        print("[WARN] No accessions after cleaning; nothing to do.")
        return

    # Fetch + parse
    all_records: List[str] = []
    failed: List[str] = []
    t0 = time.time()
    processed = 0

    for acc in accessions:
        try:
            recs = retrieve_cds_records(acc)
            if recs:
                all_records.extend(recs)
            else:
                failed.append(acc)
        except Exception as e:
            print(f"[ERR] {acc}: {e}")
            failed.append(acc)

        processed += 1
        if (processed % PROGRESS_EVERY == 0) or (processed == total):
            rate_total = processed / max(1e-9, (time.time() - t0))
            print(f"  …processed {processed}/{total} (~{rate_total:.2f} acc/s overall)")

    elapsed = time.time() - t0
    if elapsed > 0:
        print(f"[STATS] {family}: {processed} accessions in {elapsed:.1f}s ≈ {processed/elapsed:.2f} acc/s")

    if not all_records:
        print("[INFO] No CDS records returned for this family.")
        # still log failures if any
        if failed:
            fail_path = OUTPUT_DIR / f"{family}_failed_accessions.txt"
            with fail_path.open("w") as f:
                f.write("\n".join(failed) + "\n")
            print(f"[WARN] {len(failed)} accessions failed. Logged to {fail_path}")
        return

    # Parse into headers + sequences
    metas, seqs = parse_fasta_records(all_records)
    print(f"Writing {len(seqs)} CDS entries to FASTA")
    write_fasta(metas, seqs, out_fa)
    print(f"[OK] Wrote {out_fa} with {fasta_record_count(out_fa)} records.")

    # ---- Sanity checks ----
    # 1) Non-standard metadata: headers missing [gbkey=CDS]
    nonstandard = [m for m in metas if "[gbkey=CDS]" not in m]
    if nonstandard:
        warn_path = OUTPUT_DIR / f"{family}_nonstandard_headers.txt"
        with warn_path.open("w") as f:
            f.write("\n".join(nonstandard) + "\n")
        print(f"[WARN] {len(nonstandard)} headers missing [gbkey=CDS]. Logged to {warn_path}")
    else:
        print("[OK] All headers contained [gbkey=CDS].")

    # 2) Non-ACGTN characters in sequences (e.g., ambiguity beyond N)
    bad_seq_idx = [i for i, s in enumerate(seqs) if re.search(r"[^ACGTN]", s)]
    if bad_seq_idx:
        bad_path = OUTPUT_DIR / f"{family}_nonACGTN_sequences.txt"
        with bad_path.open("w") as f:
            for i in bad_seq_idx:
                f.write(f">{metas[i]}\n{seqs[i]}\n")
        print(f"[WARN] {len(bad_seq_idx)} sequences contain chars outside A/C/G/T/N. Logged to {bad_path}")
    else:
        print("[OK] All sequences A/C/G/T/N only.")

    # Log failures (if any)
    if failed:
        fail_path = OUTPUT_DIR / f"{family}_failed_accessions.txt"
        with fail_path.open("w") as f:
            f.write("\n".join(failed) + "\n")
        print(f"[WARN] {len(failed)} accessions with no CDS returned. Logged to {fail_path}")
    else:
        cleanup = OUTPUT_DIR / f"{family}_failed_accessions.txt"
        if cleanup.exists():
            try: cleanup.unlink()
            except Exception: pass
        print("[OK] No failed accessions.")

# ----------------- Run -----------------
if VIRUS_FAMILIES is None:
    families = discover_families(INPUT_DIR)
    print(f"Discovered {len(families)} families from {INPUT_DIR}:", families)
else:
    families = VIRUS_FAMILIES
    print("Using specified families:", families)

for fam in families:
    process_family(fam)

print("\nDone.")


[INFO] No API key set; using default quota (~3 req/s).
Discovered 23 families from /Users/ishaharris/Projects/ribolings/data/virus/codes/rna: ['Arena', 'Arteri', 'Astro', 'Birna', 'Borna', 'Calici', 'Corona', 'DNA', 'Filo', 'Flavi', 'Hanta', 'Hepe', 'Kolmio', 'Noda', 'Orthomyxo', 'Paramyxo', 'Peribunya', 'Picorna', 'Pneumo', 'Rhabdo', 'Sedereo', 'Spinareo', 'Tobani']

=== Arena ===
Found 137 unique accessions in Arena_codes.csv
  …processed 25/137 (~0.63 acc/s overall)
  …processed 50/137 (~0.65 acc/s overall)
  …processed 75/137 (~0.63 acc/s overall)
  …processed 100/137 (~0.64 acc/s overall)
  …processed 125/137 (~0.64 acc/s overall)
  …processed 137/137 (~0.64 acc/s overall)
[STATS] Arena: 137 accessions in 213.2s ≈ 0.64 acc/s
Writing 249 CDS entries to FASTA
[OK] Wrote /Users/ishaharris/Projects/ribolings/data/virus/cds_fasta/Arena.fasta with 249 records.
[OK] All headers contained [gbkey=CDS].
[WARN] 11 sequences contain chars outside A/C/G/T/N. Logged to /Users/ishaharris/Project

KeyboardInterrupt: 

# THIS ONE WORKS

In [3]:
# === Retrieve GenBank CDS FASTAs by accession (robust parsing + checks + tester) ===
from __future__ import annotations

import time, re
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd
from Bio import Entrez
from urllib.error import HTTPError

# --------------- CONFIG (edit these) ---------------
ENTREZ_EMAIL   = "inah2@cam.ac.uk"
ENTREZ_API_KEY = None  # or paste your key string, e.g. "fe0657db7c3e1518b265167f026e11288a07"

# Input CSVs like "Sedereo_codes.csv" with a column of accessions (e.g. "Codes")
INPUT_DIR  = Path("/Users/ishaharris/Projects/ribolings/data/virus/codes/dna")

# Output folder for combined FASTAs + logs
OUTPUT_DIR = Path("/Users/ishaharris/Projects/ribolings/data/virus/dna_cds_fasta/")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Families to process (None = auto-discover all *_codes.csv in INPUT_DIR)
VIRUS_FAMILIES: Optional[List[str]] = None

# Throttle between requests
SLEEP = 0.2  # ~3 req/s without key; ~0.1–0.2 with key

# Progress + skip rules
PROGRESS_EVERY = 25
MIN_RECORDS_TO_SKIP = 1
# ------------- /CONFIG ----------------

# Apply Entrez credentials + validate API key
Entrez.email = ENTREZ_EMAIL
_key = (ENTREZ_API_KEY or "")
_key = _key.strip() if isinstance(_key, str) else ""
if _key and re.fullmatch(r"[A-Za-z0-9\-]{20,80}", _key):
    Entrez.api_key = _key
    print(f"[INFO] Using NCBI API key (len={len(_key)}).")
else:
    Entrez.api_key = None
    if _key:
        print(f"[WARN] API key looks invalid (len={len(_key)}). Not using it.")
    else:
        print("[INFO] No API key set; using default quota (~3 req/s).")

# ---------- Helpers ----------
def fasta_record_count(path: Path) -> int:
    """Count FASTA records (lines starting with '>')."""
    try:
        if not path.exists() or path.stat().st_size == 0:
            return 0
        return sum(1 for line in path.open() if line.startswith(">"))
    except Exception:
        return 0

def entrez_search_nuccore(accession: str) -> List[str]:
    """Search nuccore by accession; return a list of IDs (may be empty)."""
    query = f"{accession}[accession]"
    with Entrez.esearch(db="nuccore", term=query) as handle:  # XML default
        data = Entrez.read(handle)
    return data.get("IdList", []) or []

def efetch_cds(nuccore_id: str) -> Optional[str]:
    """Fetch CDS nucleotide FASTA text for a single nuccore id; return None if not available."""
    try:
        with Entrez.efetch(db="nuccore", id=nuccore_id, rettype="fasta_cds_na", retmode="text") as h:
            return h.read()
    except HTTPError as e:
        # 400 typically means no CDS view for this record (e.g., wrong type / no CDS features)
        if e.code == 400:
            return None
        raise

def split_fasta_blob(blob: str) -> List[str]:
    """
    Split a multi-FASTA blob into individual records, preserving exactly one leading '>' per record.
    """
    records = []
    current = []
    for line in blob.splitlines():
        if line.startswith(">"):
            if current:
                records.append("\n".join(current))
            current = [line]  # start new record with header line
        else:
            current.append(line)
    if current:
        records.append("\n".join(current))
    return [r for r in records if r.strip()]

def retrieve_cds_records(accession: str) -> Optional[List[str]]:
    """
    Search by accession; fetch rettype=fasta_cds_na for the first hit.
    Return a list of raw FASTA records (each starting with '>'), or None if none.
    """
    ids = entrez_search_nuccore(accession)
    time.sleep(SLEEP)
    if not ids:
        return None

    blob = efetch_cds(ids[0])
    time.sleep(SLEEP)
    if not blob:
        return None

    return split_fasta_blob(blob) or None

def parse_fasta_records(fasta_records: List[str]) -> Tuple[List[str], List[str]]:
    """
    Robust FASTA parsing:
      - header = first line after '>'
      - sequence = concat of remaining lines with whitespace removed
    Returns (headers, sequences).
    """
    headers, seqs = [], []
    for rec in fasta_records:
        rec = rec.lstrip()
        if not rec.startswith(">"):
            continue
        lines = rec.splitlines()
        header = lines[0][1:].strip()  # drop '>'
        seq = re.sub(r"\s+", "", "".join(lines[1:]).upper())
        if seq:
            headers.append(header)
            seqs.append(seq)
    return headers, seqs

def write_fasta(headers: List[str], seqs: List[str], out_path: Path, line_width: int = 80) -> None:
    """Write a multi-record FASTA; each header goes after '>'."""
    assert len(headers) == len(seqs)
    def wrap(s: str, w: int) -> List[str]:
        return [s[i:i+w] for i in range(0, len(s), w)]
    with out_path.open("w") as f:
        for h, s in zip(headers, seqs):
            f.write(f">{h}\n")
            for line in wrap(s, line_width):
                f.write(line + "\n")

def discover_families(input_dir: Path) -> List[str]:
    """Auto-discover all families with *_codes.csv files."""
    fams = []
    for p in input_dir.glob("*_codes.csv"):
        name = p.stem
        if name.endswith("_codes"):
            fams.append(name[:-6])
    return sorted(set(fams))

def load_accessions(csv_path: Path) -> List[str]:
    df = pd.read_csv(csv_path)
    col = next((c for c in ["Codes","codes","ACCESSION","accession","Accession"] if c in df.columns), None)
    if col is None:
        raise ValueError(f"No accession column found in {csv_path}. Columns present: {list(df.columns)}")
    accessions = (
        df[col]
        .dropna()
        .astype(str)
        .str.strip()
        .replace("", pd.NA)
        .dropna()
        .unique()
        .tolist()
    )
    return accessions

def process_family(family: str) -> None:
    print(f"\n=== {family} ===")
    out_fa = OUTPUT_DIR / f"{family}.fasta"

    # Skip if we already have a non-empty FASTA
    n_existing = fasta_record_count(out_fa)
    if n_existing >= MIN_RECORDS_TO_SKIP:
        print(f"[SKIP] {family} already processed: {out_fa} ({n_existing} records).")
        return
    elif out_fa.exists():
        print(f"[INFO] {out_fa} exists but has {n_existing} records — reprocessing.")

    csv_path = INPUT_DIR / f"{family}_codes.csv"
    if not csv_path.exists():
        print(f"[WARN] Missing: {csv_path}")
        return

    accessions = load_accessions(csv_path)
    total = len(accessions)
    print(f"Found {total} unique accessions in {csv_path.name}")
    if total == 0:
        print("[WARN] No accessions after cleaning; nothing to do.")
        return

    # Fetch + parse
    all_records: List[str] = []
    failed: List[str] = []
    t0 = time.time()
    processed = 0

    for acc in accessions:
        try:
            recs = retrieve_cds_records(acc)
            if recs:
                all_records.extend(recs)
            else:
                failed.append(acc)
        except Exception as e:
            print(f"[ERR] {acc}: {e}")
            failed.append(acc)

        processed += 1
        if (processed % PROGRESS_EVERY == 0) or (processed == total):
            rate_total = processed / max(1e-9, (time.time() - t0))
            print(f"  …processed {processed}/{total} (~{rate_total:.2f} acc/s overall)")

    elapsed = time.time() - t0
    if elapsed > 0:
        print(f"[STATS] {family}: {processed} accessions in {elapsed:.1f}s ≈ {processed/elapsed:.2f} acc/s")

    if not all_records:
        print("[INFO] No CDS records returned for this family.")
        if failed:
            fail_path = OUTPUT_DIR / f"{family}_failed_accessions.txt"
            with fail_path.open("w") as f:
                f.write("\n".join(failed) + "\n")
            print(f"[WARN] {len(failed)} accessions failed. Logged to {fail_path}")
        return

    # Parse into headers + sequences
    metas, seqs = parse_fasta_records(all_records)
    print(f"Writing {len(seqs)} CDS entries to FASTA")
    write_fasta(metas, seqs, out_fa)
    print(f"[OK] Wrote {out_fa} with {fasta_record_count(out_fa)} records.")

    # ---- Sanity checks ----
    nonstandard = [m for m in metas if "[gbkey=CDS]" not in m]
    if nonstandard:
        warn_path = OUTPUT_DIR / f"{family}_nonstandard_headers.txt"
        with warn_path.open("w") as f:
            f.write("\n".join(nonstandard) + "\n")
        print(f"[WARN] {len(nonstandard)} headers missing [gbkey=CDS]. Logged to {warn_path}")
    else:
        print("[OK] All headers contained [gbkey=CDS].")

    bad_seq_idx = [i for i, s in enumerate(seqs) if re.search(r"[^ACGTN]", s)]
    if bad_seq_idx:
        bad_path = OUTPUT_DIR / f"{family}_nonACGTN_sequences.txt"
        with bad_path.open("w") as f:
            for i in bad_seq_idx:
                f.write(f">{metas[i]}\n{seqs[i]}\n")
        print(f"[WARN] {len(bad_seq_idx)} sequences contain chars outside A/C/G/T/N. Logged to {bad_path}")
    else:
        print("[OK] All sequences A/C/G/T/N only.")

    if failed:
        fail_path = OUTPUT_DIR / f"{family}_failed_accessions.txt"
        with fail_path.open("w") as f:
            f.write("\n".join(failed) + "\n")
        print(f"[WARN] {len(failed)} accessions with no CDS returned. Logged to {fail_path}")
    else:
        cleanup = OUTPUT_DIR / f"{family}_failed_accessions.txt"
        if cleanup.exists():
            try: cleanup.unlink()
            except Exception: pass
        print("[OK] No failed accessions.")

# -------- Tester: process only first N accessions and write to {family}_TEST{N}.fasta --------
def test_family(family: str, n: int = 25) -> None:
    print(f"\n=== TEST {family} (first {n} accessions) ===")
    csv_path = INPUT_DIR / f"{family}_codes.csv"
    if not csv_path.exists():
        print(f"[WARN] Missing: {csv_path}")
        return

    accessions = load_accessions(csv_path)[:n]
    print(f"Testing with {len(accessions)} accessions from {csv_path.name}")

    all_records: List[str] = []
    failed: List[str] = []
    t0 = time.time()

    for acc in accessions:
        try:
            recs = retrieve_cds_records(acc)
            if recs:
                all_records.extend(recs)
            else:
                failed.append(acc)
        except Exception as e:
            print(f"[ERR] {acc}: {e}")
            failed.append(acc)

    elapsed = time.time() - t0
    print(f"[STATS] TEST {family}: {len(accessions)} accessions in {elapsed:.1f}s")

    if not all_records:
        print("[INFO] No CDS records returned in test.")
        if failed:
            print(f"[WARN] {len(failed)} failed in test (first few): {failed[:5]}")
        return

    metas, seqs = parse_fasta_records(all_records)
    out_fa = OUTPUT_DIR / f"{family}_TEST{n}.fasta"
    write_fasta(metas, seqs, out_fa)
    print(f"[OK] Test wrote {out_fa} with {fasta_record_count(out_fa)} records.")
    # Show a peek at first 2 headers
    print("Example headers:")
    for h in metas[:2]:
        print("  >", h)

# ----------------- Run -----------------
if VIRUS_FAMILIES is None:
    families = discover_families(INPUT_DIR)
    print(f"Discovered {len(families)} families from {INPUT_DIR}:", families)
else:
    families = VIRUS_FAMILIES
    print("Using specified families:", families)

# Example usage:
test_family("Arena", n=5)   # quick formatting check
for fam in families:           # full run
     process_family(fam)


[INFO] No API key set; using default quota (~3 req/s).
Discovered 10 families from /Users/ishaharris/Projects/ribolings/data/virus/codes/dna: ['Adeno', 'Anello', 'Asfar', 'Circo', 'Herpes', 'Irido', 'Papilloma', 'Parvo', 'Polyomar', 'Pox']

=== TEST Arena (first 5 accessions) ===
[WARN] Missing: /Users/ishaharris/Projects/ribolings/data/virus/codes/dna/Arena_codes.csv

=== Adeno ===
Found 10 unique accessions in Adeno_codes.csv
  …processed 10/10 (~0.65 acc/s overall)
[STATS] Adeno: 10 accessions in 15.5s ≈ 0.65 acc/s
Writing 296 CDS entries to FASTA
[OK] Wrote /Users/ishaharris/Projects/ribolings/data/virus/dna_cds_fasta/Adeno.fasta with 296 records.
[OK] All headers contained [gbkey=CDS].
[OK] All sequences A/C/G/T/N only.
[OK] No failed accessions.

=== Anello ===
Found 1 unique accessions in Anello_codes.csv
  …processed 1/1 (~0.81 acc/s overall)
[STATS] Anello: 1 accessions in 1.2s ≈ 0.81 acc/s
Writing 3 CDS entries to FASTA
[OK] Wrote /Users/ishaharris/Projects/ribolings/data/vir