Cross-checking against Ensembl (CDS + cDNA via REST)

	•	Fetching each transcript’s CDS and cDNA from Ensembl’s REST API (tolerating IDs with/without version).
	•	Comparing your orf exactly to the Ensembl CDS; then locating that CDS inside the cDNA and taking the suffix as an inferred 3’UTR, comparing it to your utr3.
	•	Flagging any mismatches with a short “first‑difference” hint and verifying the recorded lengths.
	•	Using this as an online sanity check before downstream feature generation.

In [None]:
# === Cross-check transcripts.txt against Ensembl REST (CDS + cDNA) ===
# Requirements: internet; 'transcripts.txt' in current dir.

import pandas as pd, requests, textwrap

# Root URL of the Ensembl REST service; endpoint paths are appended to it.
ENSEMBL_BASE = "https://rest.ensembl.org"

def fetch_seq_by_id(enst, seq_type, timeout=30):
    """
    Fetch one sequence from the Ensembl REST `sequence/id/<id>` endpoint.

    enst     : stable ID placed in the URL path.
    seq_type ∈ {'cds','cdna','protein','genomic'}
    timeout  : seconds handed to requests.get. Without a timeout a stalled
               connection would block the calling loop indefinitely.

    Returns plain DNA/AA string (no FASTA header).
    Raises requests.HTTPError on an error status (via raise_for_status);
    connection problems and timeouts surface as other requests exceptions.
    """
    url = f"{ENSEMBL_BASE}/sequence/id/{enst}"
    r = requests.get(
        url,
        params={"type": seq_type},
        headers={"Content-Type": "text/plain"},
        timeout=timeout,
    )
    r.raise_for_status()
    txt = r.text.strip()
    # If REST answered in FASTA, strip header; usually plain text for this endpoint
    if txt.startswith(">"):
        lines = txt.splitlines()
        return "".join(lines[1:]).strip()
    return txt

def first_mismatch(a, b, context=30):
    """
    Describe where strings a and b first disagree.

    a is reported as the "expected" value and b as the "got" value.
    Returns None when the two are identical. If one is a strict prefix of the
    other, only the length difference is reported. Otherwise the message gives
    the 0-based position of the first differing character plus a window of up
    to `context` characters on each side, clipped to the shorter string.
    """
    overlap = min(len(a), len(b))
    # Index of the first differing character within the shared prefix, if any.
    pos = next(
        (k for k, (ch_a, ch_b) in enumerate(zip(a, b)) if ch_a != ch_b),
        None,
    )

    if pos is None:
        if len(a) == len(b):
            return None
        return f"length differs: expected {len(a)} vs got {len(b)}"

    lo = max(0, pos - context)
    hi = min(overlap, pos + context)
    headline = f"pos {pos} (0-based): expected[{a[pos]}] vs got[{b[pos]}]\n"
    excerpt = f"...{a[lo:hi]} (expected)\n...{b[lo:hi]} (got)"
    return headline + excerpt

# Load your file (the version that matches the CNN input format)
df = pd.read_csv("transcripts.txt", sep="\t")  # columns: transcript, orf, orf_length, utr3, utr3_length, orf_utr3

# Every report row carries exactly these keys, so the summary and the problem
# filter below keep working when some (or all) REST fetches fail.
REPORT_COLUMNS = [
    "transcript",
    "ok",
    "reason",
    "orf_matches_Ensembl_CDS",
    "orf_mismatch_hint",
    "utr3_matches_inferred_from_cDNA",
    "utr3_mismatch_hint",
    "lens_ok",
    "used_id_for_rest",
]
FLAG_COLUMNS = ["orf_matches_Ensembl_CDS", "utr3_matches_inferred_from_cDNA", "lens_ok"]

rows = []
for _, r in df.iterrows():
    enst = str(r["transcript"])
    # pandas parses an empty cell as NaN (a float); treat it as an empty
    # sequence so len() and the string comparisons below do not crash.
    orf = "" if pd.isna(r["orf"]) else str(r["orf"])
    utr3 = "" if pd.isna(r["utr3"]) else str(r["utr3"])

    # The recorded lengths can be verified without any network access.
    lens_ok = (len(orf) == int(r["orf_length"])) and (len(utr3) == int(r["utr3_length"]))

    # Many files include version (e.g., ENST00000240185.8). REST accepts either.
    # We'll try exact first, then fall back to base ID without version.
    try_ids = [enst]
    if "." in enst:
        try_ids.append(enst.split(".", 1)[0])

    cds_rest = cdna_rest = enst_used = None
    last_err = None
    for tid in try_ids:
        try:
            fetched_cds = fetch_seq_by_id(tid, "cds")
            fetched_cdna = fetch_seq_by_id(tid, "cdna")
        except Exception as e:
            # Deliberately broad: one failing transcript must not abort the run.
            last_err = e
            continue
        # Commit only once both fetches succeeded, so the CDS and the cDNA
        # always come from the same ID.
        cds_rest, cdna_rest, enst_used = fetched_cds, fetched_cdna, tid
        break

    if enst_used is None:
        rows.append({
            "transcript": enst,
            "ok": False,
            "reason": f"REST fetch failed ({last_err})",
            # Nothing could be compared, so the sequence checks count as failed.
            "orf_matches_Ensembl_CDS": False,
            "orf_mismatch_hint": "",
            "utr3_matches_inferred_from_cDNA": False,
            "utr3_mismatch_hint": "",
            "lens_ok": lens_ok,
            "used_id_for_rest": None,
        })
        continue

    # Compare ORF (your orf) to Ensembl CDS
    orf_ok = (orf == cds_rest)
    orf_reason = "" if orf_ok else first_mismatch(orf, cds_rest)

    # Infer 3'UTR from cDNA by locating CDS inside cDNA and taking the suffix
    idx = cdna_rest.find(cds_rest)
    if idx == -1:
        utr_ok = False
        utr_reason = "CDS not found inside cDNA (unexpected)"
    else:
        inferred_utr3 = cdna_rest[idx + len(cds_rest):]  # suffix after CDS
        utr_ok = (inferred_utr3 == utr3)
        utr_reason = "" if utr_ok else first_mismatch(inferred_utr3, utr3)

    rows.append({
        "transcript": enst,
        "ok": True,  # the REST fetch succeeded; see the flag columns for results
        "reason": "",
        "orf_matches_Ensembl_CDS": orf_ok,
        "orf_mismatch_hint": orf_reason,
        "utr3_matches_inferred_from_cDNA": utr_ok,
        "utr3_mismatch_hint": utr_reason,
        "lens_ok": lens_ok,
        "used_id_for_rest": enst_used,
    })

# Passing the column list keeps the frame well-formed even with zero rows.
report = pd.DataFrame(rows, columns=REPORT_COLUMNS)
print("Verification summary:")
print(report[["transcript","used_id_for_rest","orf_matches_Ensembl_CDS","utr3_matches_inferred_from_cDNA","lens_ok"]])

# A row is a problem when at least one of the three checks is False.
problems = report[~report[FLAG_COLUMNS].all(axis=1)]
if len(problems):
    print("\nDetails for problems:")
    display(problems)
else:
    print("\nAll good — ORFs match Ensembl CDS and 3'UTRs match cDNA-suffix for all rows.")