In [1]:
"""
DNAlytics: An Interactive DNA Sequence Analysis Tool
Author: Javeria Butt
Features: DNA validation, nucleotide stats, transcription, translation, motif search, NCBI integration, plots, and CSV export.
"""


'\nDNAlytics: An Interactive DNA Sequence Analysis Tool\nAuthor: Javeria Butt\nFeatures: DNA validation, nucleotide stats, transcription, translation, motif search, NCBI integration, plots, and CSV export.\n'

# **1) Setup (install + imports)**

In [2]:
# === 1) Installing libraries (first run only) ===
!pip install biopython ipywidgets --quiet

# Enabling widgets UI in Colab: switch on Colab's custom widget manager before
# any ipywidgets controls are created further down in this notebook.
# NOTE(review): this import presumably only resolves inside a Google Colab
# runtime -- confirm whether the notebook is meant to run anywhere else.
from google.colab import output as _colab_output
_colab_output.enable_custom_widget_manager()

# === Imports ===
import os, zipfile, textwrap
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from ipywidgets import widgets, Layout, HBox, VBox
from IPython.display import display, clear_output
from Bio import Entrez, SeqIO  #to access NCBI data


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m178.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# **2) Core Analysis Functions**

In [3]:
# === 2) Core analysis functions ===

# Alphabet accepted by validate_dna(): the four unambiguous DNA bases.
VALID_BASES = set("ATGC")

# DNA codon (three upper-case bases) -> one-letter amino-acid code, as looked
# up by translate(). The three stop codons (TAA, TAG, TGA) map to '_'.
# All 64 codons are listed; translate() falls back to 'X' for any other key.
CODON_TABLE = {
    'ATA':'I','ATC':'I','ATT':'I','ATG':'M',
    'ACA':'T','ACC':'T','ACG':'T','ACT':'T',
    'AAC':'N','AAT':'N','AAA':'K','AAG':'K',
    'AGC':'S','AGT':'S','AGA':'R','AGG':'R',
    'CTA':'L','CTC':'L','CTG':'L','CTT':'L',
    'CCA':'P','CCC':'P','CCG':'P','CCT':'P',
    'CAC':'H','CAT':'H','CAA':'Q','CAG':'Q',
    'CGA':'R','CGC':'R','CGG':'R','CGT':'R',
    'GTA':'V','GTC':'V','GTG':'V','GTT':'V',
    'GCA':'A','GCC':'A','GCG':'A','GCT':'A',
    'GAC':'D','GAT':'D','GAA':'E','GAG':'E',
    'GGA':'G','GGC':'G','GGG':'G','GGT':'G',
    'TCA':'S','TCC':'S','TCG':'S','TCT':'S',
    'TTC':'F','TTT':'F','TTA':'L','TTG':'L',
    'TAC':'Y','TAT':'Y','TAA':'_','TAG':'_',
    'TGC':'C','TGT':'C','TGA':'_','TGG':'W',
}

def validate_dna(seq: str) -> str:
    """Normalize a raw DNA string and check that it only contains A/T/G/C.

    Every whitespace character (spaces, tabs, carriage returns, line breaks)
    is removed and the result is upper-cased, so a sequence pasted as wrapped
    multi-line text -- including text with Windows line endings -- validates
    the same way as one unbroken line.

    Args:
        seq: Raw sequence text. ``None`` is treated like an empty string.

    Returns:
        The cleaned, upper-case sequence.

    Raises:
        ValueError: If nothing is left after cleaning, or if any character
            other than A, T, G or C remains.
    """
    # str.split() with no argument splits on any whitespace run, so joining the
    # pieces drops tabs and carriage returns as well as spaces and newlines.
    seq = "".join((seq or "").split()).upper()
    if not seq:
        raise ValueError("Empty sequence.")
    if not set(seq).issubset(VALID_BASES):
        raise ValueError("Invalid DNA sequence! Only A/T/G/C allowed.")
    return seq

def base_counts(seq: str):
    """Tally the bases of ``seq`` and return the totals as an (A, T, G, C) tuple.

    Bases that do not occur are reported as 0; characters other than
    A/T/G/C are ignored.
    """
    tally = Counter(seq)
    # Indexing a Counter with a missing key yields 0 without inserting it.
    return tuple(tally[base] for base in "ATGC")

def gc_content(seq: str) -> float:
    """Return the percentage of G and C bases in ``seq``, rounded to 2 decimals.

    Args:
        seq: Upper-case DNA sequence.

    Returns:
        GC percentage in the range 0-100. An empty sequence yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    if not seq:
        # Nothing to measure; avoid dividing by a zero length.
        return 0.0
    gc_total = seq.count("G") + seq.count("C")
    return round((gc_total / len(seq)) * 100, 2)

def complement(seq: str) -> str:
    """Swap every base for its pairing partner (A<->T, G<->C).

    Characters other than upper-case A/T/G/C are passed through unchanged.
    """
    pairing = {"A": "T", "T": "A", "G": "C", "C": "G"}
    return "".join(pairing.get(base, base) for base in seq)

def reverse_complement(seq: str) -> str:
    """Return the complement of ``seq`` read in the opposite direction."""
    forward_complement = complement(seq)
    return "".join(reversed(forward_complement))

def rna_transcript(seq: str) -> str:
    """Return ``seq`` with every upper-case T replaced by U."""
    pieces = seq.split("T")
    return "U".join(pieces)

def translate(seq: str):
    """Split ``seq`` into frame-1 codons and look each one up in CODON_TABLE.

    Only complete triplets are used; one or two trailing bases are dropped.
    Codons missing from the table are reported as "X".

    Returns:
        ``(codons, amino_acids)`` -- two lists of equal length.
    """
    codons = [seq[start:start + 3] for start in range(0, len(seq) - 2, 3)]
    aas = [CODON_TABLE.get(triplet, "X") for triplet in codons]
    return codons, aas

def motif_positions(seq: str, motif: str):
    """List the 1-based start position of every occurrence of ``motif`` in ``seq``.

    The motif is upper-cased before searching; overlapping matches are all
    reported. An empty motif yields an empty list.
    """
    if not motif:
        return []
    needle = motif.upper()
    found = []
    hit = seq.find(needle)
    while hit != -1:
        found.append(hit + 1)
        # Resume one character after the last hit so overlaps are not skipped.
        hit = seq.find(needle, hit + 1)
    return found

def start_stop_positions(seq: str):
    """Locate start (ATG) and stop (TAA/TAG/TGA) codons in reading frame 1.

    Returns:
        ``(starts, stops)`` -- two lists holding the 1-based position of the
        first base of each matching codon.
    """
    stop_codons = {"TAA", "TAG", "TGA"}
    # Pair every complete frame-1 triplet with its 1-based start position.
    frame = [(offset + 1, seq[offset:offset + 3]) for offset in range(0, len(seq) - 2, 3)]
    starts = [position for position, triplet in frame if triplet == "ATG"]
    stops = [position for position, triplet in frame if triplet in stop_codons]
    return starts, stops

def freq_counter(items):
    """Return a ``Counter`` holding how many times each element of ``items`` occurs."""
    tally = Counter()
    tally.update(items)
    return tally

def save_csvs(seq, a,t,g,c, gc, comp, revc, rna, codons, aas, codon_freq, aa_freq):
    """Write the analysis results to four CSV files in the working directory.

    Files written (each without an index column):
        dna_analysis_results.csv     -- Metric/Value summary of the sequence
        protein_translation.csv      -- one row per codon with its amino acid
        codon_frequency.csv          -- codon -> count
        amino_acid_frequency.csv     -- amino acid -> count
    """
    # Summary table: one (metric, value) row per statistic.
    summary_rows = [
        ("Length", len(seq)),
        ("A", a),
        ("T", t),
        ("G", g),
        ("C", c),
        ("GC Content", f"{gc}%"),
        ("Complement", comp),
        ("Reverse Complement", revc),
        ("RNA", rna),
    ]
    summary = pd.DataFrame(summary_rows, columns=["Metric", "Value"])
    summary.to_csv("dna_analysis_results.csv", index=False)

    # Codon-by-codon translation.
    translation = pd.DataFrame({"Codon": codons, "Amino Acid": aas})
    translation.to_csv("protein_translation.csv", index=False)

    # Both frequency tables share the same two-column layout.
    frequency_tables = (
        (codon_freq, "Codon", "codon_frequency.csv"),
        (aa_freq, "Amino Acid", "amino_acid_frequency.csv"),
    )
    for table, label, path in frequency_tables:
        pd.DataFrame(list(table.items()), columns=[label, "Frequency"]).to_csv(path, index=False)


# **3) Plot helpers (three separate plots + PNG saves)**

In [4]:
# === 3) Plots (each saves a separate PNG and shows the figure) ===

def plot_nucleotide_counts(a, t, g, c):
    """Draw a bar chart of the four base counts, save it as a PNG and show it.

    Args:
        a, t, g, c: Number of A, T, G and C bases, in that order.

    Side effects:
        Writes ``nucleotide_composition.png`` to the working directory and
        displays the figure.
    """
    # Explicit figure/axes objects instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(["A", "T", "G", "C"], [a, t, g, c], color=["#ff9999", "#66b3ff", "#99ff99", "#ffcc99"])
    ax.set_title("Nucleotide Composition")
    ax.set_xlabel("Nucleotide")
    ax.set_ylabel("Count")
    fig.savefig("nucleotide_composition.png", dpi=150, bbox_inches="tight")
    plt.show()
    # Release the figure so repeated button clicks do not pile up open figures.
    plt.close(fig)

def plot_codon_freq(codon_freq):
    """Draw a bar chart of codon counts, save it as a PNG and show it.

    Args:
        codon_freq: Mapping of codon -> count. When it is empty nothing is
            drawn and no file is written.

    Side effects:
        Writes ``codon_frequency.png`` to the working directory and displays
        the figure.
    """
    if not codon_freq:
        return
    # Alphabetical codon order keeps the x-axis stable between runs.
    ordered = sorted(codon_freq.items(), key=lambda pair: pair[0])
    labels = [codon for codon, _ in ordered]
    counts = [count for _, count in ordered]

    fig, ax = plt.subplots(figsize=(11, 4))
    ax.bar(labels, counts, color="#ffa500")
    ax.set_title("Codon Frequency")
    ax.set_xlabel("Codon")
    ax.set_ylabel("Count")
    # Up to 64 labels share the axis, so print them vertically.
    ax.tick_params(axis="x", rotation=90)
    fig.tight_layout()
    fig.savefig("codon_frequency.png", dpi=150, bbox_inches="tight")
    plt.show()
    # Release the figure so repeated button clicks do not pile up open figures.
    plt.close(fig)

def plot_aa_freq(aa_freq):
    """Draw a bar chart of amino-acid counts, save it as a PNG and show it.

    Args:
        aa_freq: Mapping of one-letter amino-acid code -> count. When it is
            empty nothing is drawn and no file is written.

    Side effects:
        Writes ``amino_acid_frequency.png`` to the working directory and
        displays the figure.
    """
    if not aa_freq:
        return
    # Alphabetical order keeps the x-axis stable between runs.
    ordered = sorted(aa_freq.items(), key=lambda pair: pair[0])
    labels = [residue for residue, _ in ordered]
    counts = [count for _, count in ordered]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(labels, counts, color="#8a2be2")
    ax.set_title("Amino Acid Frequency")
    ax.set_xlabel("Amino Acid")
    ax.set_ylabel("Count")
    fig.savefig("amino_acid_frequency.png", dpi=150, bbox_inches="tight")
    plt.show()
    # Release the figure so repeated button clicks do not pile up open figures.
    plt.close(fig)


# **4) NCBI API helpers (Entrez)**

In [5]:
# === 4) NCBI API helpers ===
# MUST provide a real email to access NCBI.

def set_entrez_email(user_email: str):
    """Store ``user_email``, stripped of surrounding whitespace, as ``Entrez.email``."""
    cleaned = user_email.strip()
    Entrez.email = cleaned

def fetch_by_accession(accession: str):
    """Fetch one nucleotide record from NCBI by accession and cache it as FASTA.

    Args:
        accession: Accession or other NCBI nucleotide identifier. Surrounding
            whitespace is ignored; ``None`` is treated as empty.

    Returns:
        ``(sequence, record_id)`` -- the upper-cased sequence string and the
        ID of the parsed record.

    Raises:
        ValueError: If ``accession`` is empty. Errors raised by the Entrez
            request or by FASTA parsing propagate unchanged.

    Side effects:
        Overwrites ``fetched_sequence.fasta`` in the working directory with
        the fetched record, wrapped at 70 bases per line.
    """
    accession = (accession or "").strip()
    if not accession:
        raise ValueError("Accession ID is empty.")

    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        record = SeqIO.read(handle, "fasta")
    finally:
        # Release the network handle even when the response is not valid FASTA.
        handle.close()

    seq = str(record.seq).upper()

    # The FASTA parser keeps the whole title line -- ID included -- in
    # record.description (see the Bio.SeqIO docs), so blindly writing
    # "id description" would repeat the ID. Only prepend it when it is missing.
    description = record.description or ""
    if description.split(None, 1)[:1] == [record.id]:
        header = description
    else:
        header = f"{record.id} {description}".strip()

    with open("fetched_sequence.fasta", "w") as f:
        f.write(f">{header}\n")
        for i in range(0, len(seq), 70):
            f.write(seq[i:i+70] + "\n")
    return seq, record.id

def search_and_fetch_gene(gene_symbol: str, organism: str = "Homo sapiens"):
    """Search NCBI nucleotide for a gene in an organism and fetch the first hit.

    The search is tried first restricted to mRNA records
    (``biomol_mrna[Prop]``); if that finds nothing, it is repeated without
    the restriction. The first ID returned is then downloaded through
    ``fetch_by_accession``, which also writes ``fetched_sequence.fasta``.

    Args:
        gene_symbol: Gene symbol to search for, e.g. ``"BRCA1"``.
        organism: Organism name used in the ``[Organism]`` filter.

    Returns:
        ``(sequence, record_id)`` as returned by ``fetch_by_accession``.

    Raises:
        ValueError: If ``gene_symbol`` is empty or no record matches.
    """
    gene_symbol = (gene_symbol or "").strip()
    if not gene_symbol:
        # Without a symbol the query would degenerate to "[Gene] AND ...".
        raise ValueError("Gene symbol is empty.")
    term = f'{gene_symbol}[Gene] AND {(organism or "").strip()}[Organism]'

    def _first_id(query):
        """Run one esearch and return the first matching ID, or None."""
        handle = Entrez.esearch(db="nucleotide", term=query, retmax=1)
        try:
            result = Entrez.read(handle)
        finally:
            # Close the handle even if the response cannot be parsed.
            handle.close()
        ids = result.get("IdList", [])
        return ids[0] if ids else None

    # Prefer mRNA if possible, otherwise fall back to any nucleotide record.
    rec_id = _first_id(term + " AND biomol_mrna[Prop]") or _first_id(term)
    if not rec_id:
        raise ValueError("No NCBI records found for this gene/organism!")

    # Reuse the single fetch-and-cache implementation instead of duplicating it.
    return fetch_by_accession(rec_id)


# **5) Interactive app (ipywidgets): run the analysis and view the results**

In [7]:
# === 5) Interactive app (ipywidgets) ===

# Inputs
# One widget per piece of user input. Which of dna_area / acc_box / gene_box /
# org_box is visible is controlled by _visible_by_source() based on source_dd.
email_box     = widgets.Text(value='', placeholder='your_email@example.com', description='NCBI Email:', layout=Layout(width='50%'))
source_dd     = widgets.Dropdown(options=['Paste DNA','NCBI Accession','Gene Search'], value='Paste DNA', description='Input:')
dna_area      = widgets.Textarea(value='', placeholder='Enter DNA sequence (A/T/G/C)', description='DNA:', layout=Layout(width='70%', height='120px'))
acc_box       = widgets.Text(value='', placeholder='e.g., NM_007294.4', description='Accession:', layout=Layout(width='50%'))
gene_box      = widgets.Text(value='', placeholder='e.g., BRCA1', description='Gene:', layout=Layout(width='40%'))
org_box       = widgets.Text(value='Homo sapiens', placeholder='Organism', description='Organism:', layout=Layout(width='40%'))
motif_box     = widgets.Text(value='', placeholder='optional motif, e.g., ATG', description='Motif:', layout=Layout(width='40%'))

# Action buttons and the shared output area that both button callbacks print into.
analyze_btn   = widgets.Button(description='Analyze', button_style='success', icon='check')
zip_btn       = widgets.Button(description='Create ZIP & Download', button_style='info', icon='download')
out           = widgets.Output()

def _visible_by_source():
    dna_area.layout.display = 'none'
    acc_box.layout.display  = 'none'
    gene_box.layout.display = 'none'
    org_box.layout.display  = 'none'
    if source_dd.value == 'Paste DNA':
        dna_area.layout.display = 'block'
    elif source_dd.value == 'NCBI Accession':
        acc_box.layout.display = 'block'
    elif source_dd.value == 'Gene Search':
        gene_box.layout.display = 'block'
        org_box.layout.display  = 'block'

# Re-apply the visibility rules every time the dropdown's value changes ...
source_dd.observe(lambda change: _visible_by_source(), names='value')
# ... and once right now, so the initial layout matches the default selection.
_visible_by_source()

# State
# Module-level state shared by the two button callbacks: on_analyze_clicked()
# assigns both names, and on_zip_clicked() reads them to build the archive.
_last_files = []
_last_sequence_context = ""

def run_analysis(sequence: str, motif: str):
    """Run the full pipeline on a DNA sequence, print a report and save outputs.

    Steps: validate the sequence, compute base counts / GC content /
    complement / reverse complement / RNA transcript, translate frame 1,
    locate start and stop codons and (optionally) a motif, print a text
    report, write the CSV files and draw the three plots.

    Args:
        sequence: Raw DNA text; it is cleaned and checked by ``validate_dna``.
        motif: Optional motif to search for. Surrounding whitespace is ignored
            and the motif is upper-cased; an empty or whitespace-only value
            disables the motif section of the report.

    Returns:
        List of output file names produced by the run.
        ``fetched_sequence.fasta`` is appended when that file exists in the
        working directory.

    Raises:
        ValueError: Propagated from ``validate_dna`` for empty/invalid input.
    """
    seq = validate_dna(sequence)
    # Normalize once so the search and the printed report use the same text;
    # a stray space or newline typed into the box would otherwise never match.
    motif = (motif or "").strip().upper()

    a, t, g, c = base_counts(seq)
    gc = gc_content(seq)
    comp = complement(seq)
    revc = reverse_complement(seq)
    rna = rna_transcript(seq)
    codons, aas = translate(seq)
    codon_freq = freq_counter(codons)
    aa_freq = freq_counter(aas)
    starts, stops = start_stop_positions(seq)
    motif_pos = motif_positions(seq, motif)

    # Console report
    print("\nDNA Analysis Report")
    print("="*60)
    print(f"Length: {len(seq)}")
    print(f"A: {a}  |  T: {t}  |  G: {g}  |  C: {c}")
    print(f"GC Content: {gc}%")
    print("-"*60)
    print(f"Complement:         {comp}")
    print(f"Reverse Complement: {revc}")
    print(f"RNA Transcript:     {rna}")
    print("-"*60)
    print("Protein Translation (codon → amino acid):")
    for c0, a0 in zip(codons, aas):
        print(f"{c0}  →  {a0}")
    if motif:
        print("-"*60)
        print(f"Motif '{motif}' positions (1-based): {motif_pos or 'not found'}")
    print(f"Start codon positions (frame 1): {starts or 'none'}")
    print(f"Stop codon positions  (frame 1): {stops or 'none'}")

    # Save outputs
    save_csvs(seq, a, t, g, c, gc, comp, revc, rna, codons, aas, codon_freq, aa_freq)

    # Plots
    plot_nucleotide_counts(a, t, g, c)
    plot_codon_freq(codon_freq)
    plot_aa_freq(aa_freq)

    # Track files written
    files = [
        "dna_analysis_results.csv",
        "protein_translation.csv",
        "codon_frequency.csv",
        "amino_acid_frequency.csv",
        "nucleotide_composition.png",
        "codon_frequency.png",
        "amino_acid_frequency.png",
    ]
    if os.path.exists("fetched_sequence.fasta"):
        files.append("fetched_sequence.fasta")
    return files

def on_analyze_clicked(_):
    """Button callback: obtain a sequence from the selected source and analyze it.

    The sequence comes from the text area, from an NCBI accession, or from an
    NCBI gene search, depending on ``source_dd``. For the two NCBI sources an
    email address is required and is registered with Entrez first. All output,
    including error messages, goes to the ``out`` widget.

    ``_last_files`` and ``_last_sequence_context`` are updated together and
    only after the analysis succeeded, so a failed run never leaves a context
    string that describes different files than the ones recorded.
    """
    global _last_files, _last_sequence_context

    out.clear_output()
    with out:
        try:
            source = source_dd.value

            # If using NCBI, require email
            if source in ("NCBI Accession", "Gene Search"):
                if not email_box.value.strip():
                    print("Please enter your email (required by NCBI Entrez).")
                    return
                set_entrez_email(email_box.value)

            if source == "Paste DNA":
                seq = dna_area.value
                context = "Manual DNA input"
            elif source == "NCBI Accession":
                seq, rec_id = fetch_by_accession(acc_box.value)
                print(f"Fetched from NCBI by accession: {rec_id}")
                context = f"NCBI Accession: {rec_id}"
            elif source == "Gene Search":
                seq, rec_id = search_and_fetch_gene(gene_box.value, org_box.value)
                print(f"Fetched from NCBI by gene search: {rec_id}")
                context = f"NCBI Gene Search: {gene_box.value} / {org_box.value} → {rec_id}"
            else:
                # The dropdown only offers the three options above.
                raise ValueError(f"Unknown input source: {source}")

            files = run_analysis(seq, motif_box.value)

            if source == "Paste DNA":
                # run_analysis lists fetched_sequence.fasta whenever the file
                # exists on disk; for pasted input it can only be a leftover
                # from an earlier NCBI run and does not belong to this result.
                files = [name for name in files if name != "fetched_sequence.fasta"]

            # Commit shared state only now that everything succeeded.
            _last_files = files
            _last_sequence_context = context

            print("\nFiles saved:")
            for f in files:
                print(" -", f)

        except Exception as e:
            # Report the problem inside the output widget instead of losing it
            # in the callback machinery.
            print("Error:", e)

def on_zip_clicked(_):
    """Button callback: bundle the latest outputs into a ZIP and offer it for download.

    Writes two helper files (``dna_analyzer_core.py``, a minimal reusable copy
    of the validation/translation code, and ``README_DNAlytics.txt``), then
    zips them together with every file recorded in ``_last_files`` that still
    exists on disk. In Colab the archive is handed to ``files.download``; if
    that fails, a hint naming the archive is printed instead. All output goes
    to the ``out`` widget.
    """
    out.clear_output(wait=True)
    with out:
        try:
            # Include everything generated + (optionally) a tiny core script for reuse
            core_py = "dna_analyzer_core.py"
            with open(core_py, "w") as f:
                f.write(textwrap.dedent("""\
                    # Minimal reusable core (generated)
                    from collections import Counter
                    CODON_TABLE = %r
                    VALID_BASES = set("ATGC")
                    def validate_dna(seq):
                        seq = (seq or "").upper().replace("\\n","").replace(" ","")
                        if not seq or not set(seq).issubset(VALID_BASES):
                            raise ValueError("Invalid DNA.")
                        return seq
                    def translate(seq):
                        codons, aas = [], []
                        for i in range(0, len(seq)-2, 3):
                            codon = seq[i:i+3]; codons.append(codon)
                            aas.append(CODON_TABLE.get(codon, "X"))
                        return codons, aas
                """ % CODON_TABLE))

            # Add notebook hint file. The context string may contain non-ASCII
            # characters (the gene-search context uses an arrow), so the
            # encoding is fixed, and the file is closed before it is zipped.
            readme = "README_DNAlytics.txt"
            with open(readme, "w", encoding="utf-8") as f:
                f.write(
                    "This zip contains analysis outputs from DNAlytics (Colab app).\n"
                    f"Context: {_last_sequence_context or 'N/A'}\n"
                )

            to_zip = set(_last_files or [])
            to_zip.update([core_py, readme])

            zip_name = "DNAlytics_project_Javeria_Butt.zip"
            with zipfile.ZipFile(zip_name, "w") as zf:
                for f in sorted(to_zip):
                    # Skip outputs that were listed but never written
                    # (e.g. a plot that was skipped for empty data).
                    if os.path.exists(f):
                        zf.write(f)
            print(f"Created ZIP: {zip_name}")

            # Auto-download in Colab
            try:
                from google.colab import files as _colab_files
                _colab_files.download(zip_name)
            except Exception:
                # Point the user at the archive that was actually created.
                print(f"If download didn't start, use: from google.colab import files; files.download('{zip_name}')")

        except Exception as e:
            print("Error creating ZIP:", e)

# Connect each button to its callback.
analyze_btn.on_click(on_analyze_clicked)
zip_btn.on_click(on_zip_clicked)

# Layout
# One HBox per row. rows 3-5 hold the source-specific inputs whose visibility
# is toggled by _visible_by_source(); `out` is displayed last, below the buttons.
row1 = HBox([source_dd, motif_box])
row2 = HBox([email_box])
row3 = HBox([dna_area])
row4 = HBox([acc_box])
row5 = HBox([gene_box, org_box])
row6 = HBox([analyze_btn, zip_btn])

display(row1, row2, row3, row4, row5, row6, out)


HBox(children=(Dropdown(description='Input:', options=('Paste DNA', 'NCBI Accession', 'Gene Search'), value='P…

HBox(children=(Text(value='', description='NCBI Email:', layout=Layout(width='50%'), placeholder='your_email@e…

HBox(children=(Textarea(value='', description='DNA:', layout=Layout(display='block', height='120px', width='70…

HBox(children=(Text(value='', description='Accession:', layout=Layout(display='none', width='50%'), placeholde…

HBox(children=(Text(value='', description='Gene:', layout=Layout(display='none', width='40%'), placeholder='e.…

HBox(children=(Button(button_style='success', description='Analyze', icon='check', style=ButtonStyle()), Butto…

Output()