In [1]:
from Bio import SeqIO
import re
import pandas as pd
import os
from pathlib import Path
from dataclasses import dataclass

from local.constants import WORKSPACE_ROOT
from local.caching import load, save
# Per-notebook output directory; the figure cell writes its SVG into it.
ws = Path("./cache/epi300_genotype")
# parents=True: on a fresh checkout ./cache may not exist yet, and a plain
# mkdir(exist_ok=True) would raise FileNotFoundError for the missing parent.
ws.mkdir(parents=True, exist_ok=True)

In [3]:
k12_gb_path = WORKSPACE_ROOT/"data/pangenome_genbanks/MG1655.gb"
# Path to the MG1655 reference FASTA; not read by any of the cells shown here.
k12_ref_path = WORKSPACE_ROOT/"data/reference_genomes/MG1655/ncbi_dataset/data/GCF_000005845.2/GCF_000005845.2_ASM584v2_genomic.fna"

# Pull the replication-origin sequence out of the MG1655 GenBank record.
# If several rep_origin features exist, the last one seen is kept.
oriC = None
for e in SeqIO.parse(k12_gb_path, "genbank"):
    for feature in e.features:
        if feature.type != "rep_origin":
            continue
        print(feature)
        # extract() slices the record sequence at the feature's location.
        oriC = str(feature.extract(e.seq))
        print(oriC)

# Fail in this cell with a clear message instead of a NameError further down.
assert oriC is not None, f"no rep_origin feature found in {k12_gb_path}"

type: rep_origin
location: [3925743:3925975](+)
qualifiers:
    Key: note, Value: ['oriC']

GATCTATTTATTTAGAGATCTGTTCTATTGTGATCTCTTATTAGGATCGCACTGCCCTGTGGATAACAAGGATCCGGCTTTTAAGATCAACAACCTGGAAAGGATCATTAACTGTGAATGATCGGTGATCCTGGACCGTATAAGCTGGGATCAGAATGAGGGGTTATACACAACTCAAAAACTGAACAACAGTTGTTCTTTGGATAACTACCGGTTGATCCAAGCTTCCTGA


In [4]:
epi300_asm_path = WORKSPACE_ROOT/"data/assembly/epi300.fna"

# Only the first FASTA record is used.
first_record = next(iter(SeqIO.parse(epi300_asm_path, "fasta")), None)
assert first_record is not None, f"no sequences found in {epi300_asm_path}"
epi300_desc = first_record.description
epi300_seq = str(first_record.seq)

# Literal, non-overlapping substring count. The origin is plain text, not a
# regular expression, so str.count is used instead of re.findall.
n_hits = epi300_seq.count(oriC)
# Both count() and index() search the sequence exactly as stored, so an origin
# present only as its reverse complement would not be found.
assert n_hits > 0, "oriC sequence not found in the assembly as given (forward orientation)"
# 0-based offset of the first occurrence; used as the angular zero of the map.
loc_oriC = epi300_seq.index(oriC)
print(loc_oriC, n_hits)

163192 1


In [5]:
import numpy as np
from local.figures.template import BaseFigure, ApplyTemplate, go

from local.figures.base.layout import Canvas, Panel, Transform
from local.figures.base.coordinates import to_cart
from local.figures.base.geometry import Brush
from local.figures.base.text import TextPlotter
from local.figures.colors import COLORS, Color, Palettes, XColor, ListOfXColor

In [6]:
# Only referenced from a commented-out line in An.GetAngle in the figure cell.
# Presumably the coordinate shift from the K-12 reference to EPI300 — TODO confirm.
k12toEpi300 = 827981
# Number of characters in the EPI300 sequence string; used to wrap positions
# and to convert them to angles.
genome_length = len(epi300_seq)
genome_length

4691561

In [8]:
# rRNA predictions in GFF layout: tab-separated, '#' comment lines, no header row.
df_rrna = pd.read_csv(WORKSPACE_ROOT/"data/annotations/epi300.barrnap.gff", sep="\t", comment="#", header=None)
rrnas = []
for _, r in df_rrna.iterrows():
    loc = r[4]  # fifth GFF column: feature end coordinate
    # Ninth column holds ';'-separated key=value attributes. Empty fragments
    # (e.g. from a trailing ';') are skipped, matching the tRNA parser, and each
    # fragment is split on the first '=' only so values containing '=' survive.
    meta = dict(m.split("=", 1) for m in r[8].split(";") if m)
    # Keep only the first word of the product description (e.g. "16S").
    rrnas.append((meta["product"].split(" ")[0], loc))
len(rrnas)

22

In [9]:
# Group rRNA hits into clusters. A hit joins the first existing cluster whose
# anchor (the position of the hit that opened it) is less than 10000 away;
# otherwise it opens a new cluster anchored at its own position.
rrna_annots = []
for rna, loc in rrnas:
    for anchor, members in rrna_annots:
        if abs(anchor - loc) < 10000:
            members.append(rna)
            break
    else:
        # No existing cluster was close enough.
        rrna_annots.append((loc, [rna]))
rrna_annots

[(182133, ['16S', '23S', '5S']),
 (275856, ['16S', '23S', '5S']),
 (407761, ['16S', '23S', '5S']),
 (449249, ['16S', '23S', '5S']),
 (1028733, ['16S', '23S', '5S']),
 (3649374, ['5S', '23S', '16S']),
 (4352708, ['5S', '5S', '23S', '16S'])]

In [11]:
df_trna = pd.read_csv(WORKSPACE_ROOT/"data/annotations/epi300.tRNAscan-se.gff", sep="\t", comment="#", header=None)
trnas = []
# Only rows whose third column is exactly "tRNA" are kept.
for _, row in df_trna[df_trna[2] == "tRNA"].iterrows():
    # Ninth column: ';'-separated key=value attributes; empty fragments are ignored.
    attrs = {}
    for field in row[8].split(";"):
        if not field:
            continue
        pieces = field.split("=")
        attrs[pieces[0]] = pieces[1]
    # The tRNA name is the second '.'-separated token of the Name attribute;
    # the position is the fifth GFF column (feature end).
    trnas.append((attrs["Name"].split(".")[1], row[4]))
len(trnas)

88

In [17]:
from Bio.SeqFeature import ExactPosition, CompoundLocation

def get_pos(e):
    """Return one representative coordinate (the midpoint) of feature ``e``.

    For a simple location this is the integer midpoint of start and end.
    For a ``CompoundLocation`` the parts are walked in their stored order
    until the part containing the middle base (by cumulative length) is
    found, and that base's genomic coordinate is returned.

    Reads the module-level ``genome_length`` to wrap coordinates that fall
    past the end of the circular genome.

    Raises:
        ValueError: if no part of a compound location contains the midpoint
            (e.g. every part has zero length).
    """
    location = e.location
    if not isinstance(location, CompoundLocation):
        # Simple linear feature
        return (location.start + location.end) // 2

    parts = list(location.parts)
    # Offset of the middle base, counted along the concatenated parts.
    middle_offset = sum(len(part) for part in parts) // 2

    consumed = 0
    for part in parts:
        part_length = len(part)
        if consumed + part_length > middle_offset:
            offset_in_part = middle_offset - consumed
            if part.strand == -1:
                # A reverse-strand part is read from its end towards its start,
                # so the k-th base along the feature sits k bases below end-1.
                # NOTE: assumes `parts` is in reading order, as Biopython stores
                # it for parsed complement(join(...)) locations — TODO confirm
                # for locations built by other means.
                offset_in_part = part_length - 1 - offset_in_part
            middle_position = part.start + offset_in_part
            # Handle circular wrap-around
            if middle_position >= genome_length:
                middle_position -= genome_length
            return middle_position
        consumed += part_length

    raise ValueError(f"cannot locate the midpoint of {location!r}")

# gene2loc: label (gene name, else product) -> midpoint position of the CDS.
# Labels are not unique, so a later CDS with the same label overwrites an earlier one.
# cds_locations: midpoint of every CDS, duplicates included.
gene2loc = {}
cds_locations = []
for g in SeqIO.parse(WORKSPACE_ROOT/"data/pangenome_genbanks/EPI300.gb", "genbank"):
    for e in (feat for feat in g.features if feat.type == "CDS"):
        pos = get_pos(e)
        cds_locations.append(pos)
        quals = e.qualifiers
        # Prefer the "gene" qualifier, fall back to "product".
        key = next((quals[k][0] for k in ("gene", "product") if k in quals), None)
        assert key is not None
        gene2loc[key] = pos
len(gene2loc), len(cds_locations)

(4159, 4475)

In [16]:
# One array of single-character strings per FASTA record.
contigs = [
    np.array(list(str(record.seq)))
    for record in SeqIO.parse(f"{WORKSPACE_ROOT}/data/assembly/epi300.fna", "fasta")
]
# The rest of the notebook treats the assembly as a single sequence.
assert len(contigs)==1
genome = np.hstack(contigs)
genome.shape

(4691561,)

In [18]:
# Boolean mask over the genome: True where the character is "G" or "C"
# (the comparison is case-sensitive).
gc = np.isin(genome, ["G", "C"])
# Fraction of positions that are G or C.
gc.sum()/len(gc)

np.float64(0.5078058241169624)

In [20]:
def calc_skew(gc: np.ndarray, window: int, step: int):
    """Bin ``gc`` into ``step``-sized means and smooth them circularly.

    Despite the name, no skew is computed: the result is a moving average of
    the input values (a fraction when ``gc`` is a boolean mask).

    Args:
        gc: 1-D array of per-position values.
        window: smoothing window in positions; must be a multiple of ``step``.
        step: bin size in positions. Trailing positions that do not fill a
            whole bin are dropped.

    Returns:
        1-D array with ``len(gc) // step`` smoothed bin values.
    """
    assert window%step==0
    bins_per_window = window // step
    n_bins = len(gc) // step

    # Mean of each complete bin.
    binned = gc[: n_bins * step].reshape(-1, step).mean(axis=1)

    # Pad both ends with bins from the opposite end so the moving average
    # wraps around, then trim the padding off again after smoothing.
    padded = np.concatenate(
        (binned[-bins_per_window:], binned, binned[:bins_per_window])
    )
    box = np.ones(bins_per_window) / bins_per_window
    smoothed = np.convolve(padded, box, mode="same")
    return smoothed[bins_per_window:-bins_per_window]

# Smoothed G/C fraction per 10000-position bin, averaged over a 20000-position
# window. The step here has to match STEP in the figure cell.
gc_skew = calc_skew(gc, 20000, 10000)
gc_skew.shape

(469,)

In [21]:
# Figure object handed to the text plotter and later to the canvas renderer.
fig = BaseFigure()
# Output size passed to the layout; also used for the panel's aspect correction.
WIDTH, HEIGHT = 800, 600

# Default colour for annotations, the main brush and label text.
BLACK = Color.Hex("212121")
@dataclass
class An:
    """One radial tick, optionally labelled, on the circular genome map."""
    desc: str  # label text; an empty string draws the tick without a label
    location: int # nucleotide position in the EPI300 assembly (same coordinates as loc_oriC)
    rad_start: float = 0  # radial start of the tick, as an offset added to rad_chr
    rad_end: float = 0.05  # radial end of the tick, as an offset added to rad_chr
    fontsize: int = 12
    color: XColor = BLACK
    dx: float=0  # manual label offset in x
    dy: float=0  # manual label offset in y
    r: float=0.005  # angular extent of the tick, added to the start angle

    def GetAngle(self):
        """Return the angle in radians of ``location``, measured from oriC.

        The position is taken relative to ``loc_oriC`` and wrapped into
        ``[0, genome_length)``, so the result lies in ``[0, 2*pi)``.
        """
        # Conversion from K-12 coordinates, currently disabled:
        # loc = self.location + k12toEpi300 - loc_oriC
        # Modulo wraps negative and overshooting positions in one step and maps
        # loc == genome_length to 0 rather than to a full turn.
        loc = (self.location - loc_oriC) % genome_length
        angle = loc/genome_length * 2*np.pi
        return angle

def get_loc(genes):
    """Return the mean ``gene2loc`` position of ``genes`` as an ``ExactPosition``.

    Raises ``KeyError`` for a gene missing from ``gene2loc`` and
    ``ZeroDivisionError`` for an empty ``genes`` list.
    """
    positions = list(map(gene2loc.__getitem__, genes))
    mean_position = sum(positions) / len(positions)
    # The mean is a float; presumably ExactPosition truncates it to an integer — TODO confirm.
    return ExactPosition(mean_position)

# Manual label offsets (dx, dy), keyed by the annotation's label text.
nudges = {
    "φ80dlacZΔM15<br>ΔlacX74": (-0.02, -0.02),
    "galU": (0, +0.01),
}

# F- mcrA Δ(mrr-hsdRMS-mcrBC) φ80dlacZΔM15 ΔlacX74 recA1 endA1 araD139 Δ(ara, leu)7697 galU galK λ- rpsL nupG trfA DHFR
r_offset = 0.1

# (genes whose mean position anchors the label, label text)
genotype_markers = [
    (["yajD"],          "mcrA"),
    (["opgB", "nanC"],  "Δ(mrr-hsdRMS-mcrBC)"),
    (["lacZ"],          "φ80dlacZΔM15<br>ΔlacX74"),
    (["recA"],          "recA1"),
    (["endA", "nupG"],  "endA1<br>nupG"),
    (["folA"],          "DHFR (folA)<br>araD139<br>Δ(ara, leu)7697"),
    (["galU"],          "galU"),
    (["galK"],          "galK"),
    (["rpsL"],          "rpsL"),
    (["trfA"],          "trfA"),
]

# Labelled genotype ticks, drawn with the default radial extent and colour.
gene_marks = [An(label, get_loc(genes)) for genes, label in genotype_markers]
# Unlabelled tracks at successive radial offsets: tRNA, then rRNA clusters, then every CDS.
trna_marks = [
    An("", pos, rad_start=-0.03, rad_end=-0.06, color=Palettes.PLOTLY[0], r=0.01)
    for _, pos in trnas
]
rrna_marks = [
    An("", pos, rad_start=-0.06, rad_end=-0.09, color=Palettes.PLOTLY[4], r=0.01)
    for pos, _ in rrna_annots
]
cds_marks = [
    An("", pos, rad_start=0, rad_end=-0.03, r=0.001, color=BLACK)
    for pos in cds_locations
]

# Order matters: later code creates one brush per colour in first-seen order.
annotations: list[An] = gene_marks + trna_marks + rrna_marks + cds_marks

# Apply the manual label offsets.
for a in annotations:
    offset = nudges.get(a.desc)
    if offset is not None:
        a.dx, a.dy = offset

# Root canvas, a text plotter bound to the figure, and the centre title.
root = Canvas()
text = TextPlotter(fig)
text.Write("EPI300", x=0, y=0, size=12)
# Single panel whose transform chain combines a fixed 8/6 x-scale with the
# HEIGHT/WIDTH aspect ratio.
panel = root.NewPanel(Transform(sx=8/6)+Transform(sx=HEIGHT/WIDTH, sy=1))
def text_xy(x, y):
    """Map one (x, y) point through the panel's transforms; used to place text."""
    return panel.ApplyTransforms(np.array([[x, y]]))[0]
# Brush for the chromosome ring, the oriC marker and the GC reference circles.
brush_main = Brush(BLACK)

# Radius of the chromosome ring; every track is positioned relative to it.
rad_chr = 0.3

# oriC marker: a triangle with its tip at (0, rad_chr) and its base triangle_w
# further out. Points and commands are appended to the brush's private lists
# directly; "MLL" is presumably move-line-line path commands — TODO confirm.
triangle_w = 0.02
brush_main._pts.append(np.array([
    [0, rad_chr],
    [0+triangle_w/2, rad_chr+triangle_w],
    [0-triangle_w/2, rad_chr+triangle_w],
]))
brush_main._cmds.append("MLL")
x, y = text_xy(0, rad_chr+triangle_w)
text.Write("oriC", x, y, size=12, color=BLACK.color_value, yanchor="bottom")

# Chromosome ring: a full-circle arc of thickness _w.
_w = 0.005
brush_main.EllipticalArc(x_rad=rad_chr+_w/2, end_angle=2*np.pi, width=_w)
# One brush per distinct annotation colour, keyed by str(color). Insertion
# order follows the first appearance of each colour in `annotations`.
brushes: dict[str, Brush] = {}
for a in annotations:
    c = a.color
    k =  str(c)
    if k in brushes: continue
    brushes[k] = Brush(c)

# Draw every annotation as a short arc segment, plus its label if it has one.
for a in annotations:
    # Radial span of the tick, its centre radius and its thickness.
    l_start = rad_chr + a.rad_start
    l_end = rad_chr + a.rad_end
    l_c = (l_start+l_end)/2
    l_w = abs(l_end-l_start)
    r = a.GetAngle()
    # Label anchor: the tick's far end, converted to panel coordinates.
    x, y = to_cart(r, l_end)
    x, y = text_xy(x, y)
    # Pick the text anchor from which half of the figure the label is in and
    # whether the tick points inward (negative rad_end).
    left = x < 0
    inside = a.rad_end < 0
    aln = "left" if left == inside else "right"
    if a.desc:
        text.Write(
            a.desc, x+a.dx, y+0.015+a.dy, size=a.fontsize, color=BLACK.color_value,
            xanchor=aln, yanchor="top",
            align=aln,
        )

    # Tick itself: an arc from angle r to r + a.r, centred radially at l_c.
    b = brushes[str(a.color)]
    b.EllipticalArc(
        x_rad=l_c,
        start_angle=r,
        end_angle=r+a.r,
        width=l_w,
        resolution=1,
    )

# GC track: one radial bar per bin of gc_skew, coloured by whether the bin is
# at/above 0.5 (brush_gcp) or below it (brush_gcn).
brush_gcp = Brush(Palettes.PLOTLY[2])
brush_gcn = Brush(Palettes.PLOTLY[1])
# brush_gcp = Brush(BLACK)
# brush_gcn = Brush(BLACK)
# Registered after the per-colour brushes so they come last in the legend loop.
brushes["gcp"] = brush_gcp
brushes["gcn"] = brush_gcn
# Bin size in positions; must equal the step passed to calc_skew.
STEP = 10000
# Scratch annotation used only to turn positions into angles.
_a = An("", 0)
# dr: angular width of one STEP-sized bin.
dr = _a.GetAngle()
_a.location=STEP
dr = abs(dr-_a.GetAngle())
# Append the first bin again, giving one more arc than there are bins;
# presumably to cover the partial bin that calc_skew drops at the end — TODO confirm.
_skew = np.hstack((gc_skew, gc_skew[:1]))
# W/R is the radial length drawn per unit of deviation from 0.5.
W = 0.2
_w = 0.0005
R = 0.2
# Baseline radius (value == 0.5), shifted inward from rad_chr-0.09 by the radial
# length of the largest gc_skew value's deviation.
l_c = rad_chr-0.09-(abs(0.5-gc_skew.max())*W/R)
# Two thin reference circles at W/4 either side of the baseline.
brush_main.EllipticalArc(x_rad=l_c+W/4, end_angle=2*np.pi, width=_w)
brush_main.EllipticalArc(x_rad=l_c-W/4, end_angle=2*np.pi, width=_w)
for i, v in enumerate(_skew):
    _a.location = i*STEP
    r = _a.GetAngle()
    if v>=0.5:
        b = brush_gcp
    else:
        b = brush_gcn
    # Signed radial length of the bar: negative when v > 0.5.
    w = (0.5-v)*W/R
    # Bar is centred at m with thickness |w|, so it spans the baseline to l_c - w.
    m = l_c-w/2
    b.EllipticalArc(
        x_rad=m,
        start_angle=r,
        end_angle=r+dr,
        width=abs(w),
        resolution=1,
    )

panel.AddElement(brush_main)
# Legend: one short line sample per brush, paired with labels by position.
# This relies on the insertion order of `brushes` (the annotation colours in
# first-seen order, then "gcp", then "gcn") lining up with the label list.
# NOTE(review): the GC label reports 50±R/2, but the reference circles are drawn
# at l_c ± W/4 with bars scaled by W/R, which looks like 50% ± R/4 — confirm
# against Brush.EllipticalArc's x_rad/width semantics before changing the label.
x, y = -0.5, -1/4
for (k, b), label in zip(brushes.items(), ["CDS", "tRNA", "rRNA", f"GC content, lines at 50±{R/2:.0%}", ""]):
    w = 0.005
    b.Line(x, y, x+0.05, y, w=w)
    # "gcn" gets a sample line but no text: it shares the "gcp" label.
    if k != "gcn":
        tx, ty = text_xy(x+0.06, y)
        # Shift the shared GC label down by half a line width.
        if k == "gcp": ty -= w/2
        text.Write(label, tx, ty, size=12, xanchor="left")
    # The "gcn" sample sits directly under "gcp"; other rows are spaced 0.03 apart.
    y-= w if k == "gcp" else 0.03
    # Each brush is added to the panel here, after all its shapes are recorded.
    panel.AddElement(b)

# Render the canvas into the figure, hide both axes, and fix font and size.
fig = root.Render(fig=fig, debug=False)
da = dict(showticklabels=False, ticks=None, linecolor=COLORS.TRANSPARENT)
fig = ApplyTemplate(
    fig, 
    default_xaxis=da, default_yaxis=da,
    layout=dict(
        font=dict(
            family="Arial",
        ),
        width=WIDTH, height=HEIGHT,
    ),
)
# Export the SVG into the notebook's cache directory, then display inline.
fig.write_image(ws/"Figure1_epi300_genotype.svg")
fig.show(config=dict(
    scrollZoom=False
))

In [22]:
# Range of the smoothed G/C fraction, for checking the GC track's radial scale.
gc_skew.min(), gc_skew.max(), 

(np.float64(0.41695), np.float64(0.5592))

In [23]:
from pathlib import Path


# Scratch check: absolute path of the notebook's current working directory.
Path(".").absolute()

PosixPath('/home/tony/workspace/projects/EPI300_genome_announcement/main/notebooks')

In [24]:
# Largest distance between consecutive gene positions in sorted order.
# The gap that wraps from the last gene round to the first is not included.
gene_positions = np.sort(np.array(list(gene2loc.values())))
np.diff(gene_positions).max()

np.int64(98781)