In [1]:
import os
from pathlib import Path
from Bio import SeqIO

from local.constants import WORKSPACE_ROOT

# Directory that receives every intermediate and output file written by this notebook.
ws = Path("cache") / "epi300"
os.makedirs(ws, exist_ok=True)

In [2]:
# Walk every record of the K-12 MG1655 GenBank file and pull out the sequence of
# each feature typed "rep_origin". `oriC` ends up holding the last one seen.
k12_genbank = WORKSPACE_ROOT/"data/references/reference_genomes/k12mg1655/k12mg1655.gb"
for record in SeqIO.parse(k12_genbank, "genbank"):
    for origin in record.features:
        if origin.type == "rep_origin":
            print(origin)
            # Slice the feature's location out of the record sequence as a plain string.
            oriC = str(origin.extract(record.seq))
            print(oriC)

type: rep_origin
location: [3925743:3925975](+)
qualifiers:
    Key: note, Value: ['oriC']

GATCTATTTATTTAGAGATCTGTTCTATTGTGATCTCTTATTAGGATCGCACTGCCCTGTGGATAACAAGGATCCGGCTTTTAAGATCAACAACCTGGAAAGGATCATTAACTGTGAATGATCGGTGATCCTGGACCGTATAAGCTGGGATCAGAATGAGGGGTTATACACAACTCAAAAACTGAACAACAGTTGTTCTTTGGATAACTACCGGTTGATCCAAGCTTCCTGA


In [3]:
# Only the first record of the assembly FASTA is used; later records are never read.
first_record = next(SeqIO.parse(WORKSPACE_ROOT/"data/assembly/epi300.fna", "fasta"), None)
if first_record is not None:
    epi300_desc = first_record.description  # full FASTA header, reused when re-writing the file
    epi300 = str(first_record.seq)
    print(epi300[:100])

TGTTTCCCGGTGAGGACGGTTACAGCCGCAGCGAGTCACTGTGGCTGGTGCGCGGCGGCGTGGCGAAACTGGATGAAGGTCACCGGCTGGCCGCACTCTG


In [4]:
# Rotate the assembly so that it starts at the first base of oriC: everything that
# preceded the match is moved to the end. str.index raises ValueError when oriC
# does not occur in the sequence as given.
i = epi300.index(oriC)
epi300_aligned = "".join((epi300[i:], epi300[:i]))
print(epi300_aligned[:100])

GATCTATTTATTTAGAGATCTGTTCTATTGTGATCTCTTATTAGGATCGCACTGCCCTGTGGATAACAAGGATCCGGCTTTTAAGATCAACAACCTGGAA


In [5]:
# Save the oriC-rotated assembly as a single-record FASTA (header line + one sequence line).
epi_path = ws/"epi300_aligned.fna"
with open(epi_path, "w") as f:
    f.write(f">{epi300_desc}\n")
    # End the sequence line with a newline so the file is a well-formed text file;
    # without it, concatenating this FASTA with another would fuse the next header
    # onto the sequence.
    f.write(f"{epi300_aligned}\n")

In [6]:
"123456"[:5]

'12345'

In [7]:
# Protein FASTA of the K-12 MG1655 reference ORFs; used as the tblastn query.
k12_orfs = WORKSPACE_ROOT/"data/references/reference_genomes/k12mg1655/k12mg1655.faa"
import subprocess
import pandas as pd

# Fields requested from BLAST tabular output (-outfmt 6), in the order they appear in the TSV.
COLS = "qseqid sseqid qstart qend sstart send nident qlen slen".split(" ")
blast_result = ws/"temp.result.tsv"
# The finished TSV doubles as the cache marker, so it must only ever exist when complete.
if not blast_result.exists():
    # check=True aborts on the first failing tool instead of silently continuing
    # (os.system discarded the exit status). cwd=ws keeps the db files inside the workspace.
    subprocess.run(
        [
            "makeblastdb",
            "-dbtype", "nucl",
            "-in", str(epi_path.resolve()),
            "-out", "db",
        ],
        cwd=ws,
        check=True,
    )

    print("blasting")
    # Write to a side file and rename only after tblastn succeeds, so an interrupted
    # or failed run can never be mistaken for a valid cached result.
    partial_result = blast_result.with_name(blast_result.name + ".partial")
    subprocess.run(
        [
            "tblastn",
            "-num_threads", "14",
            # Resolve the query path: the tools run with cwd=ws, which would break a relative path.
            "-query", str(Path(k12_orfs).resolve()),
            "-outfmt", f"6 {' '.join(COLS)}",
            "-db", "db",
            "-out", partial_result.name,
        ],
        cwd=ws,
        check=True,
    )
    partial_result.replace(blast_result)

In [8]:
# Index the K-12 reference proteins by FASTA id: full header line and sequence length.
k12_records = list(SeqIO.parse(k12_orfs, "fasta"))
k12id2desc = {rec.id: rec.description for rec in k12_records}
k122len = {rec.id: len(rec.seq) for rec in k12_records}

In [9]:
# Load the tabular BLAST output and keep, per query ORF, the hit with the most
# identical positions. Ties keep the hit that appears first in the file.
dfe = pd.read_csv(blast_result, sep="\t", header=None, names=COLS)
mapping = {}  # qseqid -> ((sstart, send), nident, qlen)
for _, hit in dfe.iterrows():
    best_so_far = mapping.get(hit.qseqid)
    # Replace only when the new hit is not "less than or equal to" the stored score.
    if best_so_far is None or not (hit.nident <= best_so_far[1]):
        mapping[hit.qseqid] = ((hit.sstart, hit.send), hit.nident, hit.qlen)
len(mapping)

4283

In [10]:
# Number of K-12 ORFs read from the protein FASTA, for comparison with the number of ORFs that got a BLAST hit.
len(k12id2desc)

4298

In [11]:
import numpy as np

BAR_WIDTH = 10  # number of characters in the text identity bar

one_shorts = []  # ORFs whose best hit is exactly one identity short of the full length
changed = []     # (identity fraction, formatted report line) for every other imperfect ORF
rows = []        # (rounded percent identity, description, length) records for the CSV export
for orf, desc in k12id2desc.items():
    if orf in mapping:
        _, nident, length = mapping[orf]
    else:
        # No BLAST hit at all: zero identities over the ORF's own length.
        nident, length = 0, k122len[orf]

    shortfall = length - nident
    if shortfall == 0:
        continue  # perfect match, nothing to report
    if shortfall == 1:
        one_shorts.append(orf)
        continue

    fraction = nident / length
    pct = int(np.round(fraction * 100, 0))
    bars = int(np.round(fraction * BAR_WIDTH, 0))
    # Layout: right-aligned percent, a '>' marker for any non-zero percent, then the padded bar.
    line = f"{pct:>2}% {'>' if pct > 0 else ' '}{('=' * bars).ljust(BAR_WIDTH)} {desc} {length}"
    changed.append((fraction, line))
    rows.append((pct, desc, length))

# Least-conserved ORFs first; the sort is stable, so ties keep FASTA order.
changed.sort(key=lambda entry: entry[0])
for _, line in changed:
    print(line)

 0%             NP_414542.1 thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655] 21
 0%             NP_416114.2 periplasmic chaperone Asr [Escherichia coli str. K-12 substr. MG1655] 102
 0%             NP_417089.1 phe operon leader peptide [Escherichia coli str. K-12 substr. MG1655] 15
 0%             YP_001165328.2 membrane toxin DinQ [Escherichia coli str. K-12 substr. MG1655] 27
 0%             YP_001165331.1 membrane-depolarizing toxin TisB [Escherichia coli str. K-12 substr. MG1655] 29
 0%             YP_002791247.1 toxic peptide IbsA [Escherichia coli str. K-12 substr. MG1655] 19
 0%             YP_002791248.1 putative toxic peptide IbsB [Escherichia coli str. K-12 substr. MG1655] 18
 0%             YP_002791249.1 putative membrane protein YohP [Escherichia coli str. K-12 substr. MG1655] 27
 0%             YP_002791255.1 toxic peptide IbsC [Escherichia coli str. K-12 substr. MG1655] 19
 0%             YP_002791256.1 putative toxic peptide IbsD [Escherichia coli s

In [13]:
# Export the imperfectly matched ORFs, ordered by rounded percent identity (lowest first).
df = (
    pd.DataFrame(rows, columns=["nearest_match", "description", "length"])
    .sort_values(by="nearest_match")
)
df.to_csv(ws/"changed_at_least_2nt.epi300_vs_k12mg1655.csv", index=False)