# Create the amplicon to reference file

In [1]:
import re

import Bio.SeqIO

import pandas as pd

# Full-length ORFs: maps each GenBank record id to the nucleotide sequence
# of that record's first CDS feature.
orf_seqs = {}
for s in Bio.SeqIO.parse("../flu-CA09.gb", "genbank"):
    coding = [feat for feat in s.features if feat.type == "CDS"]
    # every record must carry at least one CDS; only the first one is used
    assert len(coding) >= 1, f"{s=}\n{s.features=}"
    first_cds, *_ = coding
    orf = str(first_cds.extract(s).seq)
    orf_seqs[s.id] = orf
    # sanity check: the extracted ORF must begin with the ATG start codon
    assert orf.startswith("ATG")

# Sequences for PacBio, read from GenBank into a list so they can be
# iterated after parsing.
seqs = [*Bio.SeqIO.parse("fluCA09.gb", "genbank")]

# Feature types that each get their own position column in the output table.
expected_features_of_interest = [
    "parsed_ORF_1",
    "parsed_ORF_2",
    "termini5",
    "parsed_termini3",
]

def _append_feature_positions(data, features, names, site):
    """Append to ``data[name]`` the 1-based position of ``site`` in each feature.

    For every feature name in ``names``, appends ``site - start + 1`` when
    ``start <= site < end`` for that feature and ``None`` otherwise, so each
    feature column grows by exactly one entry per call.
    """
    for name in names:
        start = features[name]["start"]
        if start <= site < features[name]["end"]:
            data[name].append(site - start + 1)
        else:
            data[name].append(None)


# one data frame per target, concatenated after the loop
dfs = []

for seq in seqs:
    target = seq.id
    print(f"\n{target=}")
    # the gene name is the leading run of capitals/digits of the target id
    # (e.g. "NPmid" -> "NP"); fail with a clear message if there is none
    prefix = re.match(r"[A-Z\d]+", target)
    if prefix is None:
        raise ValueError(f"cannot derive gene name from target id {target!r}")
    transcript = "flu" + prefix.group(0)
    print(f"{transcript=}")
    gene = transcript
    print(f"{gene=}")
    # keyed by feature type, so if two features share a type the later one
    # in the record replaces the earlier one
    features = {
        f.type: {
            "start": int(f.location.start),
            "end": int(f.location.end),
            "seq": str(f.extract(seq).seq),
        }
        for f in seq.features
    }
    gene_start = features["termini5"]["start"]
    orf_start = features["ORF_2"]["start"]
    assert orf_start > gene_start
    seqstr = str(seq.seq)
    print(f"{gene_start=}, {orf_start=}")

    # keep only the expected features that this target actually has
    features_of_interest = []
    for f in expected_features_of_interest:
        if f not in features:
            print(f"Missing {f}")
        else:
            features_of_interest.append(f)
            print(f"{f} spans {features[f]['start']} to {features[f]['end']}")

    # scalar entries are repeated for every row; list entries get one item
    # appended per site by the two loops below
    data = {
        "target": target,
        "transcript": transcript,
        "gene": gene,
        "wt_nt": [],
        "ORF_position": [],
        **{f: [] for f in features_of_interest},
    }

    # loop-invariant length of the reference ORF for this gene
    orf_len = len(orf_seqs[gene])

    # sites downstream of start of termini5
    # ORF_position has no position 0: sites before the ORF start count up
    # through negative numbers to -1, and the first ORF site is numbered 1
    i_orf = gene_start - orf_start
    for i in range(gene_start, len(seqstr)):
        data["wt_nt"].append(seqstr[i])
        data["ORF_position"].append(i_orf)
        # NOTE(review): the strict upper bound leaves the site at
        # i_orf == orf_len (the last ORF nucleotide) unchecked -- confirm
        # whether that is intended before tightening it
        if 0 < i_orf < orf_len:
            assert seqstr[i] == orf_seqs[gene][i_orf - 1], f"{i_orf=}, {seqstr[i]=}, {orf_seqs[gene][i_orf - 1]=}"
        i_orf += 1
        if i_orf == 0:
            i_orf = 1  # skip 0 so that -1 is followed directly by 1
        _append_feature_positions(data, features, features_of_interest, i)

    # sites upstream of start of termini5
    # note that primer_binding_site_1 is duplicate of primer_binding_site_2
    # ORF numbering simply continues from where the downstream loop stopped
    start_after_binding_site = features["parsed_ORF_1"]["start"]
    for i_target in range(start_after_binding_site, gene_start):
        wt = seqstr[i_target]
        data["wt_nt"].append(wt)
        data["ORF_position"].append(i_orf)
        assert i_orf > 0
        # same bounds as the ORF check in the downstream loop
        if 0 < i_orf < orf_len:
            assert wt == orf_seqs[gene][i_orf - 1], f"{wt=}, {orf_seqs[gene][i_orf - 1]=}, {i_orf=}"
        i_orf += 1
        _append_feature_positions(data, features, features_of_interest, i_target)

    dfs.append(pd.DataFrame(data))

df = pd.concat(dfs, ignore_index=True)
# nullable integer dtype keeps feature positions integral where sites fall
# outside a feature and therefore have no position
for f in expected_features_of_interest:
    df[f] = df[f].astype("Int64")


target='NPmid'
transcript='fluNP'
gene='fluNP'
gene_start=849, orf_start=894
parsed_ORF_1 spans 24 to 770
parsed_ORF_2 spans 894 to 1621
termini5 spans 849 to 894
parsed_termini3 spans 770 to 771

target='PAtermini'
transcript='fluPA'
gene='fluPA'
gene_start=2258, orf_start=2282
parsed_ORF_1 spans 25 to 2144
parsed_ORF_2 spans 2282 to 2289
termini5 spans 2258 to 2282
parsed_termini3 spans 2144 to 2176

target='PAmid'
transcript='fluPA'
gene='fluPA'
gene_start=1234, orf_start=1258
parsed_ORF_1 spans 32 to 1120
parsed_ORF_2 spans 1258 to 2289
termini5 spans 1234 to 1258
parsed_termini3 spans 1120 to 1152

target='HAtermini'
transcript='fluHA'
gene='fluHA'
gene_start=1979, orf_start=2084
parsed_ORF_1 spans 31 to 1694
parsed_ORF_2 spans 2084 to 2091
termini5 spans 1979 to 2084
parsed_termini3 spans 1694 to 1898

target='NAmid'
transcript='fluNA'
gene='fluNA'
gene_start=1199, orf_start=1259
parsed_ORF_1 spans 25 to 935
parsed_ORF_2 spans 1259 to 1791
termini5 spans 1199 to 1259
parsed_term

In [2]:
# write the per-site table to CSV without the data frame's row index
df.to_csv(path_or_buf="amplicon_to_reference.csv", index=False)