In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import tszip

In [2]:
regions = [
    ("ORF1AB", 266, 21555),
    ("S", 21563, 25384),
    ("ORF3", 25393, 26220),
    ("E", 26245, 26472),
    ("M", 26523, 27191),
    ("ORF6", 27202, 27387),
    ("ORF7a", 27394, 27759),
    ("ORF7b", 27756, 27887),
    ("ORF8", 27894, 28259),
    ("N", 28274, 29533),
    ("ORF10", 29558, 29674)]

def get_genome_region(pos): 
    for label, start, end in regions:
        if start <= pos <= end:
            return label
    return "non-coding"


ORF1AB_regions = [
    ("NSP1", 266, 805),
    ("NSP2", 806, 2719),
    ("NSP3", 2720, 8554),
    ("NSP4", 8555, 10054),
    ("NSP5", 10055, 10972),
    ("NSP6", 10973, 11842),
    ("NSP7", 11843, 12091),
    ("NSP8", 12092, 12685),
    ("NSP9", 12686, 13024),
    ("NSP10", 13025, 13441),
    ("NSP12", 13441, 16236),  # frameshift happens here
    ("NSP13", 16237, 18039),
    ("NSP14", 18040, 19620),
    ("NSP15", 19621, 20658),
    ("NSP16", 20659, 21552)
]

def get_ORF1AB_region(pos):
    for label, start, end in ORF1AB_regions:
        if start <= pos <= end:
            return label
    return ""


spike_subregions = [
    ("S1 - NTD - signal peptide", 21563, 21601),
    ("S1 - NTD", 21563, 22475),
    ("S1 - RBD", 22517, 23185),
    ("S1/2 cleavage site", 22517, 23185),
    ("S1 - other", 21563, 23617),
    ("S2 - Fusion peptide", 23924, 23980),
    ("S2 - HR1", 24296, 24514),
    ("HR2", 25049, 25201),
    ("TM domain", 25202, 25273),
    ("CTD", 25274, 25383),
    ("S2 - other", 23618, 25383)]

def get_spike_domain(pos):
    for label, start, end in spike_subregions:
        if start <= pos <= end:
            return label
    return ""


In [3]:
data_dir = Path("../data")
ts_file = data_dir / "sc2ts_viridian_v1.1.trees.tsz"
ts = tszip.load(ts_file)

In [4]:
pp_dir = Path("../arg_postprocessing")
events_file = pp_dir / "sc2ts_v1_2023-02-21_pr_pp_mp_aph_bps_pango_dated_deletion_events.csv"
events_df = pd.read_csv(events_file, index_col=0)
events_df["region"] = events_df["start"].apply(get_genome_region)
events_df["ORF1AB_region"] = events_df["start"].apply(get_ORF1AB_region)
events_df["spike_region"] = events_df["start"].apply(get_spike_domain)
events_df["pango"] = events_df["node"].apply(lambda x: ts.node(x).metadata["pango"])
events_df["deletion"] = events_df.apply(lambda row: tuple(range(row["start"], row["start"] + row["length"])), axis=1)
events_df = events_df.sort_values(by = "min_inheritors", ascending = False)
events_df

Unnamed: 0,start,node,length,max_inheritors,min_inheritors,region,ORF1AB_region,spike_region,pango,deletion
30086,28271,1436808,1,1118363,1118363,non-coding,,,B.1,"(28271,)"
27869,28248,220186,6,1114669,1114326,ORF8,,,B.1.617.2,"(28248, 28249, 28250, 28251, 28252, 28253)"
14712,22029,200039,6,1112410,1111939,S,,S1 - NTD,B.1.617.2,"(22029, 22030, 22031, 22032, 22033, 22034)"
8248,11288,1436802,4,871782,871762,ORF1AB,NSP6,,B.1.1.529,"(11288, 11289, 11290, 11291)"
8249,28362,1436802,9,871774,850272,N,,,B.1.1.529,"(28362, 28363, 28364, 28365, 28366, 28367, 283..."
...,...,...,...,...,...,...,...,...,...,...
43823,29541,909260,1,1,1,non-coding,,,AY.103,"(29541,)"
43822,29781,909255,1,1,1,non-coding,,,AY.103,"(29781,)"
43821,29779,909255,1,1,1,non-coding,,,AY.103,"(29779,)"
43820,29774,909255,1,1,1,non-coding,,,AY.103,"(29774,)"


In [5]:
major_events_df = events_df[events_df["max_inheritors"] > 10_000].reset_index(drop=True)
major_events_df["sc2ts_freq"] = np.round(major_events_df["max_inheritors"] / ts.num_samples * 100, decimals=4)
major_events_df

Unnamed: 0,start,node,length,max_inheritors,min_inheritors,region,ORF1AB_region,spike_region,pango,deletion,sc2ts_freq
0,28271,1436808,1,1118363,1118363,non-coding,,,B.1,"(28271,)",45.0561
1,28248,220186,6,1114669,1114326,ORF8,,,B.1.617.2,"(28248, 28249, 28250, 28251, 28252, 28253)",44.9073
2,22029,200039,6,1112410,1111939,S,,S1 - NTD,B.1.617.2,"(22029, 22030, 22031, 22032, 22033, 22034)",44.8163
3,11288,1436802,4,871782,871762,ORF1AB,NSP6,,B.1.1.529,"(11288, 11289, 11290, 11291)",35.122
4,28362,1436802,9,871774,850272,N,,,B.1.1.529,"(28362, 28363, 28364, 28365, 28366, 28367, 283...",35.1216
5,21633,822854,9,532072,531990,S,,S1 - NTD,BA.2,"(21633, 21634, 21635, 21636, 21637, 21638, 216...",21.4359
6,11292,822854,5,531894,531867,ORF1AB,NSP6,,BA.2,"(11292, 11293, 11294, 11295, 11296)",21.4287
7,6513,851246,3,340765,340751,ORF1AB,NSP3,,BA.1,"(6513, 6514, 6515)",13.7286
8,11283,851246,5,339867,339865,ORF1AB,NSP6,,BA.1,"(11283, 11284, 11285, 11286, 11287)",13.6924
9,21765,851246,6,339681,339472,S,,S1 - NTD,BA.1,"(21765, 21766, 21767, 21768, 21769, 21770)",13.6849


In [6]:
# https://figshare.com/articles/figure/Identifying_featured_indels_associated_with_SARS-CoV-2_fitness/21834957
tabs1_url = "https://figshare.com/ndownloader/files/40119535"
tabs1_df = pd.read_excel(tabs1_url)

# "In total, 31,642,407 deletion records and 1,981,308 insertion records were detected 
# in 9,149,680 filtered genomic sequences after removing indel records in 5′ and 3′ untranslated regions (UTR), and 
# there were 26,765 different types of deletions and 21,054 different types of insertions (Table S1)."
num_samples_li = 9149680
tabs1_df["Frequency"] = np.round(tabs1_df["Count"] / num_samples_li * 100, decimals=4)
tabs1_df.head(5)

Unnamed: 0,Name,Start,Length,Count,Gene,Frameshift/Non-frameshift,Frequency
0,Del_10000_1,10000,1,1,ORF1ab-nsp4,frameshift,0.0
1,Del_10005_2,10005,2,1,ORF1ab-nsp4,frameshift,0.0
2,Del_10007_1,10007,1,4,ORF1ab-nsp4,frameshift,0.0
3,Del_10008_3,10008,3,1,ORF1ab-nsp4,non-frameshift,0.0
4,Del_10011_1,10011,1,3,ORF1ab-nsp4,frameshift,0.0


In [7]:
major_events_df = major_events_df.merge(
    tabs1_df,
    how="left",
    left_on=["start", "length"],
    right_on=["Start", "Length"],
)[[
    "start", "length",
    "region", "ORF1AB_region", "spike_region",
    "node", "pango",
    "max_inheritors", "sc2ts_freq",
    "Count", "Frequency",
]]

In [8]:
latex_table_major_dels = r'\begin{table}' + "\n" + \
r'\caption{Major deletion events.}' + "\n" + \
r'\label{tab:major_dels}' + "\n" + \
r'\begin{tabular}{rrcrcrrrr}' + "\n" + \
r'\centering' + "\n" + \
r'\toprule & \multicolumn{6}{c}{Sc2ts ARG} & \multicolumn{2}{c}{Li et al. (2023)} \\' + "\n" + \
r'\cmidrule(lr){1-7} \cmidrule(lr){8-9}' + "\n" + \
r'Start & Length & Region & Node & Pango & Count & Frequency (\%) & Count & Frequency (\%)\\' + "\n" + \
r'\midrule' + "\n"

for row in major_events_df.itertuples():
    sc2ts_count = "N/A" if np.isnan(row.max_inheritors) else f"{int(row.max_inheritors)}"
    sc2ts_freq = "N/A" if np.isnan(row.sc2ts_freq) else f"{row.sc2ts_freq:.4f}"
    li_count = "N/A" if np.isnan(row.Count) else f"{int(row.Count)}"
    li_freq = "N/A" if np.isnan(row.Frequency) else f"{row.Frequency:.4f}"
    region = row.region + " / " + row.ORF1AB_region if row.ORF1AB_region != "" else row.region
    region += " / " + row.spike_region if row.spike_region != "" else ""
    latex_table_major_dels += f"{row.start} & {row.length} & {region} & {row.node} & {row.pango} & " + \
        f"{sc2ts_count} & {sc2ts_freq} & " + \
        f"{li_count} & {li_freq}" + r' \\' + "\n"

latex_table_major_dels += r'\bottomrule' + "\n" + \
r'\end{tabular}' + "\n" + \
r'\end{table}'

print(latex_table_major_dels)

\begin{table}
\caption{Major deletion events.}
\label{tab:major_dels}
\begin{tabular}{rrcrcrrrr}
\centering
\toprule & \multicolumn{6}{c}{Sc2ts ARG} & \multicolumn{2}{c}{Li et al. (2023)} \\
\cmidrule(lr){1-7} \cmidrule(lr){8-9}
Start & Length & Region & Node & Pango & Count & Frequency (\%) & Count & Frequency (\%)\\
\midrule
28271 & 1 & non-coding & 1436808 & B.1 & 1118363 & 45.0561 & 3853692 & 42.1183 \\
28248 & 6 & ORF8 & 220186 & B.1.617.2 & 1114669 & 44.9073 & 3026186 & 33.0742 \\
22029 & 6 & S / S1 - NTD & 200039 & B.1.617.2 & 1112410 & 44.8163 & 3010665 & 32.9046 \\
11288 & 4 & ORF1AB / NSP6 & 1436802 & B.1.1.529 & 871782 & 35.1220 & N/A & N/A \\
28362 & 9 & N & 1436802 & B.1.1.529 & 871774 & 35.1216 & 3939164 & 43.0525 \\
21633 & 9 & S / S1 - NTD & 822854 & BA.2 & 532072 & 21.4359 & 1941962 & 21.2244 \\
11292 & 5 & ORF1AB / NSP6 & 822854 & BA.2 & 531894 & 21.4287 & N/A & N/A \\
6513 & 3 & ORF1AB / NSP3 & 851246 & BA.1 & 340765 & 13.7286 & 1984683 & 21.6913 \\
11283 & 5 & ORF1A

Table 1: Deletion events which have more than 10k inheritors. Of these 17 deletions, eight are located in the N-terminal domain of Spike, a previously noted site for key deletions. Many of these nodes represent the origins of major lineages, for example the 851246 node which represents the origin of BA.1, and the 86456 node which represents the origin of Alpha (see Table 2).

In [9]:
alpha_dels_df = events_df[events_df["node"] == 86456]
alpha_dels_df

Unnamed: 0,start,node,length,max_inheritors,min_inheritors,region,ORF1AB_region,spike_region,pango,deletion
8218,28271,86456,1,290446,290446,non-coding,,,B.1.1.7,"(28271,)"
8215,11288,86456,9,290408,290402,ORF1AB,NSP6,,B.1.1.7,"(11288, 11289, 11290, 11291, 11292, 11293, 112..."
8217,21991,86456,3,288831,288192,S,,S1 - NTD,B.1.1.7,"(21991, 21992, 21993)"
8216,21765,86456,6,290438,287464,S,,S1 - NTD,B.1.1.7,"(21765, 21766, 21767, 21768, 21769, 21770)"


In [10]:
latex_table_alpha_dels = r'\begin{table}' + "\n" + \
r'\caption{Deletion events associated with the Alpha variant.}' + "\n" + \
r'\label{tab:alpha_dels}' + "\n" + \
r'\centering' + "\n" + \
r'\begin{tabular}{rrrrr}' + "\n" + \
r'\toprule' + "\n" + \
r'Start & Length & Region & Count & Frequency (\%)\\' + "\n" + \
r'\midrule' + "\n"

for row in alpha_dels_df.itertuples():
    region = row.region + " / " + row.ORF1AB_region if row.ORF1AB_region != "" else row.region
    region += " / " + row.spike_region if row.spike_region != "" else ""
    sc2ts_count = f"{int(row.max_inheritors)}"
    sc2ts_freq = f"{row.max_inheritors / ts.num_samples * 100:.4f}"
    latex_table_alpha_dels += f"{row.start} & {row.length} & {region} & " + \
        f"{sc2ts_count} & {sc2ts_freq}" + r' \\' + "\n"

latex_table_alpha_dels += r'\bottomrule' + "\n" + \
r'\end{tabular}' + "\n" + \
r'\end{table}'

print(latex_table_alpha_dels)

\begin{table}
\caption{Deletion events associated with the Alpha variant.}
\label{tab:alpha_dels}
\centering
\begin{tabular}{rrrrr}
\toprule
Start & Length & Region & Count & Frequency (\%)\\
\midrule
28271 & 1 & non-coding & 290446 & 11.7014 \\
11288 & 9 & ORF1AB / NSP6 & 290408 & 11.6998 \\
21991 & 3 & S / S1 - NTD & 288831 & 11.6363 \\
21765 & 6 & S / S1 - NTD & 290438 & 11.7010 \\
\bottomrule
\end{tabular}
\end{table}


Table 2: Deletions occuring on the node leading to the alpha variant. Two of these deletions are in the N-terminal domain of spike, with a further one in the non-coding region located between ORF8 and N, and another in NSP1. 

In [11]:
rec_dels_df = events_df[["start", "length", "region", "ORF1AB_region", "spike_region"]]\
    .value_counts()\
    .reset_index(name="occurrences")[:20]
rec_dels_df

Unnamed: 0,start,length,region,ORF1AB_region,spike_region,occurrences
0,29781,1,non-coding,,,12713
1,29762,1,non-coding,,,11356
2,29779,1,non-coding,,,10925
3,29769,1,non-coding,,,10783
4,29774,1,non-coding,,,10707
5,29700,1,non-coding,,,8543
6,29614,1,ORF10,,,5041
7,29555,1,non-coding,,,4816
8,29543,1,non-coding,,,4806
9,29541,1,non-coding,,,4804


In [12]:
latex_table_rec_dels = r'\begin{table}' + "\n" + \
r'\caption{Highly recurrent deletions.}' + "\n" + \
r'\label{tab:rec_dels}' + "\n" + \
r'\centering' + "\n" + \
r'\begin{tabular}{rrcrr}' + "\n" + \
r'\toprule' + "\n" + \
r'Start & Length & Region & Occurrences \\' + "\n" + \
r'\midrule' + "\n"

for row in rec_dels_df.itertuples():
    region = row.region + " / " + row.ORF1AB_region if row.ORF1AB_region != "" else row.region
    region += " / " + row.spike_region if row.spike_region != "" else ""
    latex_table_rec_dels += f"{row.start} & {row.length} & {region} & " + \
        f"{row.occurrences}" + r' \\' + "\n"

latex_table_rec_dels += r'\bottomrule' + "\n" + \
r'\end{tabular}' + "\n" + \
r'\end{table}'

print(latex_table_rec_dels)

\begin{table}
\caption{Highly recurrent deletions.}
\label{tab:rec_dels}
\centering
\begin{tabular}{rrcrr}
\toprule
Start & Length & Region & Occurrences \\
\midrule
29781 & 1 & non-coding & 12713 \\
29762 & 1 & non-coding & 11356 \\
29779 & 1 & non-coding & 10925 \\
29769 & 1 & non-coding & 10783 \\
29774 & 1 & non-coding & 10707 \\
29700 & 1 & non-coding & 8543 \\
29614 & 1 & ORF10 & 5041 \\
29555 & 1 & non-coding & 4816 \\
29543 & 1 & non-coding & 4806 \\
29541 & 1 & non-coding & 4804 \\
203 & 1 & non-coding & 3602 \\
222 & 1 & non-coding & 3441 \\
22013 & 6 & S / S1 - NTD & 2111 \\
21991 & 3 & S / S1 - NTD & 2087 \\
28254 & 1 & ORF8 & 1967 \\
27556 & 64 & ORF7a & 1281 \\
27579 & 3 & ORF7a & 1014 \\
28093 & 1 & ORF8 & 898 \\
21765 & 6 & S / S1 - NTD & 478 \\
27532 & 1 & ORF7a & 403 \\
\bottomrule
\end{tabular}
\end{table}


Table 3: The most recurrent deletions in the ARG. The majority of these highly recurent deletions are located in the non-coding region, but some are in coding regions such as 21991-21993 and 22013-22018 in the Spike protein. The 898 2bp deletion at position 21765 seems to be artifactual as it overwhelmingly occurs in the alpha variant, which has a longer 21765-21770 deletion as a defining deletion occuring in the node originating the lineage (node 86456; table 2).