In [7]:
import pandas as pd
import tszip

In [2]:
# Top-level genome annotation as (label, first position, last position).
# Both ends are inclusive. Entries may overlap (ORF7a / ORF7b share
# 27756-27759); lookups resolve an overlap to whichever entry is listed first.
regions = [
    ("ORF1AB", 266, 21555),
    ("S", 21563, 25384),
    ("ORF3", 25393, 26220),
    ("E", 26245, 26472),
    ("M", 26523, 27191),
    ("ORF6", 27202, 27387),
    ("ORF7a", 27394, 27759),
    ("ORF7b", 27756, 27887),
    ("ORF8", 27894, 28259),
    ("N", 28274, 29533),
    ("ORF10", 29558, 29674),
]


def get_genome_region(pos):
    """Return the label of the first entry in ``regions`` that contains ``pos``.

    Parameters
    ----------
    pos : int
        Genome coordinate to classify.

    Returns
    -------
    str
        The matching region label, or ``"non-coding"`` when ``pos`` falls
        outside every listed interval.
    """
    matching_labels = (
        name for name, first, last in regions if first <= pos <= last
    )
    return next(matching_labels, "non-coding")

# Sub-annotation of ORF1AB as (label, first position, last position), both
# ends inclusive. There is no NSP11 entry. Position 13441 is covered by both
# NSP10 and NSP12; the first-match lookup below assigns it to NSP10.
ORF1AB_regions = [
    ("NSP1", 266, 805),
    ("NSP2", 806, 2719),
    ("NSP3", 2720, 8554),
    ("NSP4", 8555, 10054),
    ("NSP5", 10055, 10972),
    ("NSP6", 10973, 11842),
    ("NSP7", 11843, 12091),
    ("NSP8", 12092, 12685),
    ("NSP9", 12686, 13024),
    ("NSP10", 13025, 13441),
    ("NSP12", 13441, 16236),  # frameshift happens here
    ("NSP13", 16237, 18039),
    ("NSP14", 18040, 19620),
    ("NSP15", 19621, 20658),
    ("NSP16", 20659, 21552),
]


def get_ORF1AB_region(pos):
    """Return the label of the first ``ORF1AB_regions`` entry containing ``pos``.

    Parameters
    ----------
    pos : int
        Genome coordinate to classify.

    Returns
    -------
    str
        The matching label, or the empty string when ``pos`` lies outside
        every listed interval.
    """
    hits = [name for name, first, last in ORF1AB_regions if first <= pos <= last]
    # Only the earliest-listed hit counts, mirroring a first-match scan.
    return hits[0] if hits else ""

# Sub-annotation of the Spike gene as (label, first position, last position),
# both ends inclusive. Order is significant: the lookup returns the FIRST
# matching entry, so specific features must be listed before the broader
# "S1 - other" / "S2 - other" catch-alls that contain them.
spike_subregions = [
    ("S1 - NTD - signal peptide", 21563, 21601),
    ("S1 - NTD", 21563, 22475),
    ("S1 - RBD", 22517, 23185),
    # This entry previously repeated the RBD coordinates (22517-23185) and,
    # being listed after the RBD, could never be returned. It now covers the
    # last five codons of S1 (residues 681-685 counted from the S start at
    # 21563), ending at 23617 where "S1 - other" ends and "S2 - other" begins.
    # NOTE(review): confirm this is the intended extent of the cleavage site.
    ("S1/2 cleavage site", 23603, 23617),
    ("S1 - other", 21563, 23617),
    ("S2 - Fusion peptide", 23924, 23980),
    ("S2 = HR1", 24296, 24514),
    ("HR2", 25049, 25201),
    ("TM domain", 25202, 25273),
    ("CTD", 25274, 25383),
    ("S2 - other", 23618, 25383),
]


def get_spike_domain(pos):
    """Return the label of the first ``spike_subregions`` entry containing ``pos``.

    Parameters
    ----------
    pos : int
        Genome coordinate to classify.

    Returns
    -------
    str
        The matching sub-region label, or the empty string when ``pos`` lies
        outside every listed interval.
    """
    for label, start, end in spike_subregions:
        if start <= pos <= end:
            return label
    return ""

In [3]:

# Tree sequence; used in this cell only to look up each node's "pango" metadata.
ts = tszip.load("sc2ts_v1_2023-02-21_pp_dels_bps_pango_dated_mmps.trees.tsz")

# Load the deletion events, annotate each one by its start coordinate, attach
# the Pango label of its node and the tuple of deleted positions, then sort by
# `min_inheritors`, largest first.
# NOTE(review): the tables displayed further down contain an `in_frame` column
# that this cell does not create -- presumably it is already present in the CSV
# or was added by a cell that no longer exists; confirm on a fresh kernel.
ARG_deletion_events = (
    pd.read_csv(
        'sc2ts_v1_2023-02-21_pp_dated_remapped_deletion_events.csv',
        index_col=0,
    )
    .assign(
        region=lambda events: events["start"].map(get_genome_region),
        ORF1AB_region=lambda events: events["start"].map(get_ORF1AB_region),
        spike_region=lambda events: events["start"].map(get_spike_domain),
        pango=lambda events: events["node"].map(
            lambda node_id: ts.node(node_id).metadata["pango"]
        ),
        # One tuple per event: start, start + 1, ..., start + length - 1.
        deletion=lambda events: pd.Series(
            [
                tuple(range(first, first + size))
                for first, size in zip(events["start"], events["length"])
            ],
            index=events.index,
        ),
    )
    .sort_values(by="min_inheritors", ascending=False)
)

In [23]:
# Events with `max_inheritors` above 10000; reset_index() turns the existing
# row labels into an "index" column and renumbers the rows from 0.
(
    ARG_deletion_events
    .loc[lambda events: events["max_inheritors"] > 10000]
    .reset_index()
)

Unnamed: 0,index,start,node,length,max_inheritors,min_inheritors,region,ORF1AB_region,spike_region,in_frame,pango,deletion
0,43175,28271,166394,1,1118363,1118363,non-coding,,,False,B.1.617,"(28271,)"
1,39796,28248,220186,6,1114666,1114283,ORF8,,,True,B.1.617.2,"(28248, 28249, 28250, 28251, 28252, 28253)"
2,20180,22029,200039,6,1112391,1111861,S,,S1 - NTD,True,B.1.617.2,"(22029, 22030, 22031, 22032, 22033, 22034)"
3,9396,28362,851246,9,871774,850212,N,,,True,BA.1,"(28362, 28363, 28364, 28365, 28366, 28367, 283..."
4,10059,21633,822854,9,532028,531938,S,,S1 - NTD,True,BA.2,"(21633, 21634, 21635, 21636, 21637, 21638, 216..."
5,10058,11288,822854,9,530759,530716,ORF1AB,NSP6,,True,BA.2,"(11288, 11289, 11290, 11291, 11292, 11293, 112..."
6,9392,11283,851246,9,341023,339863,ORF1AB,NSP6,,True,BA.1,"(11283, 11284, 11285, 11286, 11287, 11288, 112..."
7,9393,21765,851246,6,339680,339462,S,,S1 - NTD,True,BA.1,"(21765, 21766, 21767, 21768, 21769, 21770)"
8,9394,21988,851246,8,339654,339379,S,,S1 - NTD,False,BA.1,"(21988, 21989, 21990, 21991, 21992, 21993, 219..."
9,9391,6513,851246,3,339251,339242,ORF1AB,NSP3,,True,BA.1,"(6513, 6514, 6515)"


Table 1: Deletion events which have more than 10k inheritors. Of these 17 deletions, eight are located in the N-terminal domain of Spike, a previously noted site for key deletions. Many of these nodes represent the origins of major lineages, for example the 851246 node which represents the origin of BA.1, and the 86456 node which represents the origin of Alpha (see table 2). 

In [21]:
# All deletion events recorded on node 86456.
ARG_deletion_events.loc[lambda events: events["node"] == 86456]

Unnamed: 0,start,node,length,max_inheritors,min_inheritors,region,ORF1AB_region,spike_region,in_frame,pango,deletion
10005,28271,86456,1,290446,290446,non-coding,,,False,B.1.1.7,"(28271,)"
10002,11288,86456,9,290408,290401,ORF1AB,NSP6,,True,B.1.1.7,"(11288, 11289, 11290, 11291, 11292, 11293, 112..."
10004,21991,86456,3,288763,288118,S,,S1 - NTD,True,B.1.1.7,"(21991, 21992, 21993)"
10003,21765,86456,6,290399,286828,S,,S1 - NTD,True,B.1.1.7,"(21765, 21766, 21767, 21768, 21769, 21770)"


Table 2: Deletions occurring on the node leading to the Alpha variant. Two of these deletions are in the N-terminal domain of Spike, a further one is in the non-coding region located between ORF8 and N, and another is in NSP6.

In [20]:
# Count how often each (start, length, region) combination occurs and show the
# 20 most frequent combinations.
(
    ARG_deletion_events[["start", "length", "region"]]
    .value_counts()
    .reset_index(name="number of occurrences")
    .head(20)
)

Unnamed: 0,start,length,region,number of occurrences
0,29781,1,non-coding,20005
1,29762,1,non-coding,18098
2,29779,1,non-coding,16643
3,29769,1,non-coding,16411
4,29774,1,non-coding,16302
5,29700,1,non-coding,12982
6,29614,1,ORF10,6608
7,29555,1,non-coding,6319
8,29543,1,non-coding,6304
9,29541,1,non-coding,6302


Table 3: The most recurrent deletions in the ARG. The majority of these highly recurrent deletions are located in the non-coding region, but some are in coding regions, such as 21991-21993 and 22013-22018 in the Spike protein. The 898 occurrences of the 2 bp deletion at position 21765 appear to be artifactual, as they occur overwhelmingly in the Alpha variant, which has the longer 21765-21770 deletion as a defining deletion arising at the node that originates the lineage (node 86456; see Table 2).