In [1]:
import numpy as np
import pandas as pd
import tszip

In [2]:
# Load the sc2ts deletion-events table. Later cells match on its `start` and
# `length` columns and read `max_inheritors`.
events_file = "../arg_postprocessing/sc2ts_v1_2023-02-21_pr_pp_mp_aph_bps_pango_dated_deletion_events.csv"
sc2ts_df = pd.read_csv(filepath_or_buffer=events_file)
sc2ts_df

Unnamed: 0.1,Unnamed: 0,start,node,length,max_inheritors,min_inheritors
0,0,203,13,1,1,1
1,1,222,13,1,1,1
2,2,203,5464,1,1,1
3,3,222,5464,1,1,1
4,4,203,6906,1,1,1
...,...,...,...,...,...,...
114147,114147,29781,2734055,1,5,5
114148,114148,29781,2738049,1,2,2
114149,114149,29781,2738855,1,2,2
114150,114150,29781,2743324,1,4,4


In [3]:
# Decompress the sc2ts tree sequence and record its sample count, which is
# used further down as the denominator for the sc2ts frequencies.
# NOTE(review): the events CSV loaded above is named "sc2ts_v1_2023-02-21..."
# while this file is "sc2ts_viridian_v1.1" -- confirm both come from the same
# ARG, otherwise the sample count is not the right denominator.
ts_file = "../data/sc2ts_viridian_v1.1.trees.tsz"
ts = tszip.decompress(ts_file)
num_samples_sc2ts = ts.num_samples
num_samples_sc2ts

2482157

In [4]:
# Indel table downloaded directly from figshare (requires network access).
# Presumably Table S1 of Li et al. (2023), judging by the names used in this
# notebook -- confirm against the record below.
# https://figshare.com/articles/figure/Identifying_featured_indels_associated_with_SARS-CoV-2_fitness/21834957
tabs1_url = "https://figshare.com/ndownloader/files/40119535"
tabs1_df = pd.read_excel(io=tabs1_url)
tabs1_df

Unnamed: 0,Name,Start,Length,Count,Gene,Frameshift/Non-frameshift
0,Del_10000_1,10000,1,1,ORF1ab-nsp4,frameshift
1,Del_10005_2,10005,2,1,ORF1ab-nsp4,frameshift
2,Del_10007_1,10007,1,4,ORF1ab-nsp4,frameshift
3,Del_10008_3,10008,3,1,ORF1ab-nsp4,non-frameshift
4,Del_10011_1,10011,1,3,ORF1ab-nsp4,frameshift
...,...,...,...,...,...,...
26760,Del_999_3,999,3,2,ORF1ab-nsp2,non-frameshift
26761,Del_9990_1,9990,1,1,ORF1ab-nsp4,frameshift
26762,Del_9992_1,9992,1,1,ORF1ab-nsp4,frameshift
26763,Del_9997_1,9997,1,1,ORF1ab-nsp4,frameshift


In [5]:
# Denominator for the Li et al. frequencies, taken from the paper's text:
# "In total, 31,642,407 deletion records and 1,981,308 insertion records were detected
# in 9,149,680 filtered genomic sequences after removing indel records in 5′ and 3′ untranslated regions (UTR), and
# there were 26,765 different types of deletions and 21,054 different types of insertions (Table S1)."
num_samples_li = 9_149_680

# Express each Count as a percentage of that total, rounded to 4 decimals.
li_percentage = tabs1_df["Count"] / num_samples_li * 100
tabs1_df["Frequency"] = li_percentage.round(decimals=4)
tabs1_df.head(5)

Unnamed: 0,Name,Start,Length,Count,Gene,Frameshift/Non-frameshift,Frequency
0,Del_10000_1,10000,1,1,ORF1ab-nsp4,frameshift,0.0
1,Del_10005_2,10005,2,1,ORF1ab-nsp4,frameshift,0.0
2,Del_10007_1,10007,1,4,ORF1ab-nsp4,frameshift,0.0
3,Del_10008_3,10008,3,1,ORF1ab-nsp4,non-frameshift,0.0
4,Del_10011_1,10011,1,3,ORF1ab-nsp4,frameshift,0.0


In [6]:
min_freq = 1    # Percentage

# Keep only indels at or above the threshold, most frequent (by Count) first,
# and renumber the rows from zero.
is_frequent = tabs1_df["Frequency"] >= min_freq
tabs1_df = (
    tabs1_df.loc[is_frequent]
    .sort_values(by="Count", ascending=False)
    .reset_index(drop=True)
)
tabs1_df

Unnamed: 0,Name,Start,Length,Count,Gene,Frameshift/Non-frameshift,Frequency
0,Del_28362_9,28362,9,3939164,N,non-frameshift,43.0525
1,Del_28271_1,28271,1,3853692,Intergenic,frameshift,42.1183
2,Del_21765_6,21765,6,3179887,S,non-frameshift,34.7541
3,Del_11288_9,11288,9,3040576,ORF1ab-nsp6,non-frameshift,33.2315
4,Del_28248_6,28248,6,3026186,ORF8,non-frameshift,33.0742
5,Del_22029_6,22029,6,3010665,S,non-frameshift,32.9046
6,Del_11283_9,11283,9,2011934,ORF1ab-nsp6,non-frameshift,21.9891
7,Del_6513_3,6513,3,1984683,ORF1ab-nsp3,non-frameshift,21.6913
8,Del_21633_9,21633,9,1941962,S,non-frameshift,21.2244
9,Del_21987_9,21987,9,1918094,S,non-frameshift,20.9635


In [7]:
# For every deletion in tabs1_df, find the largest `max_inheritors` among the
# sc2ts deletion events with the same start position and length.
#
# The per-(start, length) maxima are computed in a single groupby pass rather
# than by re-filtering the whole events table once per row. Deletions with no
# matching sc2ts event come out of the reindex as NaN, which is what the
# downstream cells expect for "not found".
sc2ts_max_by_deletion = sc2ts_df.groupby(["start", "length"])["max_inheritors"].max()

# reindex matches on the (start, length) values; the differing level names
# (Start/Length vs start/length) do not matter.
deletion_keys = pd.MultiIndex.from_frame(tabs1_df[["Start", "Length"]])
max_inheritors = sc2ts_max_by_deletion.reindex(deletion_keys).to_list()

# One entry per tabs1_df row, in row order.
assert len(max_inheritors) == len(tabs1_df)

In [8]:
# Attach the sc2ts counts next to the Li et al. counts (NaN where no matching
# deletion was found) and express them as a percentage of the sc2ts samples.
tabs1_df["sc2t_max_inheritors"] = np.array(max_inheritors)
sc2t_percentage = tabs1_df["sc2t_max_inheritors"] / num_samples_sc2ts * 100
tabs1_df["sc2t_frequency"] = sc2t_percentage.round(decimals=4)

comparison_columns = [
    "Name", "Start", "Length", "Gene",
    "Count", "Frequency",
    "sc2t_max_inheritors", "sc2t_frequency",
]
tabs1_df[comparison_columns]

Unnamed: 0,Name,Start,Length,Gene,Count,Frequency,sc2t_max_inheritors,sc2t_frequency
0,Del_28362_9,28362,9,N,3939164,43.0525,871774.0,35.1216
1,Del_28271_1,28271,1,Intergenic,3853692,42.1183,1118363.0,45.0561
2,Del_21765_6,21765,6,S,3179887,34.7541,339681.0,13.6849
3,Del_11288_9,11288,9,ORF1ab-nsp6,3040576,33.2315,290408.0,11.6998
4,Del_28248_6,28248,6,ORF8,3026186,33.0742,1114669.0,44.9073
5,Del_22029_6,22029,6,S,3010665,32.9046,1112410.0,44.8163
6,Del_11283_9,11283,9,ORF1ab-nsp6,2011934,21.9891,40.0,0.0016
7,Del_6513_3,6513,3,ORF1ab-nsp3,1984683,21.6913,340765.0,13.7286
8,Del_21633_9,21633,9,S,1941962,21.2244,532072.0,21.4359
9,Del_21987_9,21987,9,S,1918094,20.9635,,


In [9]:
# Render tabs1_df as a LaTeX table: the Li et al. columns followed by the
# sc2ts columns, one table row per deletion. The table is assembled as a list
# of lines and joined once at the end.
table_lines = [
    r'\begin{table}',
    r'\caption{Major indels.}',
    r'\label{tab:indels}',
    r'\begin{tabular}{rrcrrrr}',
    r'\toprule & \multicolumn{4}{c}{Li et al. (2023)} & \multicolumn{2}{c}{Sc2ts ARG} \\',
    r'\cmidrule(lr){1-5} \cmidrule(lr){6-7}',
    r'Start & Length & Region & Count & Frequency (\%) & Count & Frequency (\%)\\',
    r'\midrule',
]

for rec in tabs1_df.itertuples():
    # Deletions with no sc2ts match carry NaN; show those as "N/A".
    if np.isnan(rec.sc2t_max_inheritors):
        sc2t_count = "N/A"
    else:
        sc2t_count = f"{int(rec.sc2t_max_inheritors)}"
    if np.isnan(rec.sc2t_frequency):
        sc2t_freq = "N/A"
    else:
        sc2t_freq = f"{rec.sc2t_frequency:.4f}"

    cells = [
        f"{rec.Start}",
        f"{rec.Length}",
        f"{rec.Gene}",
        f"{rec.Count}",
        f"{rec.Frequency:.4f}",
        sc2t_count,
        sc2t_freq,
    ]
    table_lines.append(" & ".join(cells) + r' \\')

table_lines += [
    r'\bottomrule',
    r'\end{tabular}',
    r'\end{table}',
]

latex_table = "\n".join(table_lines)

print(latex_table)

\begin{table}
\caption{Major indels.}
\label{tab:indels}
\begin{tabular}{rrcrrrr}
\toprule & \multicolumn{4}{c}{Li et al. (2023)} & \multicolumn{2}{c}{Sc2ts ARG} \\
\cmidrule(lr){1-5} \cmidrule(lr){6-7}
Start & Length & Region & Count & Frequency (\%) & Count & Frequency (\%)\\
\midrule
28362 & 9 & N & 3939164 & 43.0525 & 871774 & 35.1216 \\
28271 & 1 & Intergenic & 3853692 & 42.1183 & 1118363 & 45.0561 \\
21765 & 6 & S & 3179887 & 34.7541 & 339681 & 13.6849 \\
11288 & 9 & ORF1ab-nsp6 & 3040576 & 33.2315 & 290408 & 11.6998 \\
28248 & 6 & ORF8 & 3026186 & 33.0742 & 1114669 & 44.9073 \\
22029 & 6 & S & 3010665 & 32.9046 & 1112410 & 44.8163 \\
11283 & 9 & ORF1ab-nsp6 & 2011934 & 21.9891 & 40 & 0.0016 \\
6513 & 3 & ORF1ab-nsp3 & 1984683 & 21.6913 & 340765 & 13.7286 \\
21633 & 9 & S & 1941962 & 21.2244 & 532072 & 21.4359 \\
21987 & 9 & S & 1918094 & 20.9635 & N/A & N/A \\
22194 & 3 & S & 1833050 & 20.0340 & 312706 & 12.5982 \\
21991 & 3 & S & 912944 & 9.9779 & 288831 & 11.6363 \\
686 & 9 & 