# Data for the summary tables in the text

In [1]:
import collections
import pathlib
import sys
import tempfile

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Markdown

import tskit
import tszip

sys.path.append("../../sc2ts/")
import sc2ts.utils

pd.set_option("display.precision", 2)


In [2]:
# Locations of the two tszip-compressed tree sequence files summarised in this
# notebook. Both are opened with tszip.decompress in the cells below and are
# also passed to the %%bash cells in the "File sizes" section.
# "Long" ARG: file name is dated 2022-06-30.
ts_long_path = "../data/upgma-mds-1000-md-30-mm-3-2022-06-30-recinfo-gisaid-il.ts.tsz"
# "Wide" ARG: file name is dated 2021-06-30.
ts_wide_path = "../data/upgma-full-md-30-mm-3-2021-06-30-recinfo-gisaid-il.ts.tsz"

In [3]:
%%time
ts_long = tszip.decompress(ts_long_path)
ts_long

CPU times: user 572 ms, sys: 342 ms, total: 914 ms
Wall time: 568 ms


Tree Sequence,Unnamed: 1
Trees,958
Sequence Length,29904.0
Time Units,days
Sample Nodes,657239
Total Size,494.0 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,785539,24.0 MiB,
Individuals,0,24 Bytes,
Migrations,0,8 Bytes,
Mutations,1062072,40.2 MiB,✅
Nodes,783231,422.5 MiB,✅
Populations,0,8 Bytes,
Provenances,2,1.8 KiB,
Sites,29422,1.3 MiB,✅


In [4]:
%%time
ts_wide = tszip.decompress(ts_wide_path)
ts_wide

CPU times: user 865 ms, sys: 544 ms, total: 1.41 s
Wall time: 914 ms


Tree Sequence,Unnamed: 1
Trees,1496
Sequence Length,29904.0
Time Units,days
Sample Nodes,1265685
Total Size,910.7 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,1458146,44.5 MiB,
Individuals,0,24 Bytes,
Migrations,0,8 Bytes,
Mutations,1213193,45.8 MiB,✅
Nodes,1453347,807.9 MiB,✅
Populations,0,8 Bytes,
Provenances,1,874 Bytes,
Sites,29422,1.4 MiB,✅


In [5]:
ti_long = sc2ts.utils.TreeInfo(ts_long)
ti_long

Counting descendants : 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 783231/783231 [00:00<00:00, 2164595.32it/s]
Indexing metadata    : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 783231/783231 [00:12<00:00, 60920.06it/s]
Classifying mutations: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1062072/1062072 [00:09<00:00, 107883.19it/s]


Unnamed: 0,property,value
0,latest_sample,2022-06-30
1,max_submission_delay,29 days
2,samples,657239
3,nodes,783231
4,mc_nodes,37749
5,pr_nodes,34358
6,re_nodes,2078
7,recombinants,2078
8,mutations,1062072
9,recurrent,50099


In [6]:
ti_wide = sc2ts.utils.TreeInfo(ts_wide)
ti_wide

Counting descendants : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1453347/1453347 [00:00<00:00, 2272062.14it/s]
Indexing metadata    : 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1453347/1453347 [00:23<00:00, 61115.10it/s]
Classifying mutations: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1213193/1213193 [00:11<00:00, 103600.86it/s]


Unnamed: 0,property,value
0,latest_sample,2021-06-30
1,max_submission_delay,29 days
2,samples,1265685
3,nodes,1453347
4,mc_nodes,40292
5,pr_nodes,40538
6,re_nodes,4123
7,recombinants,4123
8,mutations,1213193
9,recurrent,74719


# Node summaries

In [7]:
# Row label -> node flags value, shared by summarise_nodes and edge_summary.
# Both functions compare flags with ``==``, so each entry selects nodes whose
# flags are exactly this value. Insertion order is the row order of the
# resulting tables.
node_type_map = {
    "Sample": tskit.NODE_IS_SAMPLE,
    # Nodes whose flags value is exactly 0 (no flag bits set).
    "Sample cluster": 0,
    "Reversion Push": sc2ts.NODE_IS_REVERSION_PUSH,
    # Displayed as "Mutation Collapse"; the sc2ts constant is named
    # NODE_IS_MUTATION_OVERLAP.
    "Mutation Collapse": sc2ts.NODE_IS_MUTATION_OVERLAP,
    "Recombinant": sc2ts.NODE_IS_RECOMBINANT,
}


def summarise_nodes(ts):
    """
    Tabulate how many nodes of each type the tree sequence contains.

    The first row is the overall node count (percentage 100). It is followed
    by one row per entry of ``node_type_map``, giving the number of nodes
    whose flags are *exactly equal* to that entry's value and the share of
    all nodes this represents, rounded to 2 decimal places. Because the
    comparison is an equality and not a bit test, a node carrying a
    combination of flags is not counted in any of the per-type rows.

    :param ts: Object exposing ``num_nodes`` and ``nodes_flags``.
    :return: ``pandas.DataFrame`` with columns ``property``, ``count`` and
        ``percentage``.
    """
    total_nodes = ts.num_nodes
    node_flags = ts.nodes_flags

    rows = [{"property": "total", "count": total_nodes, "percentage": 100}]
    for label, flag_value in node_type_map.items():
        matching = np.sum(node_flags == flag_value)
        share = round((matching / total_nodes) * 100, 2)
        rows.append({"property": label, "count": matching, "percentage": share})
    return pd.DataFrame(rows)
        
display(Markdown("## Wide ARG"))
summarise_nodes(ts_wide)


## Wide ARG

Unnamed: 0,property,count,percentage
0,total,1453347,100.0
1,Sample,1265685,87.09
2,Sample cluster,102709,7.07
3,Reversion Push,40538,2.79
4,Mutation Collapse,40292,2.77
5,Recombinant,4123,0.28


In [8]:

display(Markdown("## Long ARG"))
summarise_nodes(ts_long)

## Long ARG

Unnamed: 0,property,count,percentage
0,total,783231,100.0
1,Sample,657239,83.91
2,Sample cluster,51807,6.61
3,Reversion Push,34358,4.39
4,Mutation Collapse,37749,4.82
5,Recombinant,2078,0.27


# Mutations

In [9]:
def mutation_summary(ts):
    """
    Return the mutation count and the mean number of mutations per node.

    :param ts: Object exposing ``num_mutations`` and ``num_nodes``.
    :return: Tuple ``(num_mutations, mutations_per_node)`` where the second
        element is ``num_mutations / num_nodes`` rounded to 2 decimal places.
    """
    total_mutations = ts.num_mutations
    per_node = round(total_mutations / ts.num_nodes, 2)
    return total_mutations, per_node

In [10]:
mutation_summary(ts_wide)

(1213193, 0.83)

In [11]:
mutation_summary(ts_long)

(1062072, 1.36)

# Edges

In [12]:
def edge_summary(ts):
    """
    Tabulate edges by the type of their parent node.

    The first row is the overall edge count. It is followed by one row per
    entry of ``node_type_map``, giving the number of edges whose *parent*
    node has flags exactly equal to that entry's value and the share of all
    edges this represents, rounded to 2 decimal places.

    The "total" row carries ``percentage`` 100 so that the column has no
    missing value, matching the tables built by ``summarise_nodes`` and
    ``mutation_breakdown``.

    :param ts: Object exposing ``num_edges``, ``nodes_flags`` and
        ``edges_parent``.
    :return: ``pandas.DataFrame`` with columns ``property``, ``count`` and
        ``percentage``.
    """
    total_edges = ts.num_edges
    # Flags of each edge's parent node; the same for every row, so look it up once.
    parent_flags = ts.nodes_flags[ts.edges_parent]

    rows = [{"property": "total", "count": total_edges, "percentage": 100}]
    for label, flag_value in node_type_map.items():
        num = np.sum(parent_flags == flag_value)
        rows.append(
            {
                "property": label,
                "count": num,
                "percentage": round((num / total_edges) * 100, 2),
            }
        )
    return pd.DataFrame(rows)

edge_summary(ts_wide)

Unnamed: 0,property,count,percentage
0,total,1458146,
1,Sample,610729,41.88
2,Sample cluster,470545,32.27
3,Reversion Push,184608,12.66
4,Mutation Collapse,186218,12.77
5,Recombinant,6046,0.41


In [13]:
edge_summary(ts_long)

Unnamed: 0,property,count,percentage
0,total,785539,
1,Sample,319626,40.69
2,Sample cluster,156881,19.97
3,Reversion Push,144991,18.46
4,Mutation Collapse,160833,20.47
5,Recombinant,3208,0.41


# Mutation counts

In [14]:
def mutation_count_summary(ts):
    """
    Summarise how mutations are distributed over sites, nodes and samples.

    Mutations are counted per site and per node with ``np.bincount`` (padded
    with zeros up to ``num_sites`` / ``num_nodes`` so that entries without
    any mutation are included). The per-sample counts are the per-node
    counts restricted to ``ts.samples()``.

    :param ts: Object exposing ``mutations_site``, ``mutations_node``,
        ``num_sites``, ``num_nodes`` and ``samples()``.
    :return: ``pandas.DataFrame`` with one row each for ``site``, ``node``
        and ``sample`` and columns ``type``, ``mean`` and ``stddev`` (the
        latter computed with ``np.std`` and its default arguments).
    """
    per_site = np.bincount(ts.mutations_site, minlength=ts.num_sites)
    per_node = np.bincount(ts.mutations_node, minlength=ts.num_nodes)
    counts_by_type = {
        "site": per_site,
        "node": per_node,
        "sample": per_node[ts.samples()],
    }
    rows = [
        {"type": label, "mean": np.mean(counts), "stddev": np.std(counts)}
        for label, counts in counts_by_type.items()
    ]
    return pd.DataFrame(rows)
    
    
mutation_count_summary(ts_wide)

Unnamed: 0,type,mean,stddev
0,site,41.23,108.16
1,node,0.83,1.4
2,sample,0.77,1.39


In [15]:
mutation_count_summary(ts_long)

Unnamed: 0,type,mean,stddev
0,site,36.1,80.03
1,node,1.36,1.72
2,sample,1.38,1.77


# File sizes

In [16]:
def strip_metadata(source_file, dest_file):
    """
    Write a copy of a compressed tree sequence without node, mutation and
    site metadata.

    The node, mutation and site tables are each rebuilt from all of their
    columns except ``metadata``, ``metadata_offset`` and ``metadata_schema``.
    The result is compressed with tszip to ``dest_file``.

    :param source_file: tszip-compressed tree sequence to read.
    :param dest_file: Destination handed to ``tszip.compress``.
    :return: The tree sequence built from the modified tables.
    """
    tables = tszip.decompress(source_file).dump_tables()

    for table in (tables.nodes, tables.mutations, tables.sites):
        columns = table.asdict()
        # ``del`` (not ``pop`` with a default) so a missing key still raises.
        for key in ("metadata", "metadata_offset", "metadata_schema"):
            del columns[key]
        table.set_columns(**columns)

    stripped_ts = tables.tree_sequence()
    tszip.compress(stripped_ts, dest_file)
    return stripped_ts

def no_metadata_size(path):
    """
    Return the compressed file size per sample once metadata is stripped.

    ``strip_metadata`` writes a metadata-free, tszip-compressed copy of the
    tree sequence at ``path``; the size of that file in bytes is divided by
    the number of samples in the tree sequence.

    The copy is written inside a temporary directory that is removed before
    returning, so repeated or concurrent calls neither leave a scratch file
    in the working directory nor overwrite one another's output.

    :param path: Location of a tszip-compressed tree sequence.
    :return: Bytes per sample, rounded to 2 decimal places.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        stripped_path = pathlib.Path(tmp_dir) / "tmp.ts.tsz"
        ts = strip_metadata(path, str(stripped_path))
        # Read the size while the temporary directory still exists.
        size_bytes = stripped_path.stat().st_size
    return round(size_bytes / ts.num_samples, 2)


In [17]:
no_metadata_size(ts_long_path)

10.83

In [18]:
%%bash -s "$ts_long_path"
ls -lh $1;

-rw-rw-r-- 1 jk jk 37M Apr 13 11:31 ../data/upgma-mds-1000-md-30-mm-3-2022-06-30-recinfo-gisaid-il.ts.tsz


In [19]:
no_metadata_size(ts_wide_path)

8.29

In [20]:
%%bash -s "$ts_wide_path"
ls -lh $1;

-rw-rw-r-- 1 jk jk 58M Apr 13 11:30 ../data/upgma-full-md-30-mm-3-2021-06-30-recinfo-gisaid-il.ts.tsz


# Mutation breakdown

Data for the breakdown of mutation types shown in the figure.

In [37]:
def mutation_breakdown(ti):
    """
    Tabulate the number and percentage of mutations in each category.

    Values are looked up by name in the ``property``/``value`` table returned
    by ``ti.summary()``. The first row is the total mutation count
    (percentage 100); each following row gives one category's count and its
    share of the total, rounded to 2 decimal places like the other summary
    tables in this notebook.

    :param ti: Object whose ``summary()`` returns a ``pandas.DataFrame`` with
        ``property`` and ``value`` columns containing at least ``mutations``
        and every key listed in ``name_map`` below.
    :return: ``pandas.DataFrame`` with columns ``property``, ``count`` and
        ``percentage``.
    :raises KeyError: If an expected property is missing from the summary.
    """
    summary = ti.summary()
    source = dict(zip(summary["property"], summary["value"]))

    # Displayed row label -> property name in the summary table.
    name_map = {
        "Recurrent": "recurrent",
        "Reversions": "reversions",
        "Private": "private_mutations",
        "Transitions": "transitions",
        "Transversions": "transversions",
        "Insertions": "insertions",
        "Deletions": "deletions",
    }

    num_mutations = source["mutations"]
    rows = [{"property": "total", "count": num_mutations, "percentage": 100}]
    # The loop variable is ``label`` rather than ``display`` so that it does
    # not shadow IPython's ``display`` function.
    for label, key in name_map.items():
        count = source[key]
        rows.append(
            {
                "property": label,
                "count": count,
                "percentage": round(100 * count / num_mutations, 2),
            }
        )
    return pd.DataFrame(rows)



## Wide ARG

In [39]:
mutation_breakdown(ti_wide)

Unnamed: 0,property,count,percentage
0,total,1213193,100.0
1,Recurrent,74719,6.16
2,Reversions,72617,5.99
3,Private,758903,62.55
4,Transitions,873487,72.0
5,Transversions,326053,26.88
6,Insertions,6191,0.51
7,Deletions,7462,0.62


## Long ARG

In [40]:
mutation_breakdown(ti_long)

Unnamed: 0,property,count,percentage
0,total,1062072,100.0
1,Recurrent,50099,4.72
2,Reversions,48226,4.54
3,Private,767111,72.23
4,Transitions,783773,73.8
5,Transversions,270333,25.45
6,Insertions,2814,0.26
7,Deletions,5152,0.49
