# Imputing lineages for reconstructed internal nodes

In [1]:
import tskit
import tszip
import pandas as pd
import tqdm

import sys
sys.path.append("../")
import sc2ts.utils
import sc2ts.lineages

In [2]:
# --- Input locations (relative to this notebook) ---------------------------
# Per-lineage consensus mutations and the GISAID metadata table.
mutations_json_filepath = "../../sc2ts_ts/consensus_mutations.json"
gisaid_metadata_filepath = "../../sc2ts_ts/metadata_tsv_2023_03_09/metadata.tsv"
# Path stems of the two tree sequences; suffixes are appended when reading
# and, further down, when writing the edited copies.
ts_long_path = "../../sc2ts_ts/upgma-mds-1000-md-30-mm-3-2022-06-30-recinfo"
ts_wide_path = "../../sc2ts_ts/upgma-full-md-30-mm-3-2021-06-30-recinfo"

# --- Load both tree sequences, then build a TreeInfo for each --------------
# Order is long first, then wide, so the progress bars appear in that order.
ts_long, ts_wide = (
    tszip.decompress(stem + "-il.ts.tsz") for stem in (ts_long_path, ts_wide_path)
)
ti_long, ti_wide = (sc2ts.utils.TreeInfo(ts) for ts in (ts_long, ts_wide))

Counting descendants : 100%|███████████████████████████████████████| 783231/783231 [00:00<00:00, 3460493.66it/s]
Indexing metadata    : 100%|█████████████████████████████████████████| 783231/783231 [00:08<00:00, 93027.58it/s]
Classifying mutations: 100%|██████████████████████████████████████| 1062072/1062072 [00:07<00:00, 142610.22it/s]
Counting descendants : 100%|█████████████████████████████████████| 1453347/1453347 [00:00<00:00, 3336969.57it/s]
Indexing metadata    : 100%|███████████████████████████████████████| 1453347/1453347 [00:16<00:00, 87669.72it/s]
Classifying mutations: 100%|██████████████████████████████████████| 1213193/1213193 [00:08<00:00, 141461.46it/s]


# GISAID vs Nextclade lineage comparison

In [3]:
# Only the accession ID and the Pango lineage are used downstream, so read
# just those two columns. The full GISAID table has millions of rows and many
# columns; parsing all of them wastes memory and is the likely source of the
# pandas warning previously emitted by this cell. Both columns are forced to
# strings so the result does not depend on chunk-wise type inference
# (missing values are still read as NaN).
md = pd.read_table(
    gisaid_metadata_filepath,
    usecols=['Accession ID', 'Pango lineage'],
    dtype=str,
)

  md = pd.read_table(gisaid_metadata_filepath)


In [4]:
# Pair every accession ID with its GISAID-assigned Pango lineage, as a list
# of (accession_id, lineage) tuples in table order.
gisaid_data = list(zip(md['Accession ID'], md['Pango lineage']))

In [5]:
# Read the mutation definitions from the consensus-mutations JSON file.
# The result is passed to sc2ts.utils.check_lineages in the following cells.
# NOTE(review): presumably keyed by lineage name; confirm against sc2ts.lineages.
linmuts_dict = sc2ts.lineages.read_in_mutations(mutations_json_filepath)

In [6]:
# Compare the lineages stored in the "long" tree sequence with the GISAID
# (accession ID, Pango lineage) pairs. Judging by the printed summary, the
# call reports how many samples were matched to GISAID data and how many
# lineage assignments differ, and fills missing GISAID lineages with the
# Nextclade ones. The returned object is what lineage imputation runs on.
# NOTE(review): despite its name, diff_filehandle is given a path string here;
# presumably the disagreements are written there. Confirm in sc2ts.utils.
ts_long_gisaid = sc2ts.utils.check_lineages(
    ts_long,
    ti_long,
    gisaid_data,
    linmuts_dict,
    diff_filehandle='../../sc2ts_ts/lineage_disagreement_long',
)

100%|███████████████████████████████████████████████████████████| 15115274/15115274 [00:15<00:00, 982823.61it/s]


ts number of samples: 657239
number matched to gisaid data: 657168
number of differences: 46311
proportion: 0.0704705646044847
Filling in missing GISAID lineages with Nextclade lineages: 185


In [7]:
# Same GISAID-vs-stored lineage comparison as for the "long" tree sequence,
# applied to the "wide" one. The returned object is what lineage imputation
# runs on further down.
# NOTE(review): despite its name, diff_filehandle is given a path string here;
# presumably the disagreements are written there. Confirm in sc2ts.utils.
ts_wide_gisaid = sc2ts.utils.check_lineages(
    ts_wide,
    ti_wide,
    gisaid_data,
    linmuts_dict,
    diff_filehandle='../../sc2ts_ts/lineage_disagreement_wide',
)

100%|███████████████████████████████████████████████████████████| 15115274/15115274 [00:21<00:00, 715844.46it/s]


ts number of samples: 1265685
number matched to gisaid data: 1265683
number of differences: 65677
proportion: 0.05189056027457112
Filling in missing GISAID lineages with Nextclade lineages: 0


# Tree sequence lineage imputation

In [8]:
# Impute lineages on the "long" tree sequence. internal_only=False asks for
# sample nodes to be imputed as well as internal ones, which is what lets the
# summary report correctly/incorrectly imputed samples.
# The captured output shows two "Inferring lineages..." passes; presumably one
# per lineage source (GISAID and Nextclade). Confirm in sc2ts.utils.
# NOTE(review): the second pass prints "Internal nodes imputed: 123948 out of
# possible 123914", i.e. more than possible; the node counting inside
# lineage_imputation looks worth checking.
edited_ts_long = sc2ts.utils.lineage_imputation(
    mutations_json_filepath,
    ts_long_gisaid, 
    ti_long,
    internal_only=False,
    verbose=False
)

Recording relevant mutations for each node...


  0%|          | 0/1062072 [00:00<?, ?it/s]

Inferring lineages...


  0%|          | 0/781152 [00:00<?, ?it/s]

------------------------------
Sample nodes imputed: 657239 out of possible 657239
Internal nodes imputed: 123914 out of possible 123914
Total imputed: 781153 out of possible 781153
Number of recombinants (not imputed): 2078
------------------------------
Correctly imputed samples: 639658 ( 97.789 % )
Incorrectly imputed samples: 14460 ( 2.211 % )
Imputed using inheritance: 518270 ( 66.347 % ) decision tree: 262883 ( 33.653 % )
------------------------------
Time: 328.4449107646942
Inferring lineages...


  0%|          | 0/781152 [00:00<?, ?it/s]

------------------------------
Sample nodes imputed: 657205 out of possible 657239
Internal nodes imputed: 123948 out of possible 123914
Total imputed: 781153 out of possible 781153
Number of recombinants (not imputed): 2078
------------------------------
Correctly imputed samples: 634978 ( 97.084 % )
Incorrectly imputed samples: 19070 ( 2.916 % )
Imputed using inheritance: 518268 ( 66.347 % ) decision tree: 262885 ( 33.653 % )
------------------------------
Time: 355.47603726387024


In [9]:
# Persist the edited "long" tree sequence twice: once as a plain .ts file and
# once tszip-compressed, both next to the input it was derived from.
long_ts_outfile = ts_long_path + "-gisaid-il.ts"
long_tsz_outfile = ts_long_path + "-gisaid-il.ts.tsz"
edited_ts_long.dump(long_ts_outfile)
tszip.compress(edited_ts_long, long_tsz_outfile)

In [10]:
# Agreement between the two imputed labels on the "long" tree sequence.
# Only nodes with NO observed label ('GISAID_lineage' / 'Nextclade_pango')
# but with BOTH imputed labels are counted; the printed value is the fraction
# of those nodes whose two imputed lineages are identical.
correct = total = 0
for node in edited_ts_long.nodes():
    node_md = node.metadata  # look the metadata up once per node
    has_observed = 'GISAID_lineage' in node_md or 'Nextclade_pango' in node_md
    has_both_imputed = (
        'Imputed_GISAID_lineage' in node_md
        and 'Imputed_Nextclade_pango' in node_md
    )
    if has_observed or not has_both_imputed:
        continue
    total += 1
    if node_md['Imputed_GISAID_lineage'] == node_md['Imputed_Nextclade_pango']:
        correct += 1
# Print NaN instead of raising ZeroDivisionError when no node qualifies.
print(correct / total if total else float("nan"))

0.9398057019493301


In [11]:
# Impute lineages on the "wide" tree sequence. internal_only=False asks for
# sample nodes to be imputed as well as internal ones, which is what lets the
# summary report correctly/incorrectly imputed samples.
# The captured output shows two "Inferring lineages..." passes; presumably one
# per lineage source (GISAID and Nextclade). Confirm in sc2ts.utils.
edited_ts_wide = sc2ts.utils.lineage_imputation(
    mutations_json_filepath,
    ts_wide_gisaid, 
    ti_wide,
    internal_only=False,
    verbose=False
)

Recording relevant mutations for each node...


  0%|          | 0/1213193 [00:00<?, ?it/s]

Inferring lineages...


  0%|          | 0/1449223 [00:00<?, ?it/s]

------------------------------
Sample nodes imputed: 1265685 out of possible 1265685
Internal nodes imputed: 183539 out of possible 183539
Total imputed: 1449224 out of possible 1449224
Number of recombinants (not imputed): 4123
------------------------------
Correctly imputed samples: 1250162 ( 99.203 % )
Incorrectly imputed samples: 10045 ( 0.797 % )
Imputed using inheritance: 1160067 ( 80.047 % ) decision tree: 289157 ( 19.953 % )
------------------------------
Time: 545.9626221656799
Inferring lineages...


  0%|          | 0/1449223 [00:00<?, ?it/s]

------------------------------
Sample nodes imputed: 1265685 out of possible 1265685
Internal nodes imputed: 183539 out of possible 183539
Total imputed: 1449224 out of possible 1449224
Number of recombinants (not imputed): 4123
------------------------------
Correctly imputed samples: 1244789 ( 98.777 % )
Incorrectly imputed samples: 15416 ( 1.223 % )
Imputed using inheritance: 1160067 ( 80.047 % ) decision tree: 289157 ( 19.953 % )
------------------------------
Time: 561.4245040416718


In [12]:
# Persist the edited "wide" tree sequence twice: once as a plain .ts file and
# once tszip-compressed, both next to the input it was derived from.
wide_ts_outfile = ts_wide_path + "-gisaid-il.ts"
wide_tsz_outfile = ts_wide_path + "-gisaid-il.ts.tsz"
edited_ts_wide.dump(wide_ts_outfile)
tszip.compress(edited_ts_wide, wide_tsz_outfile)

In [13]:
# Agreement between the two imputed labels on the "wide" tree sequence.
# Only nodes with NO observed label ('GISAID_lineage' / 'Nextclade_pango')
# but with BOTH imputed labels are counted; the printed value is the fraction
# of those nodes whose two imputed lineages are identical.
correct = total = 0
for node in edited_ts_wide.nodes():
    node_md = node.metadata  # look the metadata up once per node
    has_observed = 'GISAID_lineage' in node_md or 'Nextclade_pango' in node_md
    has_both_imputed = (
        'Imputed_GISAID_lineage' in node_md
        and 'Imputed_Nextclade_pango' in node_md
    )
    if has_observed or not has_both_imputed:
        continue
    total += 1
    if node_md['Imputed_GISAID_lineage'] == node_md['Imputed_Nextclade_pango']:
        correct += 1
# Print NaN instead of raising ZeroDivisionError when no node qualifies.
print(correct / total if total else float("nan"))

0.94598267097228
