# Imputing lineages for reconstructed internal nodes

In [1]:
import tskit
import tszip

import numpy as np
from sklearn import tree
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import pandas as pd

import sys
sys.path.append("../")
import sc2ts.utils
import sc2ts.lineages as lineages

In [2]:
# Load the lineage-defining mutations.
# Provenance: covidcg.org, downloaded 13 Jan 2023 via
#   Compare AA mutations -> download -> consensus mutations -> NT
# using a consensus threshold of 0.9.
linmuts_dict = lineages.read_in_mutations(
    "../sc2ts/data/consensus_mutations.json"
)

Multiallelic sites: 272 out of 4170
Number of lineages: 2046


In [3]:
# Load the same lineage-defining mutations again, this time as data frames:
# `df`, its one-hot-encoded form `df_ohe`, and the encoder object `ohe`.
# ** Assumption: B is taken to be the root lineage **
df, df_ohe, ohe = lineages.read_in_mutations_json(
    "../sc2ts/data/consensus_mutations.json"
)

In [4]:
# Fit a decision tree that maps the one-hot-encoded mutations to lineage labels.
y = df_ohe.index  # lineage labels
# scikit-learn randomly permutes the features at every split, so ties between
# equally good splits are broken at random. Several lineages share identical
# mutation sets, so without a fixed seed the fitted tree (and which of the
# identical lineages gets predicted) can change from one run to the next.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(df_ohe, y)

In [5]:
# Sanity check: predict the lineage of every training row and tally the misses.
y_pred = clf.predict(df_ohe)
correct = 0
incorrect = 0
lineage_definition_issue = 0
for true_lineage, predicted_lineage in zip(y, y_pred):
    if true_lineage != predicted_lineage:
        incorrect += 1
        # A miss counts as a definition issue when both lineages are listed
        # with exactly the same mutations, i.e. they cannot be told apart.
        true_muts = linmuts_dict.get_mutations(true_lineage)
        predicted_muts = linmuts_dict.get_mutations(predicted_lineage)
        if true_muts == predicted_muts:
            lineage_definition_issue += 1
            print(predicted_lineage, "same mutations as", true_lineage)
    else:
        correct += 1
# Printed in order: correct, incorrect, incorrect-with-identical-mutations
print(correct, incorrect, lineage_definition_issue)

AY.104 same mutations as AY.110
AY.120 same mutations as AY.120.2
AY.104 same mutations as AY.131
AY.104 same mutations as AY.33.1
AY.102 same mutations as AY.78
AY.104 same mutations as AY.92
B.1.1 same mutations as B.1.1.174
B.1.1.142 same mutations as B.1.1.257
B.1.1.319 same mutations as B.1.1.370
B.1.1 same mutations as B.1.1.399
B.1.1.319 same mutations as B.1.1.409
B.1.1 same mutations as B.1.1.411
B.1.1 same mutations as B.1.1.419
B.1.1 same mutations as B.1.1.426
B.1 same mutations as B.1.169
B.1 same mutations as B.1.247
B.1 same mutations as B.1.284
B.1.119 same mutations as B.1.326
B.1 same mutations as B.1.383
B.1.119 same mutations as B.1.434
B.1 same mutations as B.1.510
B.1 same mutations as B.1.535
B.1.369 same mutations as B.1.564
B.1.119 same mutations as B.1.576
B.1.119 same mutations as B.1.579
B.1.521 same mutations as B.1.590
B.1.9 same mutations as B.1.9.3
BA.2 same mutations as BA.2.65
B.1.1.33 same mutations as N.8
2017 29 29


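There is an issue with the lineage list: some lineages have exactly the same characteristic mutations, so the decision tree cannot tell them apart. All 29 of the misclassified lineages (out of 2046) fall into this category.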

# ts lineage imputation

In [6]:
# Path prefix (without extension) of the tree sequence to process
ts_path = "../../sc2ts_ts/upgma-full-md-30-mm-3-2020-02-01"

# Load the tree sequence and build its TreeInfo helper
ts = tskit.load(f"{ts_path}.ts")
ti = sc2ts.utils.TreeInfo(ts)

# Impute lineages (internal_only=False, so not restricted to internal nodes)
il, edited_ts = lineages.impute_lineages(
    ts, ti, linmuts_dict, df, ohe, clf, internal_only=False
)

# Save the edited tree sequence, both uncompressed and tszip-compressed
edited_ts.dump(f"{ts_path}-il.ts")
tszip.compress(edited_ts, f"{ts_path}-il.ts.tsz")

Counting descendants: 100%|██████████████████████████████████████| 122/122 [00:00<00:00, 1938276.85it/s]
Indexing metadata   : 100%|███████████████████████████████████████| 122/122 [00:00<00:00, 106939.41it/s]

Recording relevant mutations for each node...





  0%|          | 0/154 [00:00<?, ?it/s]

Inferring lineages...


  0%|          | 0/121 [00:00<?, ?it/s]

------------------------------
Sample nodes imputed: 108 out of possible 108
Internal nodes imputed: 14 out of possible 14
Total imputed: 122 out of possible 122
Number of recombinants (not imputed): 0
------------------------------
Correctly imputed samples: 106 ( 98.148 % )
Incorrectly imputed samples: 2 ( 1.852 % )
Imputed using inheritance: 92 ( 75.41 % ) decision tree: 30 ( 24.59 % )
------------------------------
Time: 0.7662479877471924


In [7]:
# Path prefix (without extension) of the tree sequence to process
ts_path = "../../sc2ts_ts/mp-mds-1000-md-30-mm-3-2022-06-30"

# Load the tree sequence and build its TreeInfo helper
ts = tskit.load(f"{ts_path}.ts")
ti = sc2ts.utils.TreeInfo(ts)

# Impute lineages (internal_only=False, so not restricted to internal nodes)
il, edited_ts = lineages.impute_lineages(
    ts, ti, linmuts_dict, df, ohe, clf, internal_only=False
)

# Save the edited tree sequence, both uncompressed and tszip-compressed
edited_ts.dump(f"{ts_path}-il.ts")
tszip.compress(edited_ts, f"{ts_path}-il.ts.tsz")

Counting descendants: 100%|████████████████████████████████| 783305/783305 [00:00<00:00, 3333325.85it/s]
Indexing metadata   : 100%|██████████████████████████████████| 783305/783305 [00:08<00:00, 95646.41it/s]


Recording relevant mutations for each node...


  0%|          | 0/1060606 [00:00<?, ?it/s]

Inferring lineages...


  0%|          | 0/781215 [00:00<?, ?it/s]

------------------------------
Sample nodes imputed: 657239 out of possible 657239
Internal nodes imputed: 123977 out of possible 123977
Total imputed: 781216 out of possible 781216
Number of recombinants (not imputed): 2089
------------------------------
Correctly imputed samples: 638748 ( 97.666 % )
Incorrectly imputed samples: 15263 ( 2.334 % )
Imputed using inheritance: 518323 ( 66.348 % ) decision tree: 262893 ( 33.652 % )
------------------------------
Time: 729.4269618988037


In [8]:
# Path prefix (without extension) of the tree sequence to process
ts_path = "../../sc2ts_ts/mp-full-md-30-mm-3-2021-05-09"

# Load the tree sequence and build its TreeInfo helper
ts = tskit.load(f"{ts_path}.ts")
ti = sc2ts.utils.TreeInfo(ts)

# Impute lineages (internal_only=False, so not restricted to internal nodes)
il, edited_ts = lineages.impute_lineages(
    ts, ti, linmuts_dict, df, ohe, clf, internal_only=False
)

# Save the edited tree sequence, both uncompressed and tszip-compressed
edited_ts.dump(f"{ts_path}-il.ts")
tszip.compress(edited_ts, f"{ts_path}-il.ts.tsz")

Counting descendants: 100%|██████████████████████████████| 1098284/1098284 [00:00<00:00, 3331373.73it/s]
Indexing metadata   : 100%|████████████████████████████████| 1098284/1098284 [00:14<00:00, 73537.90it/s]


Recording relevant mutations for each node...


  0%|          | 0/944707 [00:00<?, ?it/s]

Inferring lineages...


  0%|          | 0/1095219 [00:00<?, ?it/s]

------------------------------
Sample nodes imputed: 949627 out of possible 949627
Internal nodes imputed: 145593 out of possible 145593
Total imputed: 1095220 out of possible 1095220
Number of recombinants (not imputed): 3064
------------------------------
Correctly imputed samples: 937166 ( 99.126 % )
Incorrectly imputed samples: 8266 ( 0.874 % )
Imputed using inheritance: 870358 ( 79.469 % ) decision tree: 224862 ( 20.531 % )
------------------------------
Time: 782.6351850032806
