Shift breakpoint interval coordinates

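This notebook is a temporary workaround: it updates the recombinant breakpoint intervals in `../data/recombinants.csv` to match the tree sequence with shifted breakpoints, and writes the result to `../data/recombinants_bpshift.csv`. See this issue: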

https://github.com/jeromekelleher/sc2ts-paper/issues/364

In [1]:
import numpy as np
import pandas as pd
import tszip
import sc2ts

In [2]:
# Tree sequence file with shifted breakpoint intervals (tszip-compressed).
ts_file = "../inference/v1-beta1_2023-02-21.pp.md.bpshift.ts.dated.il.tsz"
# Decompress it into `ts`, which the cells below read edge data from.
ts = tszip.decompress(ts_file)

In [3]:
# Build the sc2ts TreeInfo object for the tree sequence; its `recombinants`
# attribute is iterated in a later cell to enumerate the recombinant nodes.
ti = sc2ts.info.TreeInfo(ts)
# Bare expression so the notebook displays the summary of `ti`.
ti

Counting descendants :   0%|          | 0/2689054 [00:00<?, ?it/s]

Indexing metadata    :   0%|          | 0/2689054 [00:00<?, ?it/s]

Classifying mutations:   0%|          | 0/1922947 [00:00<?, ?it/s]

Unnamed: 0_level_0,value
property,Unnamed: 1_level_1
latest_sample,2023-02-20
samples,2482157
nodes,2689054
exact_matches,1252208
mc_nodes,73166
pr_nodes,40099
re_nodes,929
imr_nodes,53
mutations,1922947
recurrent,34493


In [4]:
# Recombinants data file with unshifted breakpoint intervals.
# https://github.com/jeromekelleher/sc2ts-paper/commit/c7e26e629a955ef093cd5d9aed6d7c47daf2ee2a
old_file = "../data/recombinants.csv"
# NOTE(review): the CSV appears to carry a saved index column, which is read
# back as "Unnamed: 0" (see the displayed row) - confirm it is not needed.
old_df = pd.read_csv(old_file)
# Show a single row to check which columns were read.
old_df.head(1)

Unnamed: 0.1,Unnamed: 0,recombinant,descendants,sample,sample_id,sample_pango,interval_left,interval_right,num_mutations,date_added,group_id,parent_left,parent_left_pango,parent_right,parent_right_pango,mrca,t_mrca,diffs,max_run_length
0,766,1280342,1,1280341,ERR9939974,BA.4.1,695,958,1,2022-06-27,96ff31d5f4931e21077b36d955f3d19b,1253363,BA.5.2.1,1232376,BA.4.1,59,1114.000001,25,1


In [5]:
# Right end of each recombinant's breakpoint interval in the shifted tree
# sequence: the largest left coordinate among the edges whose child is that
# recombinant node.
#
# The edge table is scanned once and grouped by child node, instead of
# re-scanning every edge for each recombinant.
recombinant_nodes = np.array(list(ti.recombinants), dtype=np.int64)
edges_child = ts.edges_child
is_recombinant_edge = np.isin(edges_child, recombinant_nodes)
max_edge_left = (
    pd.Series(ts.edges_left[is_recombinant_edge])
    .groupby(edges_child[is_recombinant_edge].astype(np.int64))
    .max()
    # Restore the order of ti.recombinants; nodes without edges become NaN.
    .reindex(recombinant_nodes)
)
# A recombinant that is never the child of an edge has no interval to report;
# fail loudly rather than writing a missing value.
if max_edge_left.isna().any():
    missing_nodes = max_edge_left.index[max_edge_left.isna()].tolist()
    raise ValueError(f"No edges found for recombinant nodes: {missing_nodes}")
new_df = pd.DataFrame(
    {
        "recombinant": recombinant_nodes,
        # Coordinates are stored as floats in the edge table; keep them as integers.
        "interval_right_shifted": max_edge_left.to_numpy().astype(np.int64),
    }
)
new_df.head(1)

Unnamed: 0,recombinant,interval_right_shifted
0,1530,13617


In [6]:
# Attach the shifted right coordinate to the existing per-recombinant records.
# validate="one_to_one" makes pandas raise MergeError if a recombinant id is
# repeated on either side, which would otherwise duplicate rows silently.
merged_df = new_df.merge(old_df, on="recombinant", validate="one_to_one")
# The merge is an inner join, so a recombinant absent from the old data file
# would be dropped without warning; check that none were.
assert len(merged_df) == len(new_df), (
    f"{len(new_df) - len(merged_df)} recombinants are missing from the old data file"
)
merged_df.head(1)

Unnamed: 0.1,recombinant,interval_right_shifted,Unnamed: 0,descendants,sample,sample_id,sample_pango,interval_left,interval_right,num_mutations,date_added,group_id,parent_left,parent_left_pango,parent_right,parent_right_pango,mrca,t_mrca,diffs,max_run_length
0,1530,13617,0,1,1529,ERR4437465,B.1.157,8783,13617,2,2020-03-22,052b938d3189a1c873abd3ffd894e4c6,1121,A.5,832,B.1.157,1,1153.0,13,0


In [7]:
# Count the rows whose right coordinate differs between the shifted tree
# sequence and the old data file.
right_changed = merged_df["interval_right_shifted"].ne(merged_df["interval_right"])
num_diffs = right_changed.sum()
print(f"Total: {len(merged_df)}")
print(f"Diff: {num_diffs}")

Total: 929
Diff: 102


In [8]:
# Columns of the updated table, in output order.
output_columns = [
    "recombinant",
    "descendants",
    "sample",
    "sample_id",
    "sample_pango",
    "interval_left",
    "interval_right",
    "num_mutations",
    "date_added",
    "group_id",
    "parent_left",
    "parent_left_pango",
    "parent_right",
    "parent_right_pango",
    "mrca",
    "t_mrca",
    "diffs",
    "max_run_length",
]
data = []
for _, row in merged_df.iterrows():
    # Start from the old values; the interval is replaced only if its right
    # end moved in the shifted tree sequence.
    record = {column: row[column] for column in output_columns}
    if row["interval_right_shifted"] != row["interval_right"]:
        record["interval_right"] = row["interval_right_shifted"]
        # Recompute the left end for the new right end from the two parents.
        record["interval_left"] = sc2ts.utils.compute_left_bound(
            ts,
            parents=[row["parent_left"], row["parent_right"]],
            right=record["interval_right"],
        )
    data.append(record)
updated_df = pd.DataFrame(data)
updated_df.head(1)

Unnamed: 0,recombinant,descendants,sample,sample_id,sample_pango,interval_left,interval_right,num_mutations,date_added,group_id,parent_left,parent_left_pango,parent_right,parent_right_pango,mrca,t_mrca,diffs,max_run_length
0,1530,1,1529,ERR4437465,B.1.157,8783,13617,2,2020-03-22,052b938d3189a1c873abd3ffd894e4c6,1121,A.5,832,B.1.157,1,1153.0,13,0


In [9]:
# Save the updated table; index=False keeps the DataFrame index out of the CSV.
updated_df.to_csv("../data/recombinants_bpshift.csv", index=False)