In [10]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=20, use_memory_fs=False)

import os
from ete3 import Tree
import random

import sys
sys.path.append("/groups/itay_mayrose/halabikeren/tmp/ploidb/data_processing/")
from check_tree_monophyly import get_largest_monophyletic_group

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# --- Inputs: name lists before resolution (only their row counts are used in the visible cells) ---
ccdb_unresolved_names_path = "/groups/itay_mayrose/halabikeren/PloiDB/name_resolution/ccdb_unresolved_names.csv"
allotb_unresolved_names_path = "/groups/itay_mayrose/halabikeren/PloiDB/name_resolution/ALLOTB_tree_unresolved_names.csv"
# NOTE(review): the variable says "allotb" but the file is ALLMB.tre, and it is not
# referenced by any visible cell (the tree is loaded from unresolved_tree_path) — confirm or remove.
allotb_tree_path = "/groups/itay_mayrose/halabikeren/PloiDB/trees/ALLMB.tre" 

# --- Inputs: name-mapping tables read by process_resolved_names ---
ccdb_resolved_names_path = "/groups/itay_mayrose/anatshafir1/ploidDB/rotl/ALLOTB/stats/resolved_name_mapping_ccdb.csv"
tree_resolved_names_path = "/groups/itay_mayrose/anatshafir1/ploidDB/rotl/ALLOTB/stats/resolved_name_mapping_tree.csv"
intersection_resolved_names_path = "/groups/itay_mayrose/anatshafir1/ploidDB/rotl/ALLOTB/stats/resolved_name_mapping_intersection.csv"

# --- Tree: input (original leaf names) and the outputs written after resolution ---
unresolved_tree_path = "/groups/itay_mayrose/halabikeren/PloiDB/trees/ALLOTB.tre"
resolved_tree_path = "/groups/itay_mayrose/halabikeren/PloiDB/trees/resolved_ALLOTB.nwk"
selected_tree_leaves_path = "/groups/itay_mayrose/halabikeren/PloiDB/trees/selected_ALLOTB_original_names.csv"

# --- CCDB data paths (not referenced by the visible cells) ---
unresolved_ccdb_path = "/groups/itay_mayrose/halabikeren/PloiDB/ccdb/all_data.csv"
resolved_ccdb_path = "/groups/itay_mayrose/halabikeren/PloiDB/ccdb/resolved_data.csv"

In [3]:
# Load the pre-resolution name lists; in the visible cells they only serve as
# denominators for the coverage percentages printed after name processing.
ccdb_unresolved_names = pd.read_csv(ccdb_unresolved_names_path)
tree_unresolved_names = pd.read_csv(allotb_unresolved_names_path)

In [4]:
def process_tree(path: str) -> Tree:
    """Load a tree and normalize its leaf names.

    Every leaf name is lower-cased and its underscores are replaced by spaces.
    When a normalized name was already seen on an earlier leaf, the later leaf
    is reported and detached from the tree, so each normalized name is kept once.

    :param path: path to the tree file, parsed with ete3 ``format=1``
    :return: the loaded tree with normalized, de-duplicated leaf names
    """
    tree = Tree(path, format=1)
    seen_names = set()
    # get_leaves() is materialized up front, so detaching while looping is safe
    for node in tree.get_leaves():
        normalized = node.name.lower().replace("_", " ")
        if normalized not in seen_names:
            node.name = normalized
            seen_names.add(normalized)
            continue
        # duplicate after normalization: keep the first occurrence only
        print(f"{normalized} already in tree")
        node.detach()
    return tree
    
# Load the tree with its original names, normalized by process_tree
unresolved_tree = process_tree(unresolved_tree_path)

In [6]:
def process_resolved_names(path: str) -> pd.DataFrame:
    """Load a name-mapping table and normalize it.

    The CSV must provide the columns ``original_name``, ``search_string``,
    ``unique_name`` and ``ott_id``. ``search_string`` is renamed to
    ``corrected_original_name`` and ``unique_name`` to ``resolved_name``; the
    three name columns are lower-cased and the rows are sorted by ``ott_id``.

    :param path: path to the mapping CSV
    :return: dataframe with the columns ``original_name``,
        ``corrected_original_name``, ``resolved_name`` and ``ott_id``
    :raises KeyError: if one of the required columns is missing
    """
    resolved_names = (
        pd.read_csv(path)
        # "Unnamed: 0" only exists when the CSV was written together with its
        # index; tolerate files saved with index=False instead of raising
        .drop(columns="Unnamed: 0", errors="ignore")
        .rename(columns={"search_string": "corrected_original_name",
                         "unique_name": "resolved_name"})
    )
    for column in ("original_name", "corrected_original_name", "resolved_name"):
        resolved_names[column] = resolved_names[column].str.lower()
    resolved_names = resolved_names.sort_values("ott_id")
    return resolved_names[["original_name", "corrected_original_name", "resolved_name", "ott_id"]]

# Normalize the three name-mapping tables (ccdb, tree, and their intersection)
ccdb_resolved_names = process_resolved_names(path=ccdb_resolved_names_path)
tree_resolved_names = process_resolved_names(path=tree_resolved_names_path)
intersection_resolved_names = process_resolved_names(path=intersection_resolved_names_path)
# Coverage of ccdb / tree = rows in the mapping table relative to rows in the unresolved name list
print(f"ccdb coverage = {np.round(ccdb_resolved_names.shape[0]/ccdb_unresolved_names.shape[0]*100,2)}%")
print(f"tree coverage = {np.round(tree_resolved_names.shape[0]/tree_unresolved_names.shape[0]*100,2)}%")
# Intersection coverage = distinct ott_ids in the intersection relative to distinct ott_ids in the tree mapping
print(f"intersection coverage = {np.round(len(intersection_resolved_names.ott_id.unique())/len(tree_resolved_names.ott_id.unique())*100,2)}%")

ccdb coverage = 90.0%
tree coverage = 96.96%
intersection coverage = 19.34%


In [12]:
def add_resolved_names(tree: Tree, resolved_names: pd.DataFrame):
    """Annotate every leaf of ``tree`` with a ``resolved_name`` feature.

    The value is looked up by the leaf's current name in the ``original_name`` ->
    ``resolved_name`` mapping of ``resolved_names``. The tree is modified in place.

    :param tree: tree whose leaf names appear in ``resolved_names.original_name``
    :param resolved_names: dataframe with ``original_name`` and ``resolved_name`` columns
    :raises KeyError: if a leaf name has no entry in the mapping
    """
    # on repeated original names the last row wins
    name_lookup = dict(zip(resolved_names["original_name"], resolved_names["resolved_name"]))
    for leaf in tree.get_leaves():
        leaf.add_feature(pr_name="resolved_name", pr_value=name_lookup[leaf.name])


def select_orig_name(record: pd.Series, tree: Tree, ccdb_orig_names: list[str]) -> str:
    """Pick the single original leaf name that will represent a resolved name.

    Decision order:
      1. only one original name -> return it;
      2. the leaves carrying the resolved name are monophyletic -> random original name;
      3. otherwise, if ``get_largest_monophyletic_group`` reports a group that is
         larger than the rest -> random leaf name from under that group's root;
      4. otherwise, if exactly one original name appears in ``ccdb_orig_names`` -> that name;
      5. otherwise -> random original name.

    Random picks use the global ``random`` module state.

    :param record: row exposing ``resolved_name`` and ``original_names`` (a list of names)
    :param tree: tree whose leaves carry a ``resolved_name`` feature
    :param ccdb_orig_names: original names known to ccdb, used as a tie breaker
    :return: the selected original name
    """
    target_name = record.resolved_name
    candidates = record.original_names
    if len(candidates) == 1:
        return candidates[0]

    is_monophyletic, _clade_type, _violators = tree.check_monophyly(
        values=[target_name], target_attr="resolved_name"
    )
    if not is_monophyletic:
        # fall back to the whole tree when the common ancestor cannot be computed
        lca = tree
        try:
            matching_leaves = [leaf for leaf in tree.get_leaves() if leaf.resolved_name == target_name]
            lca = tree.get_common_ancestor(matching_leaves)
        except Exception as e:
            print(f"Couldn't find lca for {', '.join(candidates)} due to error {e}, and will thus set the root as lca")
        # NOTE(review): the helper is defined outside this notebook; its return is
        # unpacked as (group root, group size, larger-than-rest flag) — confirm semantics
        mono_root, _mono_size, is_larger_than_rest = get_largest_monophyletic_group(
            root=lca, property_value=target_name, property_name="resolved_name"
        )
        if is_larger_than_rest:
            return random.choice(mono_root.get_leaf_names())
        candidates_in_ccdb = [name for name in candidates if name in ccdb_orig_names]
        if len(candidates_in_ccdb) == 1:
            return candidates_in_ccdb[0]
    return random.choice(candidates)
  
    
def resolve_tree(tree: Tree, resolved_names: pd.DataFrame, ccdb_orig_names: list[str]) -> tuple[Tree, pd.DataFrame]:
    """Build a copy of ``tree`` whose leaves are unique resolved names.

    Steps:
      1. copy the tree and prune it to the leaves listed in ``resolved_names.original_name``;
      2. annotate each remaining leaf with its resolved name;
      3. for every resolved name, choose one representative original leaf
         (see ``select_orig_name``; the choice may be random);
      4. prune to the chosen leaves and rename them to their resolved names.

    Both prune steps pass ``preserve_branch_length=True`` so that the branch
    lengths of removed internal nodes are added to the surviving branches;
    with ete3's default (False) the distances among the kept leaves change.

    :param tree: tree with original leaf names; it is not modified
    :param resolved_names: dataframe with ``original_name`` and ``resolved_name`` columns
    :param ccdb_orig_names: original names known to ccdb, forwarded to ``select_orig_name``
    :return: the resolved tree, and a dataframe with the columns ``resolved_name``,
        ``original_names`` (list) and ``selected_original_name``
    """
    resolved_tree = tree.copy()
    print(f"original number of leaves = {len(resolved_tree.get_leaf_names()):,}")
    leaves_to_keep = resolved_names.original_name.tolist()
    resolved_tree.prune(leaves_to_keep, preserve_branch_length=True)
    add_resolved_names(tree=resolved_tree, resolved_names=resolved_names)
    print(f"number of leaves after prunning of unmapped names to ccdb = {len(resolved_tree.get_leaf_names()):,}")
    # one row per resolved name, holding the list of original names mapped to it
    resolved_to_orig = (
        resolved_names.groupby("resolved_name")["original_name"]
        .apply(lambda names: names.tolist())
        .reset_index()
        .rename(columns={"original_name": "original_names"})
    )
    resolved_to_orig["selected_original_name"] = resolved_to_orig.parallel_apply(
        lambda record: select_orig_name(record=record, tree=resolved_tree, ccdb_orig_names=ccdb_orig_names),
        axis=1,
    )
    resolved_tree.prune(resolved_to_orig["selected_original_name"].tolist(), preserve_branch_length=True)
    print(f"number of leaves after removal of leaves with identical resolved name = {len(resolved_tree.get_leaf_names()):,}")
    # the leaves still carry original names; switch them to the resolved ones
    for leaf in resolved_tree.get_leaves():
        leaf.name = leaf.resolved_name
    return resolved_tree, resolved_to_orig


# Resolve the tree against the intersection mapping; original names from the ccdb mapping act as tie breakers
resolved_tree, resolved_to_selected_orig = resolve_tree(tree=unresolved_tree, resolved_names=intersection_resolved_names, ccdb_orig_names=ccdb_resolved_names.original_name.tolist())
# Persist the resolved tree (ete3 format=1) and the table of selected original names
resolved_tree.write(outfile=resolved_tree_path, format=1)
resolved_to_selected_orig.to_csv(selected_tree_leaves_path, index=False)

original number of leaves = 353,185
number of leaves after prunning of unmapped names to ccdb = 74,569


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2996), Label(value='0 / 2996'))), …

number of leaves after removal of leaves with identical resolved name = 59,909


In [22]:
# Count how many original names map to each resolved name (adds a column to the frame in place)
resolved_to_selected_orig["num_original_names"] = resolved_to_selected_orig.original_names.apply(len)
# Display resolved names backed by more than two original names, largest groups first
resolved_to_selected_orig.sort_values("num_original_names", ascending=False).loc[resolved_to_selected_orig.num_original_names > 2]

Unnamed: 0,resolved_name,original_names,selected_original_name,num_original_names
55126,taraxacum sp.,"[taraxacum sp. abc845, taraxacum sp. ad972, ta...",taraxacum sp. js 8129,64
48421,rubus setosus,"[rubus sp. ms-2014l, rubus sp. dis6, rubus sp....",rubus sp. dis12,63
50053,schismatoglottis tecturata,"[schismatoglottis sp. sll-2016c, schismatoglot...",schismatoglottis sp. ar4096,49
40648,pandanus tectorius,"[pandanus acuminatus, pandanus soboliferus, pa...",pandanus tectorius,33
48005,rosa canina,"[rosa dumetorum, rosa adenocalyx, rosa ambigua...",rosa semiglandulosa,23
...,...,...,...,...
58915,viola nephrophylla,"[viola mccabeiana, viola austinae, viola nephr...",viola nephrophylla,3
21169,elymus panormitanus,"[elymus panormitanus, agropyron panormitanum, ...",agropyron panormitanum,3
45047,potamogeton fluitans,"[potamogeton oblongus, potamogeton harzii, pot...",potamogeton oblongus,3
19026,dichanthelium depauperatum,"[panicum muehlenbergianum, dichanthelium depau...",panicum incomptum,3
