In [1]:
from ete3 import Tree
import pandas as pd
import numpy as np
import itertools
import os
from collections import defaultdict

from tqdm import tqdm
# enable tqdm's pandas integration (no cell visible in this section relies on it)
tqdm.pandas()

from pandarallel import pandarallel
# start pandarallel with a hard-coded pool of 30 workers and progress bars enabled;
# with use_memory_fs=False it reports using standard multiprocessing pipes for data transfer
pandarallel.initialize(progress_bar=True, nb_workers=30, use_memory_fs=False)



INFO: Pandarallel will run on 30 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [17]:
# --- run configuration ---------------------------------------------------------
# backbone tree to expand: "ALLMB" or "ALLOTB"
tree_name = "ALLMB"
# True -> work with the resolved plant names, False -> with the unresolved ones
resolve_plant_names = False

# label of the selected name set, embedded in the output file name
_names_kind = "resolved" if resolve_plant_names else "unresolved"

# inputs: the backbone tree and the two csv files holding the plant names
tree_path = f"/groups/itay_mayrose/halabikeren/plant_pollinator_networks/trees/{tree_name}.tre"
unresolved_plant_names_path = "/groups/itay_mayrose/halabikeren/plant_pollinator_networks/name_resolution/unresolved_plant_names.csv"
resolved_plant_names_path = "/groups/itay_mayrose/halabikeren/plant_pollinator_networks/name_resolution/resolved_plant_names.csv"

# output: where the expanded tree is written
expended_tree_path = f"/groups/itay_mayrose/halabikeren/plant_pollinator_networks/trees/{tree_name}_expanded_by_{_names_kind}_names.tre"

In [18]:
def process_tree(tree_path: str) -> Tree:
    """Load a tree file and normalise its leaf names.

    :param tree_path: path of the tree file, parsed by ete3 with ``format=1``
    :return: the loaded tree, with every leaf name lower-cased and its
        underscores replaced by spaces
    """
    loaded_tree = Tree(tree_path, format=1)
    for tip in loaded_tree.get_leaves():
        # same form as the lower-cased plant names the leaves are matched against
        tip.name = tip.name.lower().replace("_", " ")
    return loaded_tree


tree = process_tree(tree_path)

In [19]:
def _read_unique_lowercase_names(csv_path: str, column: str) -> list:
    """Return the distinct, lower-cased, non-missing values of `column` in the csv at `csv_path`."""
    raw_values = pd.read_csv(csv_path)[column]
    return raw_values.dropna().str.lower().unique().tolist()


unresolved_plant_names = _read_unique_lowercase_names(unresolved_plant_names_path, "Name")
resolved_plant_names = _read_unique_lowercase_names(resolved_plant_names_path, "resolved_name")

# the name list selected by the resolve_plant_names switch
if resolve_plant_names:
    plant_names = resolved_plant_names
else:
    plant_names = unresolved_plant_names

print(f"# unresolved_plant_names = {len(unresolved_plant_names):,}")
print(f"# resolved_plant_names = {len(resolved_plant_names):,}")

# unresolved_plant_names = 5,322
# resolved_plant_names = 3,556


In [20]:
def _split_by_tree_membership(names, leaf_names):
    """Split `names` into (names found in `leaf_names`, names not found), each as a list.

    `leaf_names` must be a set; the order of both returned lists is arbitrary.
    """
    found = list(leaf_names.intersection(set(names)))
    missing = list(set(names) - set(found))
    return found, missing


tree_names = set(tree.get_leaf_names())

unresolved_plant_names_in_tree, unresolved_plant_names_not_in_tree = _split_by_tree_membership(unresolved_plant_names, tree_names)
resolved_plant_names_in_tree, resolved_plant_names_not_in_tree = _split_by_tree_membership(resolved_plant_names, tree_names)

# the pair selected by the resolve_plant_names switch
if resolve_plant_names:
    plant_names_in_tree = resolved_plant_names_in_tree
    plant_names_not_in_tree = resolved_plant_names_not_in_tree
else:
    plant_names_in_tree = unresolved_plant_names_in_tree
    plant_names_not_in_tree = unresolved_plant_names_not_in_tree

print(f"# unresolved plant names that are present in the tree = {len(unresolved_plant_names_in_tree):,}")
print(f"# resolved plant names that are present in the tree = {len(resolved_plant_names_in_tree):,}")

# unresolved plant names that are present in the tree = 2,373
# resolved plant names that are present in the tree = 2,544


In [21]:
# compute the names that can be added to the tree as direct children of their genus ancestor:
# a missing name qualifies when at least one leaf of the tree belongs to the same genus
def _genus_of(name: str) -> str:
    """Return the part of `name` that precedes its first space (the genus)."""
    return name.partition(" ")[0]


tree_genera = {_genus_of(leaf_name) for leaf_name in tree.get_leaf_names()}
names_genera = {_genus_of(name) for name in plant_names}


def _names_with_genus_in_tree(names):
    """Keep, in order, the entries of `names` whose genus is one of `tree_genera`."""
    return [name for name in names if _genus_of(name) in tree_genera]


unresolved_missing_names_that_can_be_added = _names_with_genus_in_tree(unresolved_plant_names_not_in_tree)
resolved_missing_names_that_can_be_added = _names_with_genus_in_tree(resolved_plant_names_not_in_tree)

print(f"# out of {len(unresolved_plant_names_not_in_tree):,} missing unresolved names in the tree, {len(unresolved_missing_names_that_can_be_added):,} can be added to the tree as direct children of their genus ancestor")
print(f"# out of {len(resolved_plant_names_not_in_tree):,} missing resolved names in the tree, {len(resolved_missing_names_that_can_be_added):,} can be added to the tree as direct children of their genus ancestor")

# out of 2,949 missing unresolved names in the tree, 2,264 can be added to the tree as direct children of their genus ancestor
# out of 1,012 missing resolved names in the tree, 949 can be added to the tree as direct children of their genus ancestor


In [23]:
# Expand the tree with the plant names it lacks:
#   1. prune a copy of the tree to the genera that occur in the plant names
#   2. find, per genus, the node under which its missing species are attached
#   3. attach every missing species as a direct child of that node
#   4. prune to the plant names only and write the result to expended_tree_path

# 1. work on a copy, restricted to the genera of interest
tree_with_addition = tree.copy()
names_to_keep = [name for name in tree.get_leaf_names() if name.split(" ")[0] in names_genera]
tree_with_addition.prune(names_to_keep, preserve_branch_length=True)

names_to_add_to_tree = resolved_missing_names_that_can_be_added if resolve_plant_names else unresolved_missing_names_that_can_be_added
print(f"# names that will be added to the tree = {len(names_to_add_to_tree):,}")

# group the missing names by genus; only genera with a leaf in the tree can be placed
genus_to_names_to_add = defaultdict(list)
for name in names_to_add_to_tree:
    genus = name.split(" ")[0]
    if genus in tree_genera:
        genus_to_names_to_add[genus].append(name)
print(f"# genera to add direct children to {len(genus_to_names_to_add):,}")

# index the pruned tree's leaves in a single pass: names per genus, and node per name
# (the node index replaces one full-tree search per single-species genus)
genus_to_tree_names = defaultdict(list)
leaf_by_name = dict()
for tree_leaf in tree_with_addition.iter_leaves():
    genus_to_tree_names[tree_leaf.name.split(" ")[0]].append(tree_leaf.name)
    leaf_by_name.setdefault(tree_leaf.name, tree_leaf)

# 2. the attachment node of a genus is the parent of its only species in the tree,
#    or the common ancestor of its species when there are several.
#    genera are visited in sorted order so that the written tree is reproducible
print(f"computing lca per genus across {len(genus_to_names_to_add):,} genera")
genus_to_ancestor = dict()
for genus in sorted(genus_to_names_to_add):
    genus_names = genus_to_tree_names[genus]
    assert len(genus_names) > 0, f"genus {genus} has no leaves in the pruned tree"
    if len(genus_names) == 1:
        genus_to_ancestor[genus] = leaf_by_name[genus_names[0]].up
        continue
    try:
        genus_to_ancestor[genus] = tree_with_addition.get_common_ancestor(genus_names)
    except Exception as e:
        # best effort: the genus is reported and skipped, its names are left out of the tree
        print(f"could not find the ancestor of genus {genus} spannig species {','.join(genus_names)} due to error {e}")

# 3. attach the missing species; each new leaf gets the distance between the
#    attachment node and its first leaf as its branch length
print(f"adding missing species under lca per genus across {len(genus_to_ancestor):,} genera")
for genus, ancestor in genus_to_ancestor.items():
    # hand ete3 the leaf node itself: a leaf name would be looked up by another traversal
    time_to_leaf = ancestor.get_distance(next(ancestor.iter_leaves()))
    names = set(genus_to_names_to_add[genus]) - set(ancestor.get_leaf_names())
    # sorted: iterating the set directly would order the new children arbitrarily
    for name in sorted(names):
        ancestor.add_child(name=name, dist=time_to_leaf)

# 4. keep only the plant names. prune() raises on names that are not in the tree,
#    so names of genera that were skipped above must be left out of the request
leaves_after_addition = set(tree_with_addition.get_leaf_names())
names_not_added = [name for name in names_to_add_to_tree if name not in leaves_after_addition]
if names_not_added:
    print(f"# names that could not be added to the tree = {len(names_not_added):,}")
plant_names_for_tree = [name for name in plant_names_in_tree + names_to_add_to_tree if name in leaves_after_addition]
tree_with_addition.prune(plant_names_for_tree, preserve_branch_length=True)

print(f"# leafs in new tree = {len(tree_with_addition.get_leaf_names()):,}")
tree_with_addition.write(outfile=expended_tree_path)

# names that will be added to the tree = 2,264
# genera to add direct children to 1,282
computing lca per genus across 1,282 genera
adding missing species under lca per genus across 1,282 genera
# leafs in new tree = 4,637
