In [1]:
import pandas as pd
import numpy as np
import os
from ete3 import Tree

from pandarallel import pandarallel

# Start the pandarallel worker pool once, before any `parallel_apply` call.
# Data is exchanged with the workers through pipes (no in-memory filesystem),
# and a progress bar is shown while the workers run.
pandarallel.initialize(use_memory_fs=False, progress_bar=True)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Genus-partition root tables: the first is read as input, the second is the
# output written at the end of this notebook.
partition_wo_missing_roots_path = "../trees/optimal_ALLMB_roots_for_genus_partition.csv"
partition_w_missing_roots_path = "../trees/optimal_by_wo_missing_ALLMB_roots_for_genus_partition.csv"

# Newick trees: one without and one with the additional "missing" names.
# (Plain string literals: these paths have no placeholders, so no f-prefix.)
tree_wo_missing_path = "../trees/resolved_ALLMB_name_resolution_on_none_with_added_ccdb_names.nwk"
tree_w_missing_path = "../trees/resolved_ALLMB_name_resolution_on_none_with_added_ccdb_and_wo_counts_names.nwk"

# CCDB table path (not read by the cells visible in this notebook section).
ccdb_path = "../ccdb/resolved_data_name_resolved_on_none.csv"

In [3]:
def _load_tree_with_spaced_names(newick_path: str) -> Tree:
    """Parse a Newick file (ete3 ``format=1``) and replace "_" with " " in every leaf name.

    Only leaf names are rewritten; internal node names are left as parsed.
    """
    tree = Tree(newick_path, format=1)
    for leaf in tree.get_leaves():
        leaf.name = leaf.name.replace("_", " ")
    return tree


# Root table computed on the tree without the missing names.
partition_roots_wo_missing = pd.read_csv(partition_wo_missing_roots_path)

# Both trees get the same leaf-name normalisation so their leaf names can be
# compared directly.
tree_wo_missing = _load_tree_with_spaced_names(tree_wo_missing_path)
tree_w_missing = _load_tree_with_spaced_names(tree_w_missing_path)

In [4]:
# Work on an independent (deep) copy so the table read from disk stays untouched
# when the node / num_members / size_subtree columns are overwritten later.
partition_roots_w_missing = partition_roots_wo_missing.copy(deep=True)


def get_updated_genus(record: pd.Series) -> tuple:
    """Re-locate a genus subtree root from ``tree_wo_missing`` inside ``tree_w_missing``.

    The node named ``record.node`` is looked up in ``tree_wo_missing``; its leaves
    that also exist in ``tree_w_missing`` are collected, and their common ancestor
    in ``tree_w_missing`` becomes the new root for the genus.

    Parameters
    ----------
    record : pd.Series
        A row providing ``genus``, ``node`` (node name in ``tree_wo_missing``)
        and ``size_subtree`` (expected number of leaves under that node).

    Returns
    -------
    tuple
        ``(root_name, num_members, size_subtree)`` for the subtree in
        ``tree_w_missing``: the root's name, the number of its leaves whose first
        space-separated token equals ``genus``, and its total number of leaves.
        On any failure the error is printed and ``(nan, nan, nan)`` is returned,
        so one bad row does not abort the whole apply.
    """
    genus = record.genus
    try:
        root_name_wo_missing = record.node
        matches = tree_wo_missing.search_nodes(name=root_name_wo_missing)
        if not matches:
            raise ValueError(f"node {root_name_wo_missing!r} not found in the tree without missing names")
        members_wo_missing = matches[0].get_leaf_names()
        # Explicit check instead of `assert`: survives `python -O` and yields a
        # readable message in the printed error.
        if len(members_wo_missing) != record.size_subtree:
            raise ValueError(
                f"subtree of {root_name_wo_missing!r} has {len(members_wo_missing)} leaves, "
                f"expected {record.size_subtree}"
            )

        # NOTE: the full leaf-name list of tree_w_missing is rebuilt on every call.
        shared_members = set(members_wo_missing) & set(tree_w_missing.get_leaf_names())
        if not shared_members:
            raise ValueError("none of the subtree leaves are present in the tree with missing names")

        if len(shared_members) == 1:
            # With a single target, ete3's get_common_ancestor compares it against
            # the node it is called on (the tree root here) and would return the
            # whole tree; the correct root for one leaf is the leaf itself.
            root_w_missing = tree_w_missing.get_leaves_by_name(next(iter(shared_members)))[0]
        else:
            root_w_missing = tree_w_missing.get_common_ancestor(shared_members)

        leaf_names_w_missing = root_w_missing.get_leaf_names()
        # Compare the whole first token rather than a raw prefix, so that e.g.
        # genus "Aster" does not also count "Asterella ..." leaves.
        num_members_w_missing = sum(
            1 for leaf_name in leaf_names_w_missing if leaf_name.split(" ", 1)[0] == genus
        )
        size_subtree_w_missing = len(leaf_names_w_missing)
        return root_w_missing.name, num_members_w_missing, size_subtree_w_missing
    except Exception as e:
        print(f"error from genus {genus}: {e}")
        return np.nan, np.nan, np.nan


In [6]:
# Recompute root name, genus member count and subtree size for every row in
# parallel; rows for which the lookup failed receive NaN in all three columns.
updated_columns = ["node", "num_members", "size_subtree"]
updated_values = partition_roots_w_missing.parallel_apply(
    get_updated_genus,
    axis=1,
    result_type="expand",
)
partition_roots_w_missing[updated_columns] = updated_values

# Persist the updated table (the DataFrame index is written as the first column).
partition_roots_w_missing.to_csv(partition_w_missing_roots_path)