In [1]:
import pandas as pd
from ete3 import Tree
from typing import List



In [2]:
# Root directory of the PloiDB data. It keeps its trailing slash, so every path
# below appends its relative part directly (no leading "/"), which avoids the
# doubled "//" separator that a second slash would produce.
ploidb_dir = "/groups/itay_mayrose/halabikeren/PloiDB/"

# Input files: the two tree files and the ccdb table.
tree_1_path = f"{ploidb_dir}trees/ALLMB.tre"
tree_2_path = f"{ploidb_dir}trees/ALLOTB.tre"
ccdb_data_path = f"{ploidb_dir}ccdb/all_data.csv"

# Output files: CSVs holding the unresolved names collected from each source.
tree_names_path = f"{ploidb_dir}name_resolution/trees_unresolved_names.csv"
tree_1_names_path = f"{ploidb_dir}name_resolution/ALLMB_tree_unresolved_names.csv"
tree_2_names_path = f"{ploidb_dir}name_resolution/ALLOTB_tree_unresolved_names.csv"
ccdb_names_path = f"{ploidb_dir}name_resolution/ccdb_unresolved_names.csv"
all_names_path = f"{ploidb_dir}name_resolution/all_unresolved_names.csv"

In [3]:
# extract unresolved names from the trees
def get_tree_names(tree_path) -> List[str]:
    """Return the formatted leaf names of the tree stored at `tree_path`.

    The tree is loaded with ete3's ``Tree(tree_path, format=1)``. Each leaf
    name then has its underscores replaced by spaces and is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).

    :param tree_path: tree location handed to ``Tree`` as-is
    :return: one formatted name per leaf, in the order ``get_leaf_names()``
        yields them; duplicates are not removed here
    """
    leaf_labels = Tree(tree_path, format=1).get_leaf_names()
    formatted_names = []
    for label in leaf_labels:
        formatted_names.append(label.replace("_", " ").capitalize())
    return formatted_names

# Collect the leaf names of each tree, de-duplicated the same way for both.
# sorted(set(...)) is used instead of list(set(...)) because the iteration
# order of a set of strings can change between kernel sessions, which would
# make the row order (and any index derived from it) differ from run to run.
tree_1_names = sorted(set(get_tree_names(tree_path=tree_1_path)))
tree_2_names = sorted(set(get_tree_names(tree_path=tree_2_path)))

# Union of the names found in either tree.
trees_names = sorted(set(tree_1_names) | set(tree_2_names))

In [4]:
# extract unresolved names from ccdb
# The table is read from the configured `ccdb_data_path` rather than a second
# hard-coded copy of the same location, so the path is defined in one place.
df = pd.read_csv(ccdb_data_path)

# Every value of the "original_name" column, duplicates included.
ccdb_names = df["original_name"].tolist()

In [5]:
def _save_unique_names(names, label, out_path):
    """Build a one-column table of unique names, report its size and save it.

    :param names: list of names (may contain duplicates)
    :param label: source label shown in the printed summary line
    :param out_path: CSV file the table is written to (without the index)
    :return: DataFrame with a single "species_name" column, duplicates dropped
    """
    unique_df = pd.DataFrame(pd.Series(names), columns=["species_name"]).drop_duplicates()
    print(f"# unresolved names from {label} = {unique_df.shape[0]:,}")
    unique_df.to_csv(out_path, index=False)
    return unique_df


# One table per source: both trees combined, each tree on its own, and ccdb.
tree_df = _save_unique_names(trees_names, "trees", tree_names_path)
tree_1_df = _save_unique_names(tree_1_names, "ALLMB", tree_1_names_path)
tree_2_df = _save_unique_names(tree_2_names, "ALLOTB", tree_2_names_path)
ccdb_df = _save_unique_names(ccdb_names, "ccdb", ccdb_names_path)

# Names from ccdb and from the trees stacked together, duplicates dropped.
all_names_df = pd.concat([ccdb_df, tree_df]).drop_duplicates()
print(f"# unresolved names from all = {all_names_df.shape[0]:,}")
all_names_df.to_csv(all_names_path, index=False)

# unresolved names from trees = 359,382
# unresolved names from ALLMB = 356,305
# unresolved names from ALLOTB = 353,185
# unresolved names from ccdb = 180,463
# unresolved names from all = 518,874


In [21]:
def _to_taxonome_format(names_df):
    """Expose the index of `names_df` as an "Id" column and rename "species_name" to "Name"."""
    return names_df.reset_index().rename(columns={"index": "Id", "species_name": "Name"})


taxonome_tree_df = _to_taxonome_format(tree_df)
taxonome_tree_df.to_csv("./resolved_names_different_methods/taxonome/unresolved_tree_names_taxonome_format.csv", index=False)

taxonome_tree_2_df = _to_taxonome_format(tree_2_df)
taxonome_tree_2_df.to_csv("./resolved_names_different_methods/taxonome/unresolved_ALLOTB_tree_names_taxonome_format.csv", index=False)

taxonome_ccdb_df = _to_taxonome_format(ccdb_df)
taxonome_ccdb_df.to_csv("./resolved_names_different_methods/taxonome/unresolved_ccdb_names_taxonome_format.csv", index=False)

# all_names_df is a concatenation of ccdb_df and tree_df that kept the index
# labels of both inputs, so the same label can appear on two different rows.
# Renumber the rows first; otherwise the "Id" column would hold duplicates.
taxonome_unresolved_names = _to_taxonome_format(all_names_df.reset_index(drop=True))
taxonome_unresolved_names.to_csv("./resolved_names_different_methods/taxonome/unresolved_names_taxonome_format.csv", index=False)