In [2]:
name_resolution_dir = "/groups/itay_mayrose/halabikeren/PloiDB/name_resolution/"
unresolved_names_path = f"{name_resolution_dir}unresolved_names.csv"
resolved_names_gnr_all_path = f"{name_resolution_dir}resolved_names_different_methods/gnr_all_resolved_names.csv"

resolved_names_gnr_wfo_path = f"{name_resolution_dir}resolved_names_different_methods/gnr_wfo_resolved_names.csv"
retried_gnr_wfo_resolved_names_path = f"{name_resolution_dir}resolved_names_different_methods/retried_gnr_wfo_resolved_names.csv"

resolved_names_r_wfo_path = f"{name_resolution_dir}resolved_names_different_methods/r_wfo_resolved_names.csv"
retried_r_wfo_resolved_names_path = f"{name_resolution_dir}resolved_names_different_methods/retried_r_wfo_resolved_names.csv"

resolved_names_tnrs_path = f"{name_resolution_dir}resolved_names_different_methods/tnrs_all_resolved_names.csv"

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
from typing import List
from Levenshtein import distance as lev

In [3]:
def get_taxonomic_classification(row: pd.Series, rank: str):
    if pd.isna(row.classification_path_ranks) or pd.isna(row.classification_path):
        return np.nan
    ordered_ranks = row.classification_path_ranks.lower().split("|")
    ordered_names = row.classification_path.lower().split("|")
    rank_index, rank_value = np.nan, np.nan
    try:
        rank_index = ordered_ranks.index(rank)
        rank_value = ordered_names[rank_index]
    except Exception as e:
        if rank == 'species':
            rank_value_components = ordered_names[-1].split(" ")
            rank_value = " ".join(rank_value_components[:2])
        else:
            pass
    return rank_value


def process_resolved_names(path: str) -> pd.DataFrame:
    resolved_names = pd.read_csv(path)
    resolved_names.dropna(subset=["query"], inplace=True)
    resolved_names["query"] = resolved_names["query"].str.lower()
    resolved_names.drop([col for col in resolved_names.columns if col.startswith("Unnamed")], inplace=True, axis=1)
    
    # corrent resolved names of synonim matches based on the parallel accepted names, if available
    if 'name_string' in resolved_names.columns:
        resolved_names["matched_name"].fillna(resolved_names["name_string"].to_dict(), inplace=True)
        name_string_to_matched_name = resolved_names.loc[resolved_names.orig_status == "ACCEPTED"].set_index("name_string")["matched_name"].to_dict()
        resolved_names["matched_name"] = resolved_names[["name_string", "matched_name"]].apply(lambda row: name_string_to_matched_name[row.matched_name] if row.matched_name in name_string_to_matched_name else row.matched_name, axis=1)
    resolved_names.matched_name = resolved_names.matched_name.str.lower()

    # extract the taxonomic rank of the resolved name
    resolved_names.taxon_rank = resolved_names.taxon_rank.str.lower()
    resolved_names.taxon_rank = resolved_names.taxon_rank.replace({'spec.': 'species', 'gen.': 'genus', 'f.': 'form', 'var.': 'variety'})
    if 'classification_path_ranks' in resolved_names.columns:
        resolved_names.classification_path_ranks = resolved_names.classification_path_ranks.apply(lambda x: x.replace('spec.','species').replace('gen.','genus').replace('f.', 'form').replace('var.', 'variety') if pd.notna(x) else np.nan)
        # collapse taxonomic levels lower than species to a species level, namely: 'subspecies', 'variety', 'subvariety', 'form'
        resolved_names.loc[resolved_names.taxon_rank.isin(['subspecies', 'variety', 'subvariety', 'form']), "matched_name"] = resolved_names.loc[resolved_names.taxon_rank.isin(['subspecies', 'variety', 'subvariety', 'form'])].apply(lambda row: get_taxonomic_classification(row, rank="species"), axis=1)
        resolved_names.loc[resolved_names.taxon_rank.isin(['subspecies', 'variety', 'subvariety', 'form']), "taxon_rank"] = "species"
        
    resolved_names.dropna(subset="matched_name", inplace=True)
    return resolved_names

  resolved_names = pd.read_csv(path)


In [None]:
unresolved_names = pd.read_csv(unresolved_names_path)
resolved_names_gnr_all = process_resolved_names(path=resolved_names_gnr_all_path)
resolved_names_gnr_wfo = process_resolved_names(path=resolved_names_gnr_wfo_path)
resolved_names_r_wfo = process_resolved_names(path=resolved_names_r_wfo_path)
resolved_names_tnrs = process_resolved_names(path=resolved_names_tnrs_path)

In [4]:
print(f"% resolved names by GNR(ALL) = {np.round(resolved_names_gnr_all.shape[0]/unresolved_names.shape[0]*100,2)}% ({resolved_names_gnr_all.shape[0]}/{unresolved_names.shape[0]})")
print(f"% resolved names by GNR(WFO) = {np.round(resolved_names_gnr_wfo.shape[0]/unresolved_names.shape[0]*100,2)}% ({resolved_names_gnr_wfo.shape[0]}/{unresolved_names.shape[0]})")
print(f"% resolved names by R(WFO) = {np.round(resolved_names_r_wfo.shape[0]/unresolved_names.shape[0]*100,2)}% ({resolved_names_r_wfo.shape[0]}/{unresolved_names.shape[0]})")
print(f"% resolved names by TNRS = {np.round(resolved_names_tnrs.shape[0]/unresolved_names.shape[0]*100,2)}% ({resolved_names_tnrs.shape[0]}/{unresolved_names.shape[0]})")

% resolved names by GNR(ALL) = 95.27% (493138/517641)
% resolved names by GNR(WFO) = 99.24% (513692/517641)
% resolved names by R(WFO) = 96.31% (498533/517641)
% resolved names by TNRS = 68.69% (355545/517641)


In [6]:
gnr_all_matches = resolved_names_gnr_all[["query", "matched_name", "taxon_rank", "name_sources", "orig_status"]].rename(columns={"matched_name": "gnr_all_matched_name", "taxon_rank": "gnr_all_taxon_rank", "name_sources": "gnr_all_name_sources", "orig_status": "gnr_all_orig_status"})
gnr_wfo_matches = resolved_names_gnr_wfo[["query", "matched_name", "taxon_rank", "name_sources", "orig_status"]].rename(columns={"matched_name": "gnr_wfo_matched_name", "taxon_rank": "gnr_wfo_taxon_rank", "name_sources": "gnr_wfo_name_sources", "orig_status": "gnr_wfo_orig_status"})
r_wfo_matches = resolved_names_r_wfo[["query", "matched_name", "taxon_rank", "name_sources", "orig_status"]].rename(columns={"matched_name": "r_wfo_matched_name", "taxon_rank": "r_wfo_taxon_rank", "name_sources": "r_wfo_name_sources", "orig_status": "r_wfo_orig_status"})
tnrs_matches = resolved_names_tnrs[["query", "matched_name", "taxon_rank", "name_sources", "orig_status"]].rename(columns={"matched_name": "tnrs_matched_name", "taxon_rank": "tnrs_taxon_rank", "name_sources": "tnrs_name_sources", "orig_status": "tnrs_orig_status"})

contradictions = gnr_all_matches.merge(gnr_wfo_matches, on="query", how="outer").merge(r_wfo_matches, on="query", how="outer").merge(tnrs_matches, on="query", how="outer")
contradictions = contradictions.loc[contradictions["query"].isin((unresolved_names['species_name'].unique()))]
contradictions.dropna(subset=["gnr_all_matched_name", "gnr_wfo_matched_name", "r_wfo_matched_name", "tnrs_matched_name"], how="any", inplace=True)

In [7]:
print(f"% names with contradicated matches = {np.round(len(contradictions['query'].unique())/unresolved_names.shape[0]*100, 2)}%")
print(f"% names with contradicated matches across GNR(WFO) and GNR(ALL) = {np.round(len(contradictions.loc[contradictions.gnr_all_matched_name != contradictions.gnr_wfo_matched_name]['query'].unique())/contradictions.shape[0]*100, 2)}%")
print(f"% names with contradicated matches across GNR(WFO) and R(WFO) = {np.round(len(contradictions.loc[contradictions.r_wfo_matched_name != contradictions.gnr_wfo_matched_name]['query'].unique())/contradictions.shape[0]*100, 2)}%")
print(f"% names with contradicated matches across GNR(ALL) and TNRS = {np.round(len(contradictions.loc[contradictions.gnr_all_matched_name != contradictions.tnrs_matched_name]['query'].unique())/contradictions.shape[0]*100, 2)}%")
print(f"% names with contradicated matches across GNR(ALL) and R(WFO) = {np.round(len(contradictions.loc[contradictions.gnr_all_matched_name != contradictions.r_wfo_matched_name]['query'].unique())/contradictions.shape[0]*100, 2)}%")
print(f"% names with contradicated matches across GNR(WFO) and TNRS = {np.round(len(contradictions.loc[contradictions.gnr_wfo_matched_name != contradictions.tnrs_matched_name]['query'].unique())/contradictions.shape[0]*100, 2)}%")
print(f"% names with contradicated matches across R(WFO) and TNRS = {np.round(len(contradictions.loc[contradictions.r_wfo_matched_name != contradictions.tnrs_matched_name]['query'].unique())/contradictions.shape[0]*100, 2)}%")

% names with contradicated matches = 63.69%
% names with contradicated matches across GNR(WFO) and GNR(ALL) = 18.79%
% names with contradicated matches across GNR(WFO) and R(WFO) = 9.85%
% names with contradicated matches across GNR(ALL) and TNRS = 8.0%
% names with contradicated matches across GNR(ALL) and R(WFO) = 20.65%
% names with contradicated matches across GNR(WFO) and TNRS = 20.81%
% names with contradicated matches across R(WFO) and TNRS = 20.48%


In [8]:
print(f"%genus matches reported by GNR(WFO) = {np.round(resolved_names_gnr_wfo.loc[resolved_names_gnr_wfo.taxon_rank == 'genus'].shape[0] / resolved_names_gnr_wfo.shape[0]*100, 2)}%")
print(f"%genus matches reported by GNR(ALL) = {np.round(resolved_names_gnr_all.loc[resolved_names_gnr_all.taxon_rank == 'genus'].shape[0] / resolved_names_gnr_all.shape[0]*100, 2)}%")
print(f"%genus matches reported by R(WFO) = {np.round(resolved_names_r_wfo.loc[resolved_names_r_wfo.taxon_rank == 'genus'].shape[0] / resolved_names_r_wfo.shape[0]*100, 2)}%")
print(f"%genus matches reported by TNRS = {np.round(resolved_names_tnrs.loc[resolved_names_tnrs.taxon_rank == 'genus'].shape[0] / resolved_names_tnrs.shape[0]*100, 2)}%\n")

print(f"%species matches reported by GNR(WFO) = {np.round(resolved_names_gnr_wfo.loc[resolved_names_gnr_wfo.taxon_rank == 'species'].shape[0] / resolved_names_gnr_wfo.shape[0]*100, 2)}%")
print(f"%species matches reported by GNR(ALL) = {np.round(resolved_names_gnr_all.loc[resolved_names_gnr_all.taxon_rank == 'species'].shape[0] / resolved_names_gnr_all.shape[0]*100, 2)}%")
print(f"%species matches reported by R(WFO) = {np.round(resolved_names_r_wfo.loc[resolved_names_r_wfo.taxon_rank == 'species'].shape[0] / resolved_names_r_wfo.shape[0]*100, 2)}%")
print(f"%species matches reported by TNRS = {np.round(resolved_names_tnrs.loc[resolved_names_tnrs.taxon_rank == 'species'].shape[0] / resolved_names_tnrs.shape[0]*100, 2)}%\n")

print(f"%missing ranks matches reported by GNR(WFO) = {np.round(resolved_names_gnr_wfo.loc[resolved_names_gnr_wfo.taxon_rank.isna()].shape[0] / resolved_names_gnr_wfo.shape[0]*100, 2)}%")
print(f"%missing ranks matches reported by GNR(ALL) = {np.round(resolved_names_gnr_all.loc[resolved_names_gnr_all.taxon_rank.isna()].shape[0] / resolved_names_gnr_all.shape[0]*100, 2)}%")
print(f"%missing ranks matches reported by R(WFO) = {np.round(resolved_names_r_wfo.loc[resolved_names_r_wfo.taxon_rank.isna()].shape[0] / resolved_names_r_wfo.shape[0]*100, 2)}%")
print(f"%missing ranks matches reported by TNRS = {np.round(resolved_names_tnrs.loc[resolved_names_tnrs.taxon_rank.isna()].shape[0] / resolved_names_tnrs.shape[0]*100, 2)}%")

%genus matches reported by GNR(WFO) = 3.16%
%genus matches reported by GNR(ALL) = 0.4%
%genus matches reported by R(WFO) = 0.09%
%genus matches reported by TNRS = 0.0%

%species matches reported by GNR(WFO) = 96.83%
%species matches reported by GNR(ALL) = 42.02%
%species matches reported by R(WFO) = 95.17%
%species matches reported by TNRS = 97.41%

%missing ranks matches reported by GNR(WFO) = 0.0%
%missing ranks matches reported by GNR(ALL) = 56.98%
%missing ranks matches reported by R(WFO) = 0.0%
%missing ranks matches reported by TNRS = 0.0%


In [9]:
def first_two_match(n1: str, n2: str):
    n1_components = n1.lower().split(" ")
    n2_components = n2.lower().split(" ")
    if len(n1_components) > 1 and len(n2_components) > 1:
        return n1_components[0] == n2_components[0] and n1_components[1] == n2_components[1]
    return n1_components[0] == n2_components[0]

def get_lev_dist(n1: str, n2: str):
    n1_components = " ".join(n1.lower().split(" ")[:2])
    n2_components = " ".join(n2.lower().split(" ")[:2])
    d = lev(n1_components, n2_components) / max([len(n1_components), len(n2_components)])
    return d

gnr_wfo_r_wfo_contradictions = contradictions.loc[contradictions.r_wfo_matched_name != contradictions.gnr_wfo_matched_name][["query", "r_wfo_matched_name", "r_wfo_taxon_rank", "r_wfo_orig_status", "gnr_wfo_matched_name", "gnr_wfo_taxon_rank", "gnr_wfo_orig_status"]]
gnr_wfo_r_wfo_contradictions["first_two_match"] =  gnr_wfo_r_wfo_contradictions.apply(lambda row: first_two_match(n1=row["r_wfo_matched_name"], n2=row["gnr_wfo_matched_name"]), axis=1)

gnr_wfo_r_wfo_contradictions["r_wfo_query_lev_dist"] = gnr_wfo_r_wfo_contradictions.apply(lambda row: get_lev_dist(n1=row["query"], n2=row["r_wfo_matched_name"]), axis=1)
gnr_wfo_r_wfo_contradictions["gnr_wfo_query_lev_dist"] = gnr_wfo_r_wfo_contradictions.apply(lambda row: get_lev_dist(n1=row["query"], n2=row["gnr_wfo_matched_name"]), axis=1)

print(f"mean lev dist between queries and GNR(WFO) matches = {np.round(np.mean(gnr_wfo_r_wfo_contradictions.gnr_wfo_query_lev_dist), 3)}")
print(f"mean lev dist between queries and R(WFO) matches = {np.round(np.mean(gnr_wfo_r_wfo_contradictions.r_wfo_query_lev_dist), 3)}")

mean lev dist between queries and GNR(WFO) matches = 0.329
mean lev dist between queries and R(WFO) matches = 0.329


In [12]:
print(f"R(WFO):")
print(f"% names that were originally synonyms = {np.round(resolved_names_r_wfo.loc[(resolved_names_r_wfo.orig_status == 'SYNONYM')].shape[0]/resolved_names_r_wfo.shape[0]*100, 2)}%")
print(f"% names that were originally accepted = {np.round(resolved_names_r_wfo.loc[resolved_names_r_wfo.orig_status == 'ACCEPTED'].shape[0]/resolved_names_r_wfo.shape[0]*100, 2)}%")

print(f"\nGNR(WFO):")
print(f"% names that were originally synonyms = {np.round(resolved_names_gnr_wfo.loc[(resolved_names_gnr_wfo.orig_status == 'SYNONYM')].shape[0]/resolved_names_gnr_wfo.shape[0]*100, 2)}%")
print(f"% names that were originally accepted = {np.round(resolved_names_gnr_wfo.loc[resolved_names_gnr_wfo.orig_status == 'ACCEPTED'].shape[0]/resolved_names_gnr_wfo.shape[0]*100, 2)}%")

print(f"\nGNR(ALL):")
print(f"% names that were originally synonyms = {np.round(resolved_names_gnr_all.loc[(resolved_names_gnr_all.orig_status == 'SYNONYM')].shape[0]/resolved_names_gnr_all.shape[0]*100, 2)}%")
print(f"% names that were originally accepted = {np.round(resolved_names_gnr_all.loc[resolved_names_gnr_all.orig_status == 'ACCEPTED'].shape[0]/resolved_names_gnr_all.shape[0]*100, 2)}%")

print(f"\nTNRS:")
print(f"% names that were originally synonyms = {np.round(resolved_names_tnrs.loc[(resolved_names_tnrs.orig_status == 'SYNONYM')].shape[0]/resolved_names_tnrs.shape[0]*100, 2)}%")
print(f"% names that were originally accepted = {np.round(resolved_names_tnrs.loc[resolved_names_tnrs.orig_status == 'ACCEPTED'].shape[0]/resolved_names_tnrs.shape[0]*100, 2)}%")

R(WFO):
% names that were originally synonyms = 0.0%
% names that were originally accepted = 95.41%

GNR(WFO):
% names that were originally synonyms = 19.77%
% names that were originally accepted = 80.23%

GNR(ALL):
% names that were originally synonyms = 7.56%
% names that were originally accepted = 92.44%

TNRS:
% names that were originally synonyms = 7.26%
% names that were originally accepted = 92.74%


In [18]:
# determined winnder - GNR(WFO)
df1 = resolved_names_gnr_wfo[["query", "matched_name", "classification_path_ranks", "classification_path"]]
df2 = process_resolved_names(retried_gnr_wfo_resolved_names_path)[["query", "matched_name", "classification_path_ranks", "classification_path"]]
resolved_names = pd.concat([df1, df2])

In [19]:
resolved_names["genus"] = resolved_names.apply(lambda row: get_taxonomic_classification(row, rank="genus"), axis=1)
resolved_names["family"] = resolved_names.apply(lambda row: get_taxonomic_classification(row, rank="family"), axis=1)

In [21]:
df3 = process_resolved_names(retried_r_wfo_resolved_names_path)[["query", "matched_name", "genus", "family"]]
resolved_names = pd.concat([resolved_names[["query", "matched_name", "genus", "family"]], df3.loc[(~df3['query'].isin(resolved_names['query'])) & (df3.matched_name.notna())]])

In [22]:
resolved_names.loc[resolved_names.family.isna(), "genus"] = resolved_names.loc[resolved_names.family.isna(), "matched_name"].apply(lambda name: name.split(" ")[0])

In [23]:
resolved_vs_unresolved = unresolved_names.merge(resolved_names, left_on="species_name", right_on="query", how="left")
missing_names = resolved_vs_unresolved.loc[resolved_vs_unresolved.matched_name.isna()].species_name.unique()
complementary_gnr_all_data = resolved_names_gnr_all.loc[resolved_names_gnr_all['query'].isin(missing_names)][["query", "matched_name", "classification_path_ranks", "classification_path"]]
complementary_gnr_all_data["genus"] = complementary_gnr_all_data.apply(lambda row: get_taxonomic_classification(row, rank="genus"), axis=1)
complementary_gnr_all_data["family"] = complementary_gnr_all_data.apply(lambda row: get_taxonomic_classification(row, rank="family"), axis=1)

In [27]:
resolved_names = pd.concat([resolved_names, complementary_gnr_all_data[["query", "matched_name", "genus", "family"]]])

In [28]:
# quick test
ccdb_unresolved_names = pd.read_csv("./ccdb_unresolved_names.csv")
trees_unresolved_names = pd.read_csv("./trees_unresolved_names.csv")
ccdb_data = pd.read_csv("/groups/itay_mayrose/halabikeren/PloiDB/ccdb/all_data.csv")

In [29]:
print(f"%unresolved ccdb names = {np.round(ccdb_unresolved_names.loc[~ccdb_unresolved_names.species_name.isin(resolved_names['query'].unique())].shape[0]/ccdb_unresolved_names.shape[0]*100,2)}% ({ccdb_unresolved_names.loc[~ccdb_unresolved_names.species_name.isin(resolved_names['query'].unique())].shape[0]} names)")
print(f"%unresolved trees names = {np.round(trees_unresolved_names.loc[~trees_unresolved_names.species_name.isin(resolved_names['query'].unique())].shape[0]/trees_unresolved_names.shape[0]*100,2)}% ({trees_unresolved_names.loc[~trees_unresolved_names.species_name.isin(resolved_names['query'].unique())].shape[0]} names)")
print(f"%unresvoled names in total = {np.round(unresolved_names.loc[~unresolved_names.species_name.isin(resolved_names['query'].unique())].shape[0]/unresolved_names.shape[0]*100,2)}% ({unresolved_names.loc[~unresolved_names.species_name.isin(resolved_names['query'].unique())].shape[0]} names)")

%unresolved ccdb names = 0.02% (83 names)
%unresolved trees names = 0.03% (118 names)
%unresvoled names in total = 0.04% (187 names)


In [30]:
print(f"% reported matched at genus level before fixing as per R(WFO) = {np.round(resolved_names.loc[~resolved_names.matched_name.str.contains(' ')].shape[0] / resolved_names.shape[0]*100, 2)}%")
genera_matches = resolved_names.loc[~resolved_names.matched_name.str.contains(" "), "query"].unique()
wfo_complementary_matches = resolved_names_r_wfo.loc[(resolved_names_r_wfo['query'].isin(genera_matches)) & (resolved_names_r_wfo.matched_name.notna())].set_index("query")["matched_name"].to_dict()
resolved_names.loc[~resolved_names.matched_name.str.contains(" "), "matched_name"] = resolved_names.loc[~resolved_names.matched_name.str.contains(" "), "query"].apply(lambda q: wfo_complementary_matches[q] if q in wfo_complementary_matches else q)
print(f"% reported matched at genus level after fixing per R(WFO) = {np.round(resolved_names.loc[~resolved_names.matched_name.str.contains(' ')].shape[0] / resolved_names.shape[0]*100, 2)}%")

% reported matched at genus level before fixing as per R(WFO) = 3.15%
% reported matched at genus level after fixing per R(WFO) = 0.07%


In [31]:
processed_resolved_names = resolved_names[["query", "matched_name", "genus", "family"]]

In [32]:
query_to_source = pd.read_csv("./unresolved_names_with_source.csv").set_index("species_name")["source"].to_dict()
processed_resolved_names["query_source"] = np.nan
processed_resolved_names.set_index("query", inplace=True)
processed_resolved_names["query_source"].fillna(value=query_to_source, inplace=True)
processed_resolved_names.reset_index(inplace=True)
processed_resolved_names.to_csv("./processed_resolved_names.csv")