In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199
pd.options.mode.chained_assignment = None
import numpy as np
import pickle

import matplotlib.pyplot as plt
plt.set_loglevel('WARNING')
from mycolorpy import colorlist as mcp

from tqdm import tqdm
tqdm.pandas()

import os
from ete3 import Tree
import random

import sys
sys.path.append("/groups/itay_mayrose/halabikeren/tmp/ploidb/")
from data_generation.taxonomy import add_taxonomic_data
from services.pbs_service import PBSService

sys.path.append("/groups/itay_mayrose/halabikeren/tmp/plant_pollinator_inter/data_processing/name_resolution")
from resolved_names_curator import ResolvedNamesCurator

import logging
logging.basicConfig(level=logging.INFO,format="%(asctime)s module: %(module)s function: %(funcName)s line %(lineno)d: %(message)s",handlers=[logging.StreamHandler(sys.stdout)],force=True)



INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Name-resolution switches: they select which pre-computed inputs are read
# and where the outputs of this notebook are written.
resolve_tree = True
resolve_ccdb = True # use taxonome name resolution

# Single label shared by every resolution-dependent file name below.
if resolve_tree:
    name_resolution_label = "ccdb_and_tree"
elif resolve_ccdb:
    name_resolution_label = "only_ccdb"
else:
    name_resolution_label = "none"

# Raw CCDB records (input) and the per-name aggregated table (output).
unresolved_ccdb_path = "/groups/itay_mayrose/halabikeren/PloiDB/ccdb/all_data.csv"
resolved_ccdb_path = f"/groups/itay_mayrose/halabikeren/PloiDB/ccdb/resolved_data_name_resolved_on_{name_resolution_label}.csv"

# Pre-computed name-resolution tables (inputs).
tree_resolved_names_path = f"/groups/itay_mayrose/halabikeren/PloiDB/name_resolution/processed_tree_resolved_names_name_resolution_on_{name_resolution_label}.csv"
ccdb_resolved_names_path = f"/groups/itay_mayrose/halabikeren/PloiDB/name_resolution/processed_ccdb_resolved_names_name_resolution_on_{name_resolution_label}.csv"
intersection_resolved_names_path = f"/groups/itay_mayrose/halabikeren/PloiDB/name_resolution/processed_intersection_resolved_names_{name_resolution_label}.csv"

tree_state_label = "resolved" if resolve_tree else "unresolved"
# NOTE(review): the ccdb segment of this directory name is "unresolved" for
# both values of resolve_ccdb. That looks like a copy-paste slip ("resolved"
# was probably intended when resolve_ccdb is True) -- confirm against the
# directory that actually exists on disk before changing it, because the
# plotting cell skips every genus whose directory is missing.
ccdb_state_label = "unresolved"
chromevol_input_dir = f"/groups/itay_mayrose/halabikeren/PloiDB/chromevol/by_genus_on_{tree_state_label}_tree_and_{ccdb_state_label}_ccdb/"

In [3]:
# Load the raw CCDB records and lower-case the original names so that they
# can be joined on "original_name" with the name-resolution table.
ccdb = pd.read_csv(unresolved_ccdb_path)
ccdb["original_name"] = ccdb["original_name"].str.lower()

In [4]:
# Load the pre-computed name-resolution tables.
ccdb_resolved_names = pd.read_csv(ccdb_resolved_names_path)
intersection_resolved_names = pd.read_csv(intersection_resolved_names_path)
# The tree-side table is only read when tree names were resolved, so
# `tree_resolved_names` is left undefined otherwise.
if resolve_tree:
    tree_resolved_names = pd.read_csv(tree_resolved_names_path)

In [5]:
def comb(parsed_n: pd.Series) -> list[int]:
    """Pool the comma-separated integers of several records into one sorted list.

    Each non-null entry of ``parsed_n`` is expected to be a string of
    comma-separated integers (spaces are ignored), e.g. ``"9, 18"``.

    :param parsed_n: series of comma-separated integer strings; null entries are skipped
    :return: all integers found across the entries, sorted ascending, duplicates kept
    :raises ValueError: if a non-empty token cannot be parsed as an integer
    """
    nums = []
    for record in parsed_n.dropna().values:
        for token in record.replace(" ", "").split(","):
            # Empty tokens come from empty strings or stray commas ("12,", "7,,9").
            # int("") would raise, so they are skipped explicitly.
            if token:
                nums.append(int(token))
    nums.sort()
    return nums

def get_single_val(records: pd.Series) -> str:
    """Return the first non-null value of ``records`` in lower case.

    :param records: series of strings, possibly containing nulls
    :return: the lower-cased first non-null value, or ``np.nan`` when every
        entry is null (or the series is empty)
    """
    present = records.dropna()
    if present.empty:
        return np.nan
    return present.iloc[0].lower()
        
# Attach the name-resolution columns to every CCDB record. The left merge can
# duplicate records, so only one row per record id ("Unnamed: 0") is kept.
ccdb_record_columns = ["Unnamed: 0", "original_name", "parsed_n", "genus", "family"]
name_resolution_columns = ["original_name", "matched_name", "corrected_matched_name", "resolved_name", "corrected_resolved_name"]
resolved_ccdb = (
    ccdb[ccdb_record_columns]
    .merge(ccdb_resolved_names[name_resolution_columns], on="original_name", how="left")
    .drop_duplicates("Unnamed: 0", keep="first")
)

has_parsed_n = resolved_ccdb.parsed_n.notna()
intersection_mapped_names = intersection_resolved_names.mapped_name

# Records whose corrected resolved name is one of the intersection names.
relevant_by_resolved_name = resolved_ccdb.loc[
    resolved_ccdb.corrected_resolved_name.isin(intersection_mapped_names) & has_parsed_n
].assign(mapped_name=lambda records: records["corrected_resolved_name"])

# Records whose corrected matched name is one of the intersection names.
relevant_matched_ccdb = resolved_ccdb.loc[
    resolved_ccdb.corrected_matched_name.isin(intersection_mapped_names) & has_parsed_n
].assign(mapped_name=lambda records: records["corrected_matched_name"])

# Resolved-name rows come first, so they win when a record qualifies through both names.
relevant_resolved_ccdb = pd.concat([relevant_by_resolved_name, relevant_matched_ccdb]).drop_duplicates("Unnamed: 0", keep="first")

In [6]:
def _join_unique(names: pd.Series) -> str:
    """Join the distinct values of ``names`` with ';' in order of first appearance."""
    return ";".join(names.unique().tolist())


def _lower_median(counts: list):
    """Median of a sorted list; for an even length the lower middle element is returned."""
    if len(counts) % 2 != 0:
        return np.median(counts)
    return counts[max(0, len(counts) // 2 - 1)]


intersected_resolved_names = set(intersection_resolved_names.mapped_name.tolist())
num_relevant_records = relevant_resolved_ccdb.shape[0]
num_intersected_names = len(intersected_resolved_names)
print(f"{num_relevant_records:,} records of {num_intersected_names:,} mapped names have chromosome number data")

# One row per mapped name: joined source names, pooled counts, and one
# representative value for each of the remaining columns.
per_name_aggregations = {
    "original_name": _join_unique,
    "matched_name": _join_unique,
    "parsed_n": comb,
    "corrected_matched_name": get_single_val,
    "corrected_resolved_name": get_single_val,
    "genus": get_single_val,
    "family": get_single_val,
}
ccdb_by_resolved_name = (
    relevant_resolved_ccdb.groupby("mapped_name")
    .agg(per_name_aggregations)
    .rename(columns={"original_name": "original_names", "matched_name": "matched_names"})
    .reset_index()
)

# Summary statistics over the pooled counts of each mapped name.
parsed_n_statistics = {
    "parsed_n_mean": np.mean,
    "parsed_n_median": _lower_median,
    "parsed_n_min": np.min,
    "parsed_n_max": np.max,
    "parsed_n_std": np.std,
}
for stat_column, stat_function in parsed_n_statistics.items():
    ccdb_by_resolved_name[stat_column] = ccdb_by_resolved_name["parsed_n"].apply(stat_function)

# Most variable names first.
ccdb_by_resolved_name = ccdb_by_resolved_name.sort_values("parsed_n_std", ascending=False)

276,025 records of 56,511 mapped names have chromosome number data


In [7]:
# Complete missing genus/family values, then persist the per-name table.
missing_taxonomy = ccdb_by_resolved_name.genus.isna() | ccdb_by_resolved_name.family.isna()
if resolve_ccdb and missing_taxonomy.any():
    # Names were resolved and some rows lack taxonomy: look it up by resolved name.
    ccdb_by_resolved_name = add_taxonomic_data(input_df=ccdb_by_resolved_name, input_col="corrected_resolved_name", itis_db_dir=os.path.dirname(resolved_ccdb_path))
else:
    # Fall back to the genus/family recorded in the CCDB records themselves,
    # keyed by mapped name (for repeated names the last record wins in to_dict()).
    records_by_mapped_name = relevant_resolved_ccdb.set_index("mapped_name")
    indexed_by_mapped_name = ccdb_by_resolved_name.set_index("mapped_name")
    for rank in ("genus", "family"):
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on a column selection: that chained in-place form is deprecated and
        # does not modify the frame under pandas Copy-on-Write.
        indexed_by_mapped_name[rank] = indexed_by_mapped_name[rank].fillna(value=records_by_mapped_name[rank].to_dict())
    ccdb_by_resolved_name = indexed_by_mapped_name.reset_index()
ccdb_by_resolved_name.to_csv(resolved_ccdb_path, index=False)

In [8]:
def _count_joined_names(joined_names: str) -> int:
    """Number of ';'-separated names in ``joined_names``."""
    return joined_names.count(";") + 1


# How many distinct source names were merged into each mapped name.
ccdb_by_resolved_name["num_original_names"] = ccdb_by_resolved_name["original_names"].apply(_count_joined_names)
ccdb_by_resolved_name["num_matched_names"] = ccdb_by_resolved_name["matched_names"].apply(_count_joined_names)

# Flag mapped names with at least one count above 200 as potential CCDB errors.
ccdb_by_resolved_name["parsed_n_larger_than_200"] = ccdb_by_resolved_name["parsed_n"].apply(
    lambda counts: any(count > 200 for count in counts)
)

ccdb_by_resolved_name = ccdb_by_resolved_name.sort_values(["parsed_n_std", "num_original_names"], ascending=[False, False])

# Write the flagged names to a CSV in the current working directory.
flagged_case_columns = ["mapped_name", "original_names", "num_matched_names", "parsed_n", "parsed_n_median", "parsed_n_std"]
flagged_cases = ccdb_by_resolved_name.loc[ccdb_by_resolved_name["parsed_n_larger_than_200"], flagged_case_columns]
flagged_cases.to_csv("flagged_cases_as_ccdb_potential_errors.csv", index=False)

In [9]:
# Preview the first flagged names (those with at least one count above 200).
flagged_preview_columns = ["mapped_name", "original_names", "num_matched_names", "parsed_n", "parsed_n_median", "parsed_n_std"]
ccdb_by_resolved_name.loc[ccdb_by_resolved_name["parsed_n_larger_than_200"], flagged_preview_columns].head()

Unnamed: 0,mapped_name,original_names,num_matched_names,parsed_n,parsed_n_median,parsed_n_std
0,croptilon rigidifolium,croptilon rigidifolius (e.. b. sm.) e.. b. sm.;croptilon rigidifolium,2,"[5, 5, 5, 505]",5.0,216.506351
1,hymenothrix dissecta,bahia dissecta (a. gray) britton;amauriopsis dissecta (a. gray) rydb.;bahia dissecta (gray) britton;bahia dissecta (a. gray) britt.,2,"[18, 18, 18, 18, 18, 18, 18, 361]",18.0,113.436587
2,euphorbia antiquorum,euphorbia antiquorum l.;euphorbia antiquorwn l.;euphorbia antiquorum linn.,2,"[30, 30, 30, 30, 30, 30, 350]",30.0,111.976674
3,graptopetalum pachyphyllum,graptopetalum pachyphyllum rose;graptopetalum pachyphyllum,1,"[30, 30, 30, 170, 175, 193, 208, 270, 270]",175.0,93.192089
4,pippenalia delphiniifolia,pippenalia delphiniifolia (rydb.) mcvaugh,1,"[60, 215]",60.0,77.5


In [10]:
# Long-format view for inspection: one row per individual count, so that a
# record with "9, 18" becomes two rows (9 and 18) sharing the same index label.
debug_data = (
    relevant_resolved_ccdb[["mapped_name", "original_name", "parsed_n", "genus"]]
    .assign(parsed_n=lambda records: records["parsed_n"].apply(lambda raw: raw.replace(" ", "").split(",")))
    .explode("parsed_n")
    .astype({"parsed_n": np.int16})
)
debug_data_by_genus = debug_data.groupby("genus")

In [11]:
# Genera whose records contain more than one distinct count.
genera = list(debug_data_by_genus.groups.keys())
variable_genera = [
    genus
    for genus in genera
    if len(debug_data_by_genus.get_group(genus).parsed_n.unique()) > 1
]

In [12]:
# Show the records of one variable genus (the second in the list) as an example.
example_variable_genus = variable_genera[1]
debug_data_by_genus.get_group(example_variable_genus)

Unnamed: 0,mapped_name,original_name,parsed_n,genus
136989,zabelia triflora,abelia triflora r. brown in wallich,9,Abelia
136990,zabelia triflora,abelia triflora r. brown in wallich,9,Abelia
136991,zabelia triflora,abelia triflora r. brown in wallich,9,Abelia
136992,zabelia triflora,abelia triflora r. brown in wallich,18,Abelia
137114,zabelia dielsii,abelia coreana nakai,18,Abelia
137115,zabelia dielsii,abelia coreana nakai,54,Abelia
137180,abelia grandiflora,abelia grandiflora l.,16,Abelia
137188,linnaea chinensis,abelia chinensis r. br.,16,Abelia
137189,linnaea chinensis,abelia chinensis r. br.,16,Abelia
137195,linnaea parvifolia,abelia parvifolia hemsl.,16,Abelia


In [13]:
# For every genus that has a chromevol input directory and at least
# MIN_MAPPED_NAMES_FOR_PLOT mapped names, save one figure with the overlaid
# count histograms of its mapped names. Existing figures are not regenerated.
MIN_MAPPED_NAMES_FOR_PLOT = 7
for genus in debug_data_by_genus.groups.keys():
    genus_dir = f"{chromevol_input_dir}{genus.lower()}/"
    fig_path = f"{genus_dir}counts_distribution.jpeg"
    # Cheap path checks first, so skipped genera are never fetched or sorted.
    if not os.path.exists(genus_dir) or os.path.exists(fig_path):
        continue

    genus_data = debug_data_by_genus.get_group(genus).sort_values(["parsed_n", "mapped_name"])
    num_mapped_names = len(genus_data.mapped_name.unique())
    if num_mapped_names < MIN_MAPPED_NAMES_FOR_PLOT:
        continue

    # Figure height grows with the number of mapped names (one legend entry each).
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20,num_mapped_names*0.25))
    print(f"# mapped name in genus {genus} = {num_mapped_names}")
    colors = tuple(mcp.gen_color(cmap="bwr",n=num_mapped_names))
    for (mapped_name, name_records), color in zip(genus_data.groupby('mapped_name'), colors):
        name_records['parsed_n'].hist(alpha=0.7, ax=ax, label=mapped_name, color=color)
    # Put a tick on every count value that occurs in the genus.
    xticks = sorted(genus_data.parsed_n.unique().tolist())
    ax.set_xticks(xticks)
    ax.legend(loc="lower right", bbox_to_anchor=(1.2,0))
    plt.tight_layout()
    fig.savefig(fig_path)
    # Release the figure: pyplot keeps every open figure alive, so without
    # this the loop accumulates one figure per genus in memory.
    plt.close(fig)