In [None]:
import pandas as pd
import numpy as np
import pickle

from tqdm import tqdm
tqdm.pandas()

import os
from ete3 import Tree
import random

import sys
sys.path.append("/groups/itay_mayrose/halabikeren/tmp/ploidb/")
from data_generation.taxonomy import add_taxonomic_data
from data_processing.check_tree_monophyly import get_largest_monophyletic_group

In [None]:
# File locations used by this notebook (absolute cluster paths, except the pickle).
# Raw CCDB table; it is loaded with pd.read_csv in the next cell.
unresolved_ccdb_path = "/groups/itay_mayrose/halabikeren/PloiDB/ccdb/all_data.csv"
# Its directory is passed as `itis_db_dir` to add_taxonomic_data further down; the only
# visible write to this file (a to_csv call) is commented out.
resolved_ccdb_path = "/groups/itay_mayrose/halabikeren/PloiDB/ccdb/resolved_data.csv"
# Pickle that receives the set of original names; relative to the current working directory.
resolved_ccdb_orig_names_path = "./tnrs_origs_resolved.pkl"
# NOTE(review): the two taxonomy_filling_* paths are not referenced in the visible cells —
# presumably consumed by a later step; confirm they are still needed.
taxonomy_filling_script_path = "/groups/itay_mayrose/halabikeren/tmp/plant_pollinator_inter/get_taxonomic_data.py"
taxonomy_filling_log_path = "/groups/itay_mayrose/halabikeren/PloiDB/ccdb/taxonomy_filling.log"

In [None]:
# Load the raw CCDB table and lower-case the original names so that they can be
# joined with the (equally lower-cased) name-resolution tables.
ccdb = pd.read_csv(unresolved_ccdb_path).assign(
    original_name=lambda frame: frame["original_name"].str.lower()
)

In [None]:
# Name-resolution csv files. The CCDB and intersection tables are loaded through
# process_resolved_names in the next cell.
ccdb_resolved_names_path = "/groups/itay_mayrose/anatshafir1/ploidDB/rotl/ALLOTB/stats/resolved_name_mapping_ccdb.csv"
# NOTE(review): tree_resolved_names_path is not read in the visible cells — confirm it is used later.
tree_resolved_names_path = "/groups/itay_mayrose/anatshafir1/ploidDB/rotl/ALLOTB/stats/resolved_name_mapping_tree.csv"
intersection_resolved_names_path = "/groups/itay_mayrose/anatshafir1/ploidDB/rotl/ALLOTB/stats/resolved_name_mapping_intersection.csv"

In [None]:
def correct_name(name: str) -> str:
    """
    Reduce a resolved taxon name to its leading part.

    Everything from the first "(", "," or "." onward is discarded and all trailing
    whitespace is removed, e.g.
    "callicarpa macrophylla (species in kingdom archaeplastida)" -> "callicarpa macrophylla".

    :param name: resolved name; a non-string value (e.g. NaN for a missing name) is
                 returned unchanged
    :return: the truncated name without trailing whitespace
    """
    if not isinstance(name, str):
        # Missing values reach this function through Series.apply; pass them through
        # instead of failing on `.split`.
        return name
    for delimiter in ("(", ",", "."):
        name = name.split(delimiter)[0]
    # Strip every trailing whitespace character, not just a single space, so that the
    # result compares equal to the same name obtained from a differently spaced source.
    return name.rstrip()

def process_resolved_names(path: str) -> pd.DataFrame:
    """
    Load a name-resolution table from csv and normalise it.

    The "Unnamed: 0" column is dropped, "search_string" / "unique_name" are renamed to
    "corrected_original_name" / "resolved_name", the three name columns are lower-cased
    and a "corrected_resolved_name" column is derived with `correct_name`.

    :param path: path of the csv file to read
    :return: the rows ordered by "ott_id", restricted to the four name columns and "ott_id"
    """
    output_columns = [
        "original_name",
        "corrected_original_name",
        "resolved_name",
        "corrected_resolved_name",
        "ott_id",
    ]
    table = (
        pd.read_csv(path)
        .drop("Unnamed: 0", axis=1)
        .rename(columns={"search_string": "corrected_original_name", "unique_name": "resolved_name"})
    )
    for column in ("original_name", "corrected_original_name", "resolved_name"):
        table[column] = table[column].str.lower()
    table["corrected_resolved_name"] = table["resolved_name"].apply(correct_name)
    return table.sort_values("ott_id")[output_columns]

# Normalise the CCDB and the intersection name-resolution tables (CCDB first).
ccdb_resolved_names, intersection_resolved_names = (
    process_resolved_names(path=table_path)
    for table_path in (ccdb_resolved_names_path, intersection_resolved_names_path)
)

In [None]:
def comb(parsed_n: pd.Series) -> list[int]:
    """
    Pool the counts of several records into one sorted list.

    Each non-missing entry of ``parsed_n`` is a comma-separated string of integers
    (spaces are ignored), e.g. "12, 14".

    :param parsed_n: series of comma-separated count strings; missing entries are skipped
    :return: all counts found across the entries, ascending, duplicates kept
    :raises ValueError: if a token is neither empty, "nan", nor a valid integer literal
    """
    counts = []
    for entry in parsed_n.dropna().values:
        for token in entry.replace(" ", "").split(","):
            # Missing tokens (empty, e.g. after a trailing comma, or a literal "nan") have
            # to be skipped *before* int() is called: int() raises on them, and a
            # successfully converted int can never be NA.
            if token == "" or token.lower() == "nan":
                continue
            counts.append(int(token))
    counts.sort()
    return counts

def get_single_val(records: pd.Series) -> str:
    """
    Collapse a series of strings into one lower-cased value.

    :param records: series of strings; missing entries are ignored
    :return: NaN when no value is present, the lower-cased value when exactly one
             distinct value exists, otherwise the distinct values joined by ","
             (in order of first appearance) and lower-cased
    """
    distinct = records.dropna().unique().tolist()
    if not distinct:
        return np.nan
    merged = distinct[0] if len(distinct) == 1 else ",".join(distinct)
    return merged.lower()
        
# Attach the corrected resolved name to every CCDB record (left join: records whose
# original name has no entry in the resolution table are kept).
resolved_ccdb = pd.merge(
    ccdb[["original_name", "parsed_n"]],
    ccdb_resolved_names[["original_name", "corrected_resolved_name"]],
    on="original_name",
    how="left",
)
# Keep the records that have a chromosome count and whose corrected resolved name
# also appears in the intersection table.
relevant_resolved_ccdb = resolved_ccdb[
    resolved_ccdb["parsed_n"].notna()
    & resolved_ccdb["corrected_resolved_name"].isin(intersection_resolved_names["corrected_resolved_name"])
]
# Persist the set of original names of all merged records.
with open(resolved_ccdb_orig_names_path, "wb") as out:
    pickle.dump(set(resolved_ccdb["original_name"].tolist()), out)

# Coverage summary: how many names of the intersection table lack chromosome-number data.
# Both sets must hold *corrected* resolved names: the records in `relevant_resolved_ccdb`
# were selected by matching on `corrected_resolved_name`, so comparing them with raw
# `resolved_name` values (which can carry suffixes such as
# "(species in kingdom archaeplastida)") would count covered names as missing.
intersected_resolved_names = set(intersection_resolved_names.corrected_resolved_name.tolist())
resolved_names_with_cn = set(relevant_resolved_ccdb.corrected_resolved_name.tolist())
resolved_names_without_cn = list(intersected_resolved_names - resolved_names_with_cn)
print(f"{len(resolved_names_without_cn):,} names our of {len(intersected_resolved_names):,} have no chromosome number data, leaving us with {len(resolved_names_with_cn):,} names")

# Collapse the records to one row per corrected resolved name: collect the distinct
# original names and pool all counts (via `comb`) into one sorted list.
ccdb_by_resolved_name = (
    relevant_resolved_ccdb
    .groupby("corrected_resolved_name")
    .agg({"original_name": lambda names: names.unique().tolist(), "parsed_n": comb})
    # The aggregation yields only "original_name" and "parsed_n", so "original_name" is
    # the only column there is to rename at this point.
    .rename(columns={"original_name": "original_names"})
    .reset_index()
)

# Summary statistics of the pooled counts, added in this column order.
# np.std uses its default ddof=0 (population standard deviation).
for stat_name, stat_func in (
    ("mean", np.mean),
    ("median", np.median),
    ("min", np.min),
    ("max", np.max),
    ("std", np.std),
):
    ccdb_by_resolved_name[f"parsed_n_{stat_name}"] = ccdb_by_resolved_name["parsed_n"].apply(stat_func)

# Names with the most variable counts first.
ccdb_by_resolved_name = ccdb_by_resolved_name.sort_values("parsed_n_std", ascending=False)

In [None]:
# Sanity check: is this raw (uncorrected) name present among the `resolved_name`
# values of the intersection table?
"callicarpa macrophylla (species in kingdom archaeplastida)" in set(intersection_resolved_names.resolved_name)

In [10]:
import logging

# Route INFO-and-above log records to stdout so that they show up in the cell output;
# force=True replaces any handlers already attached to the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s module: %(module)s function: %(funcName)s line %(lineno)d: %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)

In [None]:
# Annotate the per-name table through the project helper `add_taxonomic_data`
# (imported from data_generation.taxonomy). The directory that contains
# `resolved_ccdb_path` is passed as `itis_db_dir`.
# NOTE(review): the aggregation cell above builds `ccdb_by_resolved_name` with a
# "corrected_resolved_name" column and creates no "resolved_name" column — confirm that
# `input_col="resolved_name"` is what the helper should receive on a fresh kernel
# (the captured log below reports 0.0% names covered).
# NOTE(review): the captured output repeats Biopython's "Email address is not specified"
# warning — presumably the helper queries NCBI Entrez; verify that Entrez.email is set there.
ccdb_by_resolved_name = add_taxonomic_data(input_df=ccdb_by_resolved_name, input_col="resolved_name", itis_db_dir=os.path.dirname(resolved_ccdb_path))
# ccdb_by_resolved_name.to_csv(resolved_ccdb_path, index=False)

2022-06-09 19:22:32,163 module: taxonomy function: get_taxonomic_data line 40: % names covered by db taxonomic data = 0.0%


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11841), Label(value='0 / 11841')))…

Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.
Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to spe