In [3]:
import pandas as pd
import numpy as np
import os

import sys
sys.path.append("../../code/data_processing/")
import taxonomy

Python-dotenv could not parse statement starting at line 4


INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [4]:
# Network representations to scan; each value is used as a sub-directory name
# under ../../data/networks/all/ by the loading cell.
networks_types = ["weighted", "binary"]
# NOTE(review): the next two paths are not referenced by any other cell visible in
# this notebook. name_resolution_path points at *plant* names although the outputs
# below are pollinator tables — confirm whether they are still needed.
classification_db_path = "../../data/name_resolution/v.1.12/classification.txt"
name_resolution_path = "../../data/name_resolution/resolved_plant_names.csv"
# Destination of the per-name taxonomy table (written by the to_csv cells).
output_path = "../../data/features/taxonomic_features/pollinators_taxonomic_classification.csv"
# Destination of the per-network rank summary (written by the last cell).
network_level_output_path = "../../data/features/taxonomic_features/net_to_pollinators_taxonomic_classification.csv"
# Download link passed to taxonomy.fill_missing_data_from_itis as db_link.
itis_db_link = "https://www.itis.gov/downloads/itisSqlite.zip"

In [6]:
# Collect the column names of every network CSV (everything except the "Plant"
# column), one row per (network_type, network_id, name).
species_names_dfs = []
for networks_type in networks_types:
    networks_dir = f"../../data/networks/all/{networks_type}/"
    # os.listdir returns entries in arbitrary order; sort for a reproducible row order.
    for path in sorted(os.listdir(networks_dir)):
        if not path.endswith(".csv"):
            continue
        network_path = f"{networks_dir}{path}"
        network = pd.read_csv(network_path)
        network_species = pd.DataFrame(
            {
                "network_type": networks_type,
                # file names are expected to be "<integer id>.csv"
                "network_id": int(path.replace(".csv", "")),
                "original_name": network.set_index("Plant").columns.tolist(),
            }
        )
        species_names_dfs.append(network_species)

# ignore_index: each per-network frame starts at 0, so a plain concat would leave
# duplicated index labels that break index-aligned operations later on.
taxonomic_features = pd.concat(species_names_dfs, ignore_index=True)

# Normalise names before filtering so the filter is case-insensitive.
taxonomic_features["original_name"] = (
    taxonomic_features["original_name"].str.lower().str.replace("_", " ", regex=False)
)

# Drop columns whose header starts with "abundance" (presumably bookkeeping columns,
# not taxa — confirm). The previous literal was 'abundance"' with a stray double quote,
# so a header would only have matched if it literally contained that quote character.
is_abundance_column = taxonomic_features["original_name"].str.startswith("abundance")
taxonomic_features = taxonomic_features.loc[~is_abundance_column].reset_index(drop=True)

In [8]:
# Run the collected names through taxonomy.fill_missing_data_from_itis, handing it the
# ITIS SQLite download link and the current working directory as db_dir.
# NOTE(review): the saved output of this cell is "IndexError: list index out of range";
# the cause is inside the taxonomy module and cannot be determined from this notebook.
taxonomic_features = taxonomy.fill_missing_data_from_itis(
    input_df=taxonomic_features,
    input_col="original_name",
    db_link=itis_db_link,
    db_dir=os.getcwd(),
)

IndexError: list index out of range

In [None]:
# Rows whose taxon_rank is still NaN are retried through the NCBI helper,
# searching by the original_name column.
missing_data = taxonomic_features[taxonomic_features["taxon_rank"].isna()]
complementary_data = taxonomy.fill_missing_data_from_ncbi(
    data=missing_data,
    search_by_col="original_name",
)

In [None]:
# Fill values that are still missing with the NCBI results, matching rows by original_name.
#
# Written without inplace=True on purpose:
#  * `frame[col].fillna(..., inplace=True)` operates on a temporary Series; pandas warns
#    about this chained-assignment pattern and, with Copy-on-Write, it never updates the
#    parent frame. Assigning the result back is the supported form.
#  * complementary_data is left untouched, so the cell can be re-run without failing on
#    an already-consumed "original_name" column.
ncbi_by_name = complementary_data.set_index("original_name")
taxonomic_features = taxonomic_features.set_index("original_name")
for c in taxonomic_features.columns:
    if c in ncbi_by_name.columns:
        # The dict maps name -> value; if a name occurs several times in the NCBI
        # table, the value from its last row is used.
        taxonomic_features[c] = taxonomic_features[c].fillna(value=ncbi_by_name[c].to_dict())
taxonomic_features = taxonomic_features.reset_index()
# Drop the helper column if it is present.
taxonomic_features = taxonomic_features.drop(columns="original_name_capitalized", errors="ignore")

In [None]:
# Checkpoint: write the per-name table (without the index) to output_path.
taxonomic_features.to_csv(path_or_buf=output_path, index=False)

In [None]:
# Distinct names that still have no taxon_rank.
missing_names = (
    taxonomic_features.loc[taxonomic_features["taxon_rank"].isna(), "original_name"]
    .unique()
    .tolist()
)
print(f"# unresolved names = {len(missing_names):,}")

In [None]:
from pygbif import species

def extract_taxonomic_data(record):
    """Look up ``record.original_name`` with ``species.name_suggest`` and summarise the first hit.

    Parameters
    ----------
    record
        Any object exposing an ``original_name`` attribute (e.g. a DataFrame row).

    Returns
    -------
    tuple
        ``(rank, genus, family)``. All three are NaN when the lookup returns nothing.
        ``rank`` is the lower-cased ``"rank"`` of the first suggestion. ``genus`` and
        ``family`` are the queried name itself when the hit is of that rank, otherwise
        the lower-cased value from the hit, or NaN if the hit lacks that key.

    Notes
    -----
    Only the first suggestion is consulted. Assumes the lookup returns a sequence of
    dict-like records — TODO confirm against the installed pygbif version.
    """
    query_name = record.original_name
    suggestions = species.name_suggest(q=query_name)
    if len(suggestions) == 0:
        return np.nan, np.nan, np.nan

    first_hit = suggestions[0]
    rank = first_hit["rank"].lower()

    resolved = {}
    # family first, then genus: same evaluation order as before
    for level in ("family", "genus"):
        if rank == level:
            resolved[level] = query_name
        elif level in first_hit:
            resolved[level] = first_hit[level].lower()
        else:
            resolved[level] = np.nan
    return rank, resolved["genus"], resolved["family"]

# Resolve names that still lack a rank through GBIF (extract_taxonomic_data) and write the
# results back into taxonomic_features.
gbif_columns = ["taxon_rank", "genus", "family"]
# .copy(): `missing` is modified below; writing into a query() result directly triggers
# pandas' SettingWithCopy warning and is not guaranteed to behave consistently.
missing = taxonomic_features.query("taxon_rank.isna()").copy()
if not missing.empty:  # with nothing to resolve, the expand/assign step below would fail
    # The table holds one row per (network, name), so the same name repeats; query each
    # distinct name once instead of once per row.
    names_to_resolve = missing[["original_name"]].drop_duplicates()
    gbif_hits = names_to_resolve.parallel_apply(extract_taxonomic_data, axis=1, result_type="expand")
    gbif_hits.columns = gbif_columns
    # apply keeps row order, so the names can be attached positionally as the lookup index
    gbif_hits.index = names_to_resolve["original_name"].to_numpy()
    for col in gbif_columns:
        missing[col] = missing["original_name"].map(gbif_hits[col])
    # Update only the three resolved columns so unrelated columns (ids, names) are not
    # rewritten by the index-aligned update. NaN results never overwrite existing values.
    taxonomic_features.update(missing[gbif_columns])

In [None]:
# NOTE(review): the preceding cell already ends with an update from `missing`; repeating
# it here changes nothing as long as `missing` has not been modified in between.
taxonomic_features.update(other=missing)

In [None]:
# Distinct names that are still without a taxon_rank at this point.
still_unranked = taxonomic_features["taxon_rank"].isna()
missing_names = taxonomic_features.loc[still_unranked, "original_name"].unique().tolist()
print(f"# unresolved names = {len(missing_names):,}")

In [None]:
# Overwrite the per-name table at output_path with the current state (index not written).
taxonomic_features.to_csv(path_or_buf=output_path, index=False)

In [None]:
# Share of networks, identified by (network_type, network_id), that contain at least
# one name without a taxon_rank.
num_net_witt_missing_rank = len(
    taxonomic_features.loc[
        taxonomic_features["taxon_rank"].isna(), ["network_type", "network_id"]
    ].drop_duplicates()
)
num_net = len(taxonomic_features[["network_type", "network_id"]].drop_duplicates())
print(f"% networks with missing pollinator ranks = {np.round(num_net_witt_missing_rank/num_net*100)} ({num_net_witt_missing_rank:,}/{num_net:,})")

In [None]:
# One row per (network_type, network_id); taxon_rank becomes the list of distinct
# non-null ranks observed in that network.
net_to_pollinator_ranks = (
    taxonomic_features
    .groupby(["network_type", "network_id"])["taxon_rank"]
    .apply(lambda rank_values: list(rank_values.dropna().unique()))
    .reset_index()
)

In [None]:
# Number of distinct ranks per network.
net_to_pollinator_ranks["num_ranks"] = net_to_pollinator_ranks["taxon_rank"].apply(len)
# Preview the networks with at least one known rank, most rank-diverse first.
# 719 - 621 = 98 rows; NOTE(review): the origin of these two counts is not shown here.
(
    net_to_pollinator_ranks
    .sort_values("num_ranks", ascending=False)
    .loc[lambda ranked: ranked["num_ranks"] > 0]
    .head(719 - 621)
)

# TODO: after the full analysis on all networks, add an analysis restricted to networks with known taxonomy

In [None]:
# Taxonomic ranks ordered from most specific to most inclusive. Botanical and zoological
# synonyms at the same level (e.g. division/phylum) sit next to each other.
_TAXONOMIC_RANKS_LOW_TO_HIGH = (
    "form", "subvariety", "variety", "subspecies", "species",
    "subgenus", "genus",
    "subtribe", "tribe", "subfamily", "family", "superfamily",
    "infraorder", "suborder", "order", "superorder",
    "infraclass", "subclass", "class", "superclass",
    "infradivision", "infraphylum", "subdivision", "subphylum",
    "division", "phylum", "superdivision", "superphylum",
    "infrakingdom", "subkingdom", "kingdom",
)
# Built once so each call is a dictionary lookup.
_TAXONOMIC_RANK_POSITION = {
    rank_name: position for position, rank_name in enumerate(_TAXONOMIC_RANKS_LOW_TO_HIGH)
}


def get_highest_taxonomic_rank(rank):
    """Return a sort key for ``rank``: larger means a more inclusive taxonomic rank.

    Parameters
    ----------
    rank
        Rank name such as ``"species"`` or ``"family"``. Matching ignores case and
        surrounding whitespace.

    Returns
    -------
    int
        Position of the rank in the low-to-high ordering. ``-1`` for a rank that is not
        in the ordering (e.g. ``"unranked"``) or for a non-string value, so such values
        sort below every known rank instead of raising ``ValueError``.

    Notes
    -----
    Only the relative order of the returned values is meaningful. ``form`` sorts below
    ``variety``, and ``subdivision`` above ``class``.
    """
    if not isinstance(rank, str):
        return -1
    return _TAXONOMIC_RANK_POSITION.get(rank.strip().lower(), -1)
def _pick_highest_rank(ranks_in_network):
    """Return the rank that sorts last under get_highest_taxonomic_rank, or NaN for an empty list."""
    if len(ranks_in_network) == 0:
        return np.nan
    return sorted(ranks_in_network, key=get_highest_taxonomic_rank)[-1]


net_to_pollinator_ranks["highest_taxon_rank"] = net_to_pollinator_ranks["taxon_rank"].apply(_pick_highest_rank)

In [None]:
# Write the per-network rank summary (index not written) to network_level_output_path.
net_to_pollinator_ranks.to_csv(path_or_buf=network_level_output_path, index=False)