In [1]:
import pandas as pd
import numpy as np
import os

import sys
sys.path.append("../../code/data_generation/")
import taxonomy

INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
networks_types = ["weighted", "binary"]
classification_db_path = "../PloiDB/name_resolution/final_name_resolution/taxonome/WFO/v.2022.07/classification.txt"
name_resolution_path = "../data/name_resolution/resolved_plant_names.csv"
output_path = "../data/features/taxonomic_features/pollinators_taxonomic_classification.csv"
network_level_output_path = "../data/features/taxonomic_features/net_to_pollinators_taxonomic_classification.csv"
itis_db_link = "https://www.itis.gov/downloads/itisSqlite.zip"

In [3]:
species_names_dfs = []
for networks_type in networks_types:
    networks_dir = f"../data/networks/all/{networks_type}/"
    for path in os.listdir(networks_dir):
        if path.endswith(".csv"):
            network_path = f"{networks_dir}{path}"
            network = pd.read_csv(network_path)
            network_species = pd.DataFrame(columns=["network_type", "network_id", "original_name"])
            network_species["original_name"] = network.set_index("Plant").columns.tolist()
            network_species["network_type"] = networks_type
            network_species["network_id"] = int(path.replace(".csv", ""))
            species_names_dfs.append(network_species)
taxonomic_features = pd.concat(species_names_dfs)
taxonomic_features = taxonomic_features.loc[~taxonomic_features.original_name.str.startswith('abundance"')]
taxonomic_features.original_name = taxonomic_features.original_name.str.lower()
taxonomic_features.original_name = taxonomic_features.original_name.apply(lambda name: name.replace("_", " "))

In [4]:
taxonomic_features = taxonomy.fill_missing_data_from_itis(input_df = taxonomic_features,
                                                input_col = "original_name",
                                                db_link = itis_db_link,
                                                db_dir = os.getcwd())

In [6]:
missing_data = taxonomic_features.loc[(taxonomic_features.taxon_rank.isna())]
complementary_data = taxonomy.fill_missing_data_from_ncbi(data=missing_data, search_by_col="original_name")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6734), Label(value='0 / 6734'))), …

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["ncbi_tax_id"] = data[search_by_col].parallel_apply(get_ncbi_tax_id)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["genus"].fillna(value=tax_id_to_genus, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["family"].fillna(value=tax_id_to_family, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6734), Label(value='0 / 6734'))), …

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.genus = data.parallel_apply(lambda record: record[search_by_col] if record.taxon_rank == "genus" else np.nan, axis=1)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6734), Label(value='0 / 6734'))), …

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.family = data.parallel_apply(lambda record: record[search_by_col] if record.taxon_rank == "family" else np.nan, axis=1)


In [10]:
taxonomic_features.set_index("original_name", inplace=True)
complementary_data.set_index(taxonomic_features.index.name, inplace=True)
for c in taxonomic_features.columns:
    if c in complementary_data.columns:
        taxonomic_features[c].fillna(value=complementary_data[c].to_dict(), inplace=True)
taxonomic_features.reset_index(inplace=True)
if "original_name_capitalized" in taxonomic_features.columns:
    taxonomic_features.drop("original_name_capitalized", axis=1, inplace=True)

In [13]:
taxonomic_features.to_csv(output_path, index=False)

In [4]:
missing_names = taxonomic_features.query("taxon_rank.isna()").original_name.unique().tolist()
print(f"# unresolved names = {len(missing_names):,}")

# unresolved names = 14,414


In [12]:
from pygbif import species

def extract_taxonomic_data(record):
    name = record.original_name
    rank, genus, family = np.nan, np.nan, np.nan
    # try:
    taxonomic_data = species.name_suggest(q=name)
    if len(taxonomic_data) == 0:
        return rank, genus, family
    taxonomic_data = taxonomic_data[0]
    rank = taxonomic_data["rank"].lower()
    if rank == "family":
        family = name
    elif "family" in taxonomic_data:
        family = taxonomic_data["family"].lower()
    if rank == "genus":
        genus = name
    elif "genus" in taxonomic_data:
        genus = taxonomic_data["genus"].lower()
    return rank, genus, family

missing = taxonomic_features.query("taxon_rank.isna()")
missing[["taxon_rank", "genus", "family"]] = missing[["original_name"]].parallel_apply(extract_taxonomic_data, axis=1, result_type="expand")
taxonomic_features.update(missing)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=6469), Label(value='0 / 6469'))), …

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing[["taxon_rank", "genus", "family"]] = missing[["original_name"]].parallel_apply(extract_taxonomic_data, axis=1, result_type="expand")


NameError: name 'taxonomic_feautres' is not defined

In [13]:
taxonomic_features.update(missing)

In [14]:
missing_names = taxonomic_features.query("taxon_rank.isna()").original_name.unique().tolist()
print(f"# unresolved names = {len(missing_names):,}")

# unresolved names = 8,325


In [15]:
taxonomic_features.to_csv(output_path, index=False)

In [9]:
num_net_witt_missing_rank = taxonomic_features.query("taxon_rank.isna()")[["network_type",	"network_id"]].drop_duplicates().shape[0]
num_net = taxonomic_features[["network_type",	"network_id"]].drop_duplicates().shape[0]
print(f"% networks with missing pollinator ranks = {np.round(num_net_witt_missing_rank/num_net*100)} ({num_net_witt_missing_rank:,}/{num_net:,})")

% networks with missing pollinator ranks = 86.0 (621/719)


In [10]:
net_to_pollinator_ranks = taxonomic_features.groupby(["network_type",	"network_id"])["taxon_rank"].apply(lambda ranks: list(ranks.dropna().unique())).reset_index()

In [11]:
net_to_pollinator_ranks["num_ranks"] = net_to_pollinator_ranks.taxon_rank.apply(lambda ranks: len(ranks))
net_to_pollinator_ranks.sort_values("num_ranks", ascending=False).query("num_ranks > 0").iloc[:719-621]

Unnamed: 0,network_type,network_id,taxon_rank,num_ranks
20,binary,20.0,"[genus, family, superfamily, species, order]",5
8,binary,8.0,"[genus, subfamily, family, order, species]",5
122,binary,122.0,"[species, subspecies, infraclass, genus]",4
23,binary,23.0,"[genus, order, family, superfamily]",4
21,binary,21.0,"[genus, species, family, order]",4
...,...,...,...,...
187,weighted,63.0,"[species, form]",2
595,weighted,471.0,"[species, subspecies]",2
189,weighted,65.0,"[species, genus]",2
190,weighted,66.0,"[species, genus]",2


In [None]:
# add after the full analysis with all networks analysos noly on networks with known taxonomy

In [30]:
def get_highest_taxonomic_rank(rank):
    sorted_taxonomic_ranks = ["variety", "form", "subspecies", "species", "subgenus", "genus", "subfamily", "family", "superfamily", 
                              "suborder", "order", "infraclass", "subdivision", "subclass", "class"]
    return sorted_taxonomic_ranks.index(rank)
net_to_pollinator_ranks["highest_taxon_rank"] = net_to_pollinator_ranks.taxon_rank.apply(lambda l: sorted(l, key=get_highest_taxonomic_rank)[-1] if len(l) > 0 else np.nan)

In [32]:
net_to_pollinator_ranks.to_csv(network_level_output_path, index=False)