In [1]:
import pandas as pd
import numpy as np
import os

from Bio import Entrez
Entrez.email = "halabikeren@gmail.com"

In [2]:
networks_types = ["weighted", "binary"]
classification_db_path = "../PloiDB/name_resolution/final_name_resolution/taxonome/WFO/v.2022.07/classification.txt"
name_resolution_path = "../data/name_resolution/resolved_plant_names.csv"
networks_metadata_path = "../data/networks/all/networks_metadata.csv"
taxonomic_features_path = "../data/features/taxonomic_features/taxonomic_features.csv"

In [3]:
species_names_dfs = []
for networks_type in networks_types:
    networks_dir = f"../data/networks/all/{networks_type}/"
    for path in os.listdir(networks_dir):
        if path.endswith(".csv"):
            network_path = f"{networks_dir}{path}"
            network = pd.read_csv(network_path)
            network_species = pd.DataFrame(columns=["network_type", "network_id", "original_name"])
            network_species["original_name"] = network.Plant.tolist()
            network_species["network_type"] = networks_type
            network_species["network_id"] = int(path.replace(".csv", ""))
            species_names_dfs.append(network_species)
taxonomic_features = pd.concat(species_names_dfs)
taxonomic_features = taxonomic_features.loc[~taxonomic_features.original_name.str.startswith('abundance"')]
taxonomic_features.original_name = taxonomic_features.original_name.str.lower()
taxonomic_features.original_name = taxonomic_features.original_name.apply(lambda name: name.replace("_", " "))

In [4]:
name_resolution = pd.read_csv(name_resolution_path)
name_resolution.original_name = name_resolution.original_name.str.lower()
name_resolution.resolved_name = name_resolution.resolved_name.str.lower()

taxonomic_features["resolved_name"] = np.nan
taxonomic_features.set_index("original_name", inplace=True)
taxonomic_features.resolved_name.fillna(value=name_resolution.set_index("original_name")["resolved_name"].to_dict(), inplace=True)
taxonomic_features.reset_index(inplace=True)

In [5]:
classification_data = pd.read_csv(classification_db_path, sep="\t")
classification_data.scientificName = classification_data.scientificName.str.lower()

# join by original name to scientificName
taxonomic_features.set_index("original_name", inplace=True)
classification_data.set_index("scientificName", inplace=True)
taxonomic_features = taxonomic_features.join(classification_data)

cols_to_keep = ["network_type", "network_id", "original_name", "resolved_name", "taxonRank", "genus", "family"]
cols_to_drop = [c for c in taxonomic_features.columns if c not in cols_to_keep]
taxonomic_features.drop(cols_to_drop, axis=1, inplace=True)
taxonomic_features.reset_index(inplace=True)
taxonomic_features.rename(columns={"taxonRank": "taxonomic_rank", "index": "original_name"}, inplace=True)

  classification_data = pd.read_csv("/groups/itay_mayrose/halabikeren/PloiDB/name_resolution/final_name_resolution/taxonome/WFO/v.2022.07/classification.txt", sep="\t")


In [6]:
def get_genus(name):
    if len(name) == 1:
        return np.nan
    components = name.split(" ")
    if "." in components[0]:
        return np.nan
    return components[0]

taxonomic_features.loc[taxonomic_features.taxonomic_rank == "GENUS", "genus"] = taxonomic_features.loc[taxonomic_features.taxonomic_rank == "GENUS", "original_name"]
taxonomic_features.loc[taxonomic_features.taxonomic_rank == "FAMILY", "family"] = taxonomic_features.loc[taxonomic_features.taxonomic_rank == "FAMILY", "original_name"]
taxonomic_features.loc[(taxonomic_features.genus.isna()) & (taxonomic_features.resolved_name.notna()), "genus"] = taxonomic_features.loc[(taxonomic_features.genus.isna()) & (taxonomic_features.resolved_name.notna()), "resolved_name"].apply(lambda name: name.split(" ")[0])
taxonomic_features.loc[taxonomic_features.genus.isna(), "genus"] = taxonomic_features.loc[taxonomic_features.genus.isna(), "original_name"].apply(get_genus)

genus_to_family = taxonomic_features[["genus", "family"]].drop_duplicates().dropna().set_index("genus")["family"].to_dict()
taxonomic_features.set_index("genus", inplace=True)
taxonomic_features.family.fillna(value=genus_to_family, inplace=True)
taxonomic_features.reset_index(inplace=True)                              

In [7]:
taxonomic_features.isna().sum()

genus              220
original_name        0
network_type         0
network_id           0
resolved_name     3629
taxonomic_rank    2088
family            1149
dtype: int64

In [9]:
# fill missing genera data using Entrez
missing_genera = taxonomic_features.loc[taxonomic_features.family.isna()].genus.dropna().unique().tolist()
genus_to_id = dict()
for genus in missing_genera:
    try:
        genus_to_id[genus] = Entrez.read(Entrez.esearch(term=genus, db="taxonomy", retmode="xml"))["IdList"][0]
    except Exception as e:
        print(f"could not find id for genus {genus} due to error {e}")

could not find id for genus abundance" due to error list index out of range
could not find id for genus adehis due to error list index out of range
could not find id for genus adhatoda due to error list index out of range
could not find id for genus agoceris due to error list index out of range
could not find id for genus alium due to error list index out of range
could not find id for genus allibertia due to error list index out of range
could not find id for genus appolonias due to error list index out of range
could not find id for genus armcae due to error list index out of range
could not find id for genus as due to error list index out of range
could not find id for genus ascerates due to error list index out of range
could not find id for genus befaria due to error list index out of range
could not find id for genus benthamida due to error list index out of range
could not find id for genus besella due to error list index out of range
could not find id for genus besmoscelis due 

In [10]:
def get_family(genus_id):
    d = list(Entrez.parse(Entrez.efetch(id=genus_id, db="taxonomy", retmode="xml")))[0]
    family = [item for item in d["LineageEx"] if item["Rank"] == "family"][0]["ScientificName"]
    return family

failed_genera = []
genus_to_family = dict()
for genus in genus_to_id:
    try:
        genus_to_family[genus] = get_family(genus_to_id[genus])
    except Exception as e:
        print(f"failed to get family for genus {genus} due to error {e}")
        failed_genera.append(genus)
# all appear to be pollinator families coming from the unreversed network binary/31

failed to get family for genus chalcidoidea due to error list index out of range
failed to get family for genus chironomidae due to error list index out of range
failed to get family for genus cleridae due to error list index out of range
failed to get family for genus crabronidae due to error list index out of range
failed to get family for genus curculionidae due to error list index out of range
failed to get family for genus geocoridae due to error list index out of range
failed to get family for genus magnoliidae due to error list index out of range
failed to get family for genus milichiidae due to error list index out of range
failed to get family for genus mordellidae due to error list index out of range
failed to get family for genus phalacridae due to error list index out of range
failed to get family for genus pompilidae due to error list index out of range
failed to get family for genus tachinidae due to error list index out of range
failed to get family for genus unidentifie

In [11]:
taxonomic_features.set_index("genus", inplace=True)
taxonomic_features.family.fillna(genus_to_family, inplace=True)
taxonomic_features.reset_index(inplace=True)
taxonomic_features.isna().sum()

genus              220
original_name        0
network_type         0
network_id           0
resolved_name     3629
taxonomic_rank    2088
family             401
dtype: int64

In [12]:
taxonomic_features.to_csv("./taxonomic_features.csv", index=False)

In [79]:
binary_plant_features_path = "../data/features/plant_features/binary/plant_features_with_classification.csv"
weighted_plant_features_path = "../data/features/plant_features/weighted/plant_features_with_classification.csv"

binary_plant_features = pd.read_csv(binary_plant_features_path)
weighted_plant_features = pd.read_csv(weighted_plant_features_path)

In [80]:
name_to_genus = taxonomic_features.set_index("original_name")["genus"].to_dict()
name_to_family = taxonomic_features.set_index("original_name")["family"].to_dict()

In [81]:
taxonomic_features.set_index("original_name", inplace=True)
binary_plant_features.set_index("original_name", inplace=True)
weighted_plant_features.set_index("original_name", inplace=True)

binary_plant_features[["genus", "family"]] = np.nan
binary_plant_features["genus"].fillna(name_to_genus, inplace=True)
binary_plant_features["family"].fillna(name_to_family, inplace=True)

weighted_plant_features[["genus", "family"]] = np.nan
weighted_plant_features["genus"].fillna(name_to_genus, inplace=True)
weighted_plant_features["family"].fillna(name_to_family, inplace=True)

taxonomic_features.reset_index(inplace=True)
binary_plant_features.reset_index(inplace=True)
weighted_plant_features.reset_index(inplace=True)

In [86]:
binary_plant_features.to_csv(binary_plant_features_path)
weighted_plant_features.to_csv(weighted_plant_features_path)

In [2]:
taxonomic_features = pd.read_csv(taxonomic_features_path)
taxonomic_features.loc[taxonomic_features.genus == "apidae"]

Unnamed: 0,genus,original_name,network_type,network_id,resolved_name,taxonomic_rank,family
1070,apidae,apidae,binary,31,,,


In [5]:
taxonomic_features.loc[(taxonomic_features.network_id == 31) & (taxonomic_features.network_type == "binary")]

Unnamed: 0,genus,original_name,network_type,network_id,resolved_name,taxonomic_rank,family
533,alaus,alaus,binary,31,,,Elateridae
697,amata,amata,binary,31,,,Erebidae
698,amblypodia,amblypodia,binary,31,,,Lycaenidae
702,amegilla,amegilla,binary,31,,,Apidae
1070,apidae,apidae,binary,31,,,
...,...,...,...,...,...,...,...
15397,thyreus,thyreus,binary,31,,,Apidae
15850,trigona,trigona,binary,31,,,Apidae
15919,trogaspidia,trogaspidia,binary,31,,,Mutillidae
16238,vespa,vespa,binary,31,,,Vespidae


In [23]:
networks_with_missing_taxonomy_data = taxonomic_features.groupby(["network_type", "network_id"])[["taxonomic_rank", "resolved_name"]].agg({"taxonomic_rank": lambda fams: 1 if len(fams.dropna()) > 0 else 0,
                                                                                                                                           "resolved_name": lambda fams: 1 if len(fams.dropna()) > 0 else 0}).reset_index()

In [52]:
# reversed:
# binary: 31, 

# marine: 
# binary: 46, 48, 50, 51, 54, 55, 56, 57, 61, 62, 63, 64, 66, 70, 71, 72, 73

# in codes:
# weighted: 106, 107, 156, 157, 158, 159

In [65]:
poll_names_dfs = []
for networks_type in networks_types:
    networks_dir = f"/groups/itay_mayrose/halabikeren/plant_pollinator_networks/networks/all/{networks_type}/"
    for path in os.listdir(networks_dir):
        if path.endswith(".csv"):
            network_path = f"{networks_dir}{path}"
            network = pd.read_csv(network_path)
            network_species = pd.DataFrame(columns=["network_type", "network_id", "original_name"])
            network_species["original_name"] = [p for p in network.columns.tolist() if p != "Plant"]
            network_species["network_type"] = networks_type
            network_species["network_id"] = int(path.replace(".csv", ""))
            poll_names_dfs.append(network_species)
poll_names = pd.concat(poll_names_dfs)
poll_names["processed_name"] = poll_names.original_name.apply(lambda name: np.nan if "unidentified" in name.lower() else name.replace("  sp.",""))

In [67]:
species_names = poll_names.query("processed_name.str.contains(' ', na=False)")
genus_names = poll_names.query("not processed_name.str.contains(' ', na=False)")
unidetified_names = poll_names.query("not processed_name.isna()")

In [69]:
print(f"# pollinator documented as species = {species_names.shape[0]:,}")
print(f"# pollinator documented as genera = {genus_names.shape[0]:,}")
print(f"# pollinator documented vaguely = {unidetified_names.shape[0]:,}")

# pollinator documented as species = 24,741
# pollinator documented as genera = 9,017
# pollinator documented vaguely = 31,169


In [3]:
net_metadata = pd.read_csv(networks_metadata_path)

In [5]:
taxonomic_data = pd.read_csv(taxonomic_features_path)

In [7]:
taxonomic_data.query("taxonomic_rank.isna()")

Unnamed: 0,genus,original_name,network_type,network_id,resolved_name,taxonomic_rank,family
0,,1,binary,36,,,
1,,2,binary,36,,,
2,,a. sphaerocephalon,weighted,8,,,
3,,a. sphaerocephalon,weighted,9,,,
5,abelia,abelia grandiflora,weighted,518,abelia grandiflora,,Caprifoliaceae
...,...,...,...,...,...,...,...
16598,zizyphus,zizyphus,binary,15,,,Rhamnaceae
16601,,zuc.pun,weighted,156,,,
16602,,zuc.pun,weighted,157,,,
16603,,zuc.pun,weighted,158,,,
