In [6]:
import pandas as pd
import numpy as np
import os

from Bio import Entrez
Entrez.email = "halabikeren@gmail.com"

In [8]:
networks_types = ["weighted", "binary"]
classification_db_path = "../../data/name_resolution/wfo_20230107/classification.txt"
name_resolution_path = "../../data/name_resolution/resolved_plant_names.csv"
networks_metadata_path = "../../data/networks/all/networks_metadata.csv"
taxonomic_features_path = "../../data/features/taxonomic_features/taxonomic_features.csv"
all_networks_dir = f"../../data/networks/all/"

In [9]:
species_names_dfs = []
for networks_type in networks_types:
    networks_dir = f"{all_networks_dir}/{networks_type}/"
    for path in os.listdir(networks_dir):
        if path.endswith(".csv"):
            network_path = f"{networks_dir}{path}"
            network = pd.read_csv(network_path)
            network_species = pd.DataFrame(columns=["network_type", "network_id", "original_name"])
            network_species["original_name"] = network.Plant.tolist()
            network_species["network_type"] = networks_type
            network_species["network_id"] = int(path.replace(".csv", ""))
            species_names_dfs.append(network_species)
taxonomic_features = pd.concat(species_names_dfs)
taxonomic_features = taxonomic_features.loc[~taxonomic_features.original_name.str.startswith('abundance"')]
taxonomic_features.original_name = taxonomic_features.original_name.str.lower()
taxonomic_features.original_name = taxonomic_features.original_name.apply(lambda name: name.replace("_", " "))

In [10]:
name_resolution = pd.read_csv(name_resolution_path)
name_resolution.original_name = name_resolution.original_name.str.lower()
name_resolution.resolved_name = name_resolution.resolved_name.str.lower()

taxonomic_features["resolved_name"] = np.nan
taxonomic_features.set_index("original_name", inplace=True)
taxonomic_features.resolved_name.fillna(value=name_resolution.set_index("original_name")["resolved_name"].to_dict(), inplace=True)
taxonomic_features.reset_index(inplace=True)

In [11]:
classification_data = pd.read_csv(classification_db_path, sep="\t")
classification_data.scientificName = classification_data.scientificName.str.lower()

# join by original name to scientificName
taxonomic_features.set_index("original_name", inplace=True)
classification_data.set_index("scientificName", inplace=True)
taxonomic_features = taxonomic_features.join(classification_data)

cols_to_keep = ["network_type", "network_id", "original_name", "resolved_name", "taxonRank", "genus", "family"]
cols_to_drop = [c for c in taxonomic_features.columns if c not in cols_to_keep]
taxonomic_features.drop(cols_to_drop, axis=1, inplace=True)
taxonomic_features.reset_index(inplace=True)
taxonomic_features.rename(columns={"taxonRank": "taxonomic_rank", "index": "original_name"}, inplace=True)

  classification_data = pd.read_csv(classification_db_path, sep="\t")


In [12]:
def get_genus(name):
    if len(name) == 1:
        return np.nan
    components = name.split(" ")
    if "." in components[0]:
        return np.nan
    return components[0]

taxonomic_features.loc[taxonomic_features.taxonomic_rank == "GENUS", "genus"] = taxonomic_features.loc[taxonomic_features.taxonomic_rank == "GENUS", "original_name"]
taxonomic_features.loc[taxonomic_features.taxonomic_rank == "FAMILY", "family"] = taxonomic_features.loc[taxonomic_features.taxonomic_rank == "FAMILY", "original_name"]
taxonomic_features.loc[(taxonomic_features.genus.isna()) & (taxonomic_features.resolved_name.notna()), "genus"] = taxonomic_features.loc[(taxonomic_features.genus.isna()) & (taxonomic_features.resolved_name.notna()), "resolved_name"].apply(lambda name: name.split(" ")[0])
taxonomic_features.loc[taxonomic_features.genus.isna(), "genus"] = taxonomic_features.loc[taxonomic_features.genus.isna(), "original_name"].apply(get_genus)

genus_to_family = taxonomic_features[["genus", "family"]].drop_duplicates().dropna().set_index("genus")["family"].to_dict()
taxonomic_features.set_index("genus", inplace=True)
taxonomic_features.family.fillna(value=genus_to_family, inplace=True)
taxonomic_features.reset_index(inplace=True)                              

In [13]:
taxonomic_features.notna().sum() / taxonomic_features.shape[0]

genus             0.986812
original_name     1.000000
network_type      1.000000
network_id        1.000000
resolved_name     0.797926
taxonomic_rank    0.848040
family            0.900731
dtype: float64

In [14]:
# fill missing genera data using Entrez
missing_genera = taxonomic_features.loc[taxonomic_features.family.isna()].genus.dropna().unique().tolist()
genus_to_id = dict()
for genus in missing_genera:
    try:
        genus_to_id[genus] = Entrez.read(Entrez.esearch(term=genus, db="taxonomy", retmode="xml"))["IdList"][0]
    except Exception as e:
        print(f"could not find id for genus {genus} due to error {e}")

could not find id for genus abundance" due to error list index out of range
could not find id for genus actea due to error list index out of range
could not find id for genus adehis due to error list index out of range
could not find id for genus alium due to error list index out of range
could not find id for genus allibertia due to error list index out of range
could not find id for genus amoebophyllum due to error list index out of range
could not find id for genus ampetopsis due to error list index out of range
could not find id for genus appolonias due to error list index out of range
could not find id for genus aragalus due to error list index out of range
could not find id for genus armcae due to error list index out of range
could not find id for genus as due to error list index out of range
could not find id for genus ascerates due to error list index out of range
could not find id for genus baeckia due to error list index out of range
could not find id for genus befaria due t

In [15]:
def get_family(genus_id):
    d = list(Entrez.parse(Entrez.efetch(id=genus_id, db="taxonomy", retmode="xml")))[0]
    family = [item for item in d["LineageEx"] if item["Rank"] == "family"][0]["ScientificName"]
    return family

failed_genera = []
genus_to_family = dict()
for genus in genus_to_id:
    try:
        genus_to_family[genus] = get_family(genus_to_id[genus])
    except Exception as e:
        print(f"failed to get family for genus {genus} due to error {e}")
        failed_genera.append(genus)
# all appear to be pollinator families coming from the unreversed network binary/31

failed to get family for genus magnoliidae due to error list index out of range
failed to get family for genus unidentified due to error list index out of range


In [16]:
taxonomic_features.set_index("genus", inplace=True)
taxonomic_features.family.fillna(genus_to_family, inplace=True)
taxonomic_features.reset_index(inplace=True)
taxonomic_features.notna().sum() / taxonomic_features.shape[0]

genus             0.986812
original_name     1.000000
network_type      1.000000
network_id        1.000000
resolved_name     0.797926
taxonomic_rank    0.848040
family            0.974044
dtype: float64

In [17]:
taxonomic_features.to_csv("../../data/features/taxonomic_features.csv", index=False)

In [18]:
binary_plant_features_path = "../../data/features/plant/binary/features_with_classification.csv"
weighted_plant_features_path = "../../data/features/plant/weighted/features_with_classification.csv"
binarized_weighted_plant_features_path = "../../data/features/plant/binarized_weighted/features_with_classification.csv"

binary_plant_features = pd.read_csv(binary_plant_features_path)
weighted_plant_features = pd.read_csv(weighted_plant_features_path)
binarized_weighted_plant_features = pd.read_csv(binarized_weighted_plant_features_path)

In [19]:
name_to_genus = taxonomic_features.set_index("original_name")["genus"].to_dict()
name_to_family = taxonomic_features.set_index("original_name")["family"].to_dict()

In [20]:
taxonomic_features.set_index("original_name", inplace=True)
binary_plant_features.set_index("original_name", inplace=True)
weighted_plant_features.set_index("original_name", inplace=True)
binarized_weighted_plant_features.set_index("original_name", inplace=True)

binary_plant_features[["genus", "family"]] = np.nan
binary_plant_features["genus"].fillna(name_to_genus, inplace=True)
binary_plant_features["family"].fillna(name_to_family, inplace=True)

weighted_plant_features[["genus", "family"]] = np.nan
weighted_plant_features["genus"].fillna(name_to_genus, inplace=True)
weighted_plant_features["family"].fillna(name_to_family, inplace=True)

binarized_weighted_plant_features[["genus", "family"]] = np.nan
binarized_weighted_plant_features["genus"].fillna(name_to_genus, inplace=True)
binarized_weighted_plant_features["family"].fillna(name_to_family, inplace=True)

taxonomic_features.reset_index(inplace=True)
binary_plant_features.reset_index(inplace=True)
weighted_plant_features.reset_index(inplace=True)
binarized_weighted_plant_features.reset_index(inplace=True)

In [21]:
binary_plant_features.to_csv(binary_plant_features_path)
weighted_plant_features.to_csv(weighted_plant_features_path)
binarized_weighted_plant_features.to_csv(binarized_weighted_plant_features_path)

In [22]:
networks_with_missing_taxonomy_data = taxonomic_features.groupby(["network_type", "network_id"])[["taxonomic_rank", "resolved_name"]].agg({"taxonomic_rank": lambda fams: 1 if len(fams.dropna()) > 0 else 0,
                                                                                                                                           "resolved_name": lambda fams: 1 if len(fams.dropna()) > 0 else 0}).reset_index()

In [None]:
networks_with_missing_taxonomy_data

Unnamed: 0,network_type,network_id,taxonomic_rank,resolved_name
0,binary,0,1,1
1,binary,1,0,0
2,binary,2,1,1
3,binary,3,1,1
4,binary,4,1,1
...,...,...,...,...
714,weighted,533,1,1
715,weighted,534,1,1
716,weighted,535,1,1
717,weighted,536,1,1


In [52]:
# reversed:
# binary: 31, 

# marine: 
# binary: 46, 48, 50, 51, 54, 55, 56, 57, 61, 62, 63, 64, 66, 70, 71, 72, 73

# in codes:
# weighted: 106, 107, 156, 157, 158, 159