In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append("../../code/data_processing/name_resolution/")
from resolved_names_curator import ResolvedNamesCurator

In [2]:
networks_dir = f"../../data/networks/all/"
networks_metadata_path = f"../../data/networks/networks_metadata.csv"

In [3]:
networks_metadata = pd.read_csv(networks_metadata_path)
networks_metadata["plants"] = np.nan
networks_metadata["pollinators"] = np.nan

for nt in ["binary", "weighted", "binarized_weighted"]:
    for p in os.listdir(f"{networks_dir}/{nt}/"):
        if not p.endswith(".csv"):
            continue
        ni = int(p.replace(".csv", ""))
        net = pd.read_csv(f"{networks_dir}/{nt}/{p}")
        plants = net.Plant.str.lower().tolist()
        pollinators = [s.lower() for s in net.columns if s != "Plant"]
        networks_metadata.loc[(networks_metadata.network_type == nt) & (networks_metadata.network_index == ni), "plants"] = str(plants)
        networks_metadata.loc[(networks_metadata.network_type == nt) & (networks_metadata.network_index == ni), "pollinators"] = str(pollinators)

In [4]:
def get_taxonomic_level(unresovled_names):
    names = resolved_names.loc[resolved_names.original_name.isin(unresovled_names), "resolved_name"].dropna().unique()
    if len(names) == 0:
        return np.nan
    for name in names:
        if " " in name:
            return "species"
    return "genus or above"

# plant name resolution

In [2]:
unresolved_names_path=f"../../data/name_resolution/unresolved_plant_names.csv"
resolved_names_path=f"../../data/name_resolution/resolved_plant_names.csv"

unresolved_names = pd.read_csv(unresolved_names_path)
resolved_names = pd.read_csv(resolved_names_path).rename(columns={"Coded Name": "resolved_name",
                                                                  "Original name": "original_name",
                                                                  "Matched Name": "matched_name",
                                                                  "Coded Authority": "authority"})
resolved_names.resolved_name = resolved_names.resolved_name.apply(lambda name: name.replace("_", " ") if pd.notna(name) else np.nan)
try:
    resolved_names.loc[resolved_names.resolved_name.notna(), "matched_name_wo_authority"] = resolved_names.loc[resolved_names.resolved_name.notna()][["matched_name", "authority"]].apply(lambda record: record.matched_name.replace(f" {record.authority}", "").replace(" None",""), axis=1)
except:
    pass

In [3]:
print(f"# unresovled names = {unresolved_names.shape[0]:,}")
print(f"# resolved names = {len(resolved_names.dropna(subset=['resolved_name']).original_name.unique()):,}")
print(f"# unique resolved names = {len(resolved_names.dropna(subset=['resolved_name']).resolved_name.unique()):,}")
print(f"% coverage by name resolution = {np.round(resolved_names.query('resolved_name.notna()').shape[0]/unresolved_names.shape[0]*100,2)}%")

# unresovled names = 5,400
# resolved names = 3,954
# unique resolved names = 3,874
% coverage by name resolution = 73.41%


In [4]:
genus_names = resolved_names.loc[~resolved_names.resolved_name.str.contains(" ", na=False), "resolved_name"].tolist()
print(f"# names resolved at genus level = {len(genus_names):,}")

# names resolved at genus level = 2,204


## compute taxonomic level of plants within the networks

In [37]:
networks_metadata["plants_taxonomic_level"] = networks_metadata.plants.apply(lambda plants: get_taxonomic_level([s.split("'")[1] for s in plants.split(",")]))
print(f"number of networks with plants resovled at species level")
networks_metadata.query("plants_taxonomic_level == 'species'").groupby("network_type")["network_index"].count()

number of networks with plants resovled at species level


network_type
binarized_weighted    705
binary                121
weighted              705
Name: network_index, dtype: int64

In [38]:
resolved_names.reset_index(inplace=True)

In [39]:
missing_names = resolved_names.query("resolved_name.isna()").original_name.tolist()
print(f"# missing names = {len(missing_names):,}")

# missing names = 1,266


In [40]:
resolved_names_to_doc = resolved_names[["original_name", "matched_name", "resolved_name"]]
resolved_names_to_doc.original_name = resolved_names_to_doc.original_name.str.lower()
resolved_names_to_doc.matched_name = resolved_names_to_doc.matched_name.str.lower()
resolved_names_to_doc.resolved_name = resolved_names_to_doc.resolved_name.str.lower()
resolved_names_to_doc.to_csv(resolved_names_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.original_name = resolved_names_to_doc.original_name.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.matched_name = resolved_names_to_doc.matched_name.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.resolved_name

# pollinators name resolution

In [6]:
unresolved_names_path=f"../../data/name_resolution/unresolved_pollinator_names.csv"
resolved_names_path=f"../../data/name_resolution/resolved_pollinator_names.csv"
tax_data_path = "../../data/name_resolution/itis_taxonomic_data.csv"

unresolved_names = pd.read_csv(unresolved_names_path)

In [7]:
resolved_names = pd.read_csv(resolved_names_path).rename(columns={"Coded Name": "resolved_name",
                                                                  "Original name": "original_name",
                                                                  "Matched Name": "matched_name",
                                                                  "Coded Authority": "authority"})
resolved_names.resolved_name = resolved_names.resolved_name.apply(lambda name: name.replace("_", " ") if pd.notna(name) else np.nan)
try:
    resolved_names.loc[resolved_names.resolved_name.notna(), "matched_name_wo_authority"] = resolved_names.loc[resolved_names.resolved_name.notna()][["matched_name", "authority"]].apply(lambda record: record.matched_name.replace(f" {record.authority}", "").replace(" None",""), axis=1)
except:
    pass

In [8]:
print(f"# unresovled names = {unresolved_names.shape[0]:,}")
print(f"# resolved names = {len(resolved_names.dropna(subset=['resolved_name']).original_name.unique()):,}")
print(f"# unique resolved names = {len(resolved_names.dropna(subset=['resolved_name']).resolved_name.unique()):,}")
print(f"% coverage by name resolution = {np.round(resolved_names.query('resolved_name.notna()').drop_duplicates(subset=['original_name']).shape[0]/unresolved_names.shape[0]*100,2)}%")

# unresovled names = 15,073
# resolved names = 4,486
# unique resolved names = 3,725
% coverage by name resolution = 29.76%


In [9]:
genus_names = resolved_names.loc[~resolved_names.resolved_name.str.contains(" ", na=False), "resolved_name"].drop_duplicates().tolist()
print(f"# names resolved at genus level = {len(genus_names):,}")

# names resolved at genus level = 1,087


In [10]:
resolved_names.reset_index(inplace=True)

In [11]:
missing_names = resolved_names.query("resolved_name.isna()").original_name.tolist()
print(f"# missing names = {len(missing_names):,}")

# missing names = 9,732


In [12]:
networks_metadata["pollinators_taxonomic_level"] = networks_metadata.pollinators.apply(lambda pollinators: get_taxonomic_level([s.replace("[", "").replace("]","").replace("'", "").strip() for s in pollinators.split("', '")]))
networks_metadata = networks_metadata.drop(["plants", "pollinators"], axis=1)
print(f"number of networks with pollinators resovled at species level")
networks_metadata.query("pollinators_taxonomic_level == 'species'").groupby("network_type")["network_index"].count()

number of networks with pollinators resovled at species level


network_type
binarized_weighted    691
binary                137
weighted              691
Name: network_index, dtype: int64

In [13]:
print(f"number of networks with plants and pollinators resovled at species level")
networks_metadata.to_csv(networks_metadata_path, index=False)
networks_metadata.loc[(networks_metadata.pollinators_taxonomic_level == 'species') & (networks_metadata.plants_taxonomic_level == 'species')].groupby("network_type")["network_index"].count()

number of networks with plants and pollinators resovled at species level


network_type
binarized_weighted    688
binary                119
weighted              688
Name: network_index, dtype: int64

In [14]:
resolved_names_to_doc = resolved_names[["original_name", "matched_name", "resolved_name"]]
resolved_names_to_doc.original_name = resolved_names_to_doc.original_name.str.lower()
resolved_names_to_doc.matched_name = resolved_names_to_doc.matched_name.str.lower()
resolved_names_to_doc.resolved_name = resolved_names_to_doc.resolved_name.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.original_name = resolved_names_to_doc.original_name.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.matched_name = resolved_names_to_doc.matched_name.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.resolved_name

In [16]:
tax_data = pd.read_csv(tax_data_path)
tax_data = tax_data[["complete_name", "rank_name"]]
tax_data["complete_name"] = tax_data["complete_name"].str.lower()

  tax_data = pd.read_csv(tax_data_path)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tax_data["complete_name"] = tax_data["complete_name"].str.lower()


In [17]:
resolved_names_to_doc = resolved_names_to_doc.merge(tax_data, left_on="resolved_name", right_on="complete_name", how="left")
resolved_names_to_doc.set_index("original_name", inplace=True)
resolved_names_to_doc.fillna(tax_data.set_index("complete_name")["rank_name"].to_dict(), inplace=True)
resolved_names_to_doc.reset_index(inplace=True)
resolved_names_to_doc.to_csv(resolved_names_path, index=False)

In [18]:
resolved_names_to_doc = pd.read_csv(resolved_names_path)
resolved_names_to_doc = resolved_names_to_doc.drop_duplicates()

In [19]:
print(f"# unresovled names = {unresolved_names.shape[0]:,}")
print(f"# resolved names = {len(resolved_names_to_doc.dropna(subset=['resolved_name']).original_name.unique()):,}")
print(f"# unique resolved names = {len(resolved_names_to_doc.dropna(subset=['resolved_name']).resolved_name.unique()):,}")
print(f"% coverage by name resolution = {np.round(resolved_names_to_doc.query('resolved_name.notna()').shape[0]/unresolved_names.shape[0]*100,2)}%")
print(f"% records with available rank = {np.round(resolved_names_to_doc.query('rank_name.notna()').shape[0]/unresolved_names.shape[0]*100,2)}%")

# unresovled names = 15,073
# resolved names = 4,486
# unique resolved names = 3,725
% coverage by name resolution = 29.78%
% records with available rank = 29.46%


In [20]:
resolved_names_to_doc.loc[(resolved_names_to_doc.rank_name.notna())].shape[0] / resolved_names_to_doc.shape[0]

0.3205572397863433

In [36]:
resolved_names_to_doc.rank_name.unique()

array([nan, 'Species', 'Genus', 'Order', 'Family', 'Subfamily', 'Tribe',
       'Superfamily', 'Suborder', 'Subclass', 'Class'], dtype=object)

In [21]:
print(f"# unique pollinator names = {len(unresolved_names.Name.unique()):,}")

# unique pollinator names = 15,033


In [50]:
test = resolved_names_to_doc.dropna(subset=["resolved_name"])
test.groupby("rank_name")["resolved_name"].count()

rank_name
Class             2
Family          443
Genus          1014
Order            55
Species        2837
Subclass          2
Subfamily        41
Suborder         10
Superfamily      15
Tribe            22
Name: resolved_name, dtype: int64

## update networks pollinator indices

In [44]:
df = pd.read_csv(networks_metadata_path)

In [45]:
rank_name_to_index= {'Species': 0, 'Genus': 1, 'Subfamily': 2, 'Family': 3,  'Superfamily': 4, 'Suborder': 5, 'Order': 6, 'Subclass': 7, 'Class': 8, 'Tribe': 9, np.nan: 10}
resolved_names_to_doc["rank_index"] = resolved_names_to_doc.rank_name.replace(rank_name_to_index)

def get_pollinators_ranks(path: str):
    network = pd.read_csv(path)
    pollinators =  [s.lower() for s in set(network.columns)-{"Unnamed: 0", "Plant"}]
    resolved_pollinators_ranks = resolved_names_to_doc.loc[resolved_names_to_doc.original_name.isin(pollinators)][["rank_name", "rank_index"]].sort_values("rank_index")
    if resolved_pollinators_ranks.shape[0] == 0:
        return np.nan, 10
    rec = resolved_pollinators_ranks.iloc[0]
    return rec.rank_name, rec.rank_index
    
df[["lowest_pollinator_rank_name", "lowest_pollinator_rank_index"]] = df.apply(lambda rec: get_pollinators_ranks(rec.processed_path), axis=1, result_type="expand")

In [47]:
df[["network_type", "network_index", "lowest_pollinator_rank_name", "highest_pollinator_rank_name", "lowest_pollinator_rank_index", "highest_pollinator_rank_index"]].head()

Unnamed: 0,network_type,network_index,lowest_pollinator_rank_name,highest_pollinator_rank_name,lowest_pollinator_rank_index,highest_pollinator_rank_index
0,binarized_weighted,0,Family,Family,3.0,6.0
1,binarized_weighted,1,Family,Family,3.0,6.0
2,binarized_weighted,2,Family,Family,3.0,6.0
3,binarized_weighted,3,Family,Family,3.0,6.0
4,binarized_weighted,4,Family,Family,3.0,6.0


In [52]:
df = df.drop(["highest_pollinator_rank_name", "highest_pollinator_rank_index"], axis=1).rename(columns={"lowest_pollinator_rank_name": "pollinators_rank_name",
                                                                                                        "lowest_pollinator_rank_index": "pollinators_rank_index"})
df.to_csv(networks_metadata_path, index=False)