In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append("../../code/data_processing/name_resolution/")
from resolved_names_curator import ResolvedNamesCurator

# plant name resolution

In [2]:
unresolved_names_path=f"../../data/name_resolution/unresolved_plant_names.csv"
resolved_names_path=f"../../data/name_resolution/resolved_plant_names.csv"

unresolved_names = pd.read_csv(unresolved_names_path)
resolved_names = pd.read_csv(resolved_names_path).rename(columns={"Coded Name": "resolved_name",
                                                                  "Original name": "original_name",
                                                                  "Matched Name": "matched_name",
                                                                  "Coded Authority": "authority"})
resolved_names.resolved_name = resolved_names.resolved_name.apply(lambda name: name.replace("_", " ") if pd.notna(name) else np.nan)
try:
    resolved_names.loc[resolved_names.resolved_name.notna(), "matched_name_wo_authority"] = resolved_names.loc[resolved_names.resolved_name.notna()][["matched_name", "authority"]].apply(lambda record: record.matched_name.replace(f" {record.authority}", "").replace(" None",""), axis=1)
except:
    pass

In [4]:
print(f"# unresovled names = {unresolved_names.shape[0]:,}")
print(f"# resolved names = {len(resolved_names.dropna(subset=['resolved_name']).original_name.unique()):,}")
print(f"# unique resolved names = {len(resolved_names.dropna(subset=['resolved_name']).resolved_name.unique()):,}")
print(f"% coverage by name resolution = {np.round(resolved_names.query('resolved_name.notna()').shape[0]/unresolved_names.shape[0]*100,2)}%")

# unresovled names = 5,334
# resolved names = 3,927
# unique resolved names = 3,845
% coverage by name resolution = 73.79%


In [5]:
genus_names = resolved_names.loc[~resolved_names.resolved_name.str.contains(" ", na=False), "resolved_name"].tolist()
print(f"# names resolved at genus level = {len(genus_names):,}")

# names resolved at genus level = 2,201


In [6]:
resolved_names.reset_index(inplace=True)

In [7]:
missing_names = resolved_names.query("resolved_name.isna()").original_name.tolist()
print(f"# missing names = {len(missing_names):,}")

# missing names = 1,263


In [8]:
resolved_names_to_doc = resolved_names[["original_name", "matched_name", "resolved_name"]]
resolved_names_to_doc.original_name = resolved_names_to_doc.original_name.str.lower()
resolved_names_to_doc.matched_name = resolved_names_to_doc.matched_name.str.lower()
resolved_names_to_doc.resolved_name = resolved_names_to_doc.resolved_name.str.lower()
resolved_names_to_doc.to_csv(resolved_names_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.original_name = resolved_names_to_doc.original_name.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.matched_name = resolved_names_to_doc.matched_name.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.resolved_name

# pollinators name resolution

In [9]:
unresolved_names_path=f"../../data/name_resolution/unresolved_pollinator_names.csv"
resolved_names_path=f"../../data/name_resolution/resolved_pollinator_names.csv"
tax_data_path = "../../data/name_resolution/itis_taxonomic_data.csv"

unresolved_names = pd.read_csv(unresolved_names_path)

In [10]:
resolved_names = pd.read_csv(resolved_names_path).rename(columns={"Coded Name": "resolved_name",
                                                                  "Original name": "original_name",
                                                                  "Matched Name": "matched_name",
                                                                  "Coded Authority": "authority"})
resolved_names.resolved_name = resolved_names.resolved_name.apply(lambda name: name.replace("_", " ") if pd.notna(name) else np.nan)
try:
    resolved_names.loc[resolved_names.resolved_name.notna(), "matched_name_wo_authority"] = resolved_names.loc[resolved_names.resolved_name.notna()][["matched_name", "authority"]].apply(lambda record: record.matched_name.replace(f" {record.authority}", "").replace(" None",""), axis=1)
except:
    pass

In [11]:
resolved_names

Unnamed: 0,original_name,matched_name,resolved_name,complete_name,rank_name
0,miridae,miridae,miridae,miridae,Family
1,miridae,miridae,miridae,miridae,Family
2,miridae,miridae,miridae,miridae,Family
3,miridae,miridae,miridae,miridae,Family
4,miridae,miridae,miridae,miridae,Family
...,...,...,...,...,...
42202,coleothorpa,coleothorpa moldenke@ 1981,coleothorpa,coleothorpa,Genus
42203,unidentified sp1,,,,
42204,pogonomyoides segnis,,,,
42205,dasyphora albofasciata,,,,


In [25]:
print(f"# unresovled names = {unresolved_names.shape[0]:,}")
print(f"# resolved names = {len(resolved_names.dropna(subset=['resolved_name']).original_name.unique()):,}")
print(f"# unique resolved names = {len(resolved_names.dropna(subset=['resolved_name']).resolved_name.unique()):,}")
print(f"% coverage by name resolution = {np.round(resolved_names.query('resolved_name.notna()').drop_duplicates(subset=['original_name']).shape[0]/unresolved_names.shape[0]*100,2)}%")

# unresovled names = 14,868
# resolved names = 4,432
# unique resolved names = 3,677
% coverage by name resolution = 29.81%


In [27]:
genus_names = resolved_names.loc[~resolved_names.resolved_name.str.contains(" ", na=False), "resolved_name"].drop_duplicates().tolist()
print(f"# names resolved at genus level = {len(genus_names):,}")

# names resolved at genus level = 1,088


In [28]:
resolved_names.reset_index(inplace=True)

In [29]:
missing_names = resolved_names.query("resolved_name.isna()").original_name.tolist()
print(f"# missing names = {len(missing_names):,}")

# missing names = 9,622


In [30]:
resolved_names_to_doc = resolved_names[["original_name", "matched_name", "resolved_name"]]
resolved_names_to_doc.original_name = resolved_names_to_doc.original_name.str.lower()
resolved_names_to_doc.matched_name = resolved_names_to_doc.matched_name.str.lower()
resolved_names_to_doc.resolved_name = resolved_names_to_doc.resolved_name.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.original_name = resolved_names_to_doc.original_name.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.matched_name = resolved_names_to_doc.matched_name.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resolved_names_to_doc.resolved_name

In [31]:
tax_data = pd.read_csv(tax_data_path)
tax_data = tax_data[["complete_name", "rank_name"]]
tax_data["complete_name"] = tax_data["complete_name"].str.lower()

  tax_data = pd.read_csv(tax_data_path)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tax_data["complete_name"] = tax_data["complete_name"].str.lower()


In [32]:
resolved_names_to_doc = resolved_names_to_doc.merge(tax_data, left_on="resolved_name", right_on="complete_name", how="left")
resolved_names_to_doc.set_index("original_name", inplace=True)
resolved_names_to_doc.fillna(tax_data.set_index("complete_name")["rank_name"].to_dict(), inplace=True)
resolved_names_to_doc.reset_index(inplace=True)
resolved_names_to_doc.to_csv(resolved_names_path, index=False)

In [33]:
resolved_names_to_doc = pd.read_csv(resolved_names_path)
resolved_names_to_doc = resolved_names_to_doc.drop_duplicates()

In [34]:
print(f"# unresovled names = {unresolved_names.shape[0]:,}")
print(f"# resolved names = {len(resolved_names_to_doc.dropna(subset=['resolved_name']).original_name.unique()):,}")
print(f"# unique resolved names = {len(resolved_names_to_doc.dropna(subset=['resolved_name']).resolved_name.unique()):,}")
print(f"% coverage by name resolution = {np.round(resolved_names_to_doc.query('resolved_name.notna()').shape[0]/unresolved_names.shape[0]*100,2)}%")
print(f"% records with available rank = {np.round(resolved_names_to_doc.query('rank_name.notna()').shape[0]/unresolved_names.shape[0]*100,2)}%")

# unresovled names = 14,868
# resolved names = 4,432
# unique resolved names = 3,677
% coverage by name resolution = 29.83%
% records with available rank = 29.51%


In [35]:
resolved_names_to_doc.loc[(resolved_names_to_doc.rank_name.notna())].shape[0] / resolved_names_to_doc.shape[0]

0.31947276434605304

In [36]:
print(f"# unique pollinator names = {len(unresolved_names.Name.unique()):,}")

# unique pollinator names = 14,829


In [37]:
test = resolved_names_to_doc.dropna(subset=["resolved_name"])
test.groupby("rank_name")["resolved_name"].count()

rank_name
Class             2
Family          443
Genus          1013
Order            55
Species        2784
Subclass          2
Subfamily        42
Suborder          9
Subtribe          2
Superfamily      15
Tribe            20
Name: resolved_name, dtype: int64