# Taxonomic Token Identify

# Description


# Environment

See https://medium.com/@jhurley_97842/create-a-template-for-your-jupyter-notebooks-80352d265cd4 for how the notebook was created

## Library Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import re
import json
import more_itertools
import requests
import urllib
import ftfy

import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy import displacy

## Jupyter-specific Imports and Settings

In [None]:
# Data manipulation
# Options for pandas: cap how many columns/rows are rendered in cell output
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Display all cell outputs, not only the value of the last expression
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# get_ipython() returns None when no interactive shell is registered (for
# example when this notebook is exported and run as a plain script), so the
# autoreload extension is only configured when a shell actually exists.
if ipython is not None:
    # autoreload extension
    if 'autoreload' not in ipython.extension_manager.loaded:
        ipython.run_line_magic('load_ext', 'autoreload')

    ipython.run_line_magic('autoreload', '2')

## Local Imports

## File Paths

In [None]:
# Directory layout described at:
# https://medium.com/@rrfd/cookiecutter-data-science-organize-your-projects-atom-and-jupyter-2be7862f487e
# Every path below hangs off the current working directory.
base_path = Path.cwd()

# Data directories
data_path = base_path.joinpath('data')
raw_data_path = data_path.joinpath('raw')
interim_data_path = data_path.joinpath('interim')
processed_data_path = data_path.joinpath('processed')
external_data_path = data_path.joinpath('external')

# Report directories
reports_path = base_path.joinpath('reports')
figures_path = reports_path.joinpath('figures')

# Inputs: the taxonomy workbook and a sample text file
taxonomy_name = 'eBird-Clements-v2019-integrated-checklist-August-2019.xlsx'
taxonomy_path = external_data_path.joinpath(taxonomy_name)

test_data_path = raw_data_path.joinpath('raw-spacytest-small.txt')

# Cache: serialized EntityRuler patterns
entity_ruler_path = interim_data_path.joinpath('taxon_entity_ruler.jsonl')

# Outputs paths

# Credentials

## Constants and Globals

In [None]:
# Constants and Globals
# The User-Agent header value is kept in two pieces only so that the source
# lines stay short; the pieces are glued together unchanged.
ua_p1 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
ua_p2 = 'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15'
REQUESTS_USER_AGENT = ''.join((ua_p1, ua_p2))

# Code

In [None]:
def create_project_paths():
    """Create the project's data and report directories when they are missing.

    Each directory is created on its own (``parents=False``) with mode
    ``0o755``; directories that already exist are left untouched.
    """
    # Parents are listed before their children because mkdir is called
    # without parents=True.
    project_dirs = (
        data_path,
        raw_data_path,
        interim_data_path,
        processed_data_path,
        external_data_path,
        reports_path,
        figures_path,
    )
    for directory in project_dirs:
        directory.mkdir(mode=0o755, parents=False, exist_ok=True)

In [None]:
def get_taxonomy_from_url(url):
    """Download the taxonomy workbook from *url* into the external-data folder.

    The response body is saved as ``external_data_path / taxonomy_name`` only
    when the server answers ``200 OK``. Network, HTTP and file-system errors
    are printed rather than raised, so the caller can continue without a
    cached file.

    Args:
        url: Address of the taxonomy workbook.

    Returns:
        The HTTP status code of the response, or ``None`` when no response
        was received at all (connection failure, timeout, invalid URL, ...).
    """
    destination = external_data_path / taxonomy_name
    # Stays None if the request fails before any response exists; the
    # previous code crashed with an unbound variable in that case.
    status_code = None

    try:
        # No stream=True: the body is downloaded completely before the
        # destination file is opened, so a connection dropped mid-transfer
        # cannot leave a truncated workbook behind. The (connect, read)
        # timeout keeps a stalled server from hanging the notebook forever.
        response = requests.get(
            url,
            headers={'User-Agent': REQUESTS_USER_AGENT},
            timeout=(10, 60),
        )
        status_code = response.status_code

        if status_code == requests.codes.ok:
            with open(destination, 'wb') as fp:
                fp.write(response.content)
        # Surface 4xx/5xx answers through the same reporting path below.
        response.raise_for_status()

    except (requests.RequestException, OSError) as ee:
        print(ee)

    return status_code

In [None]:
def get_taxonomy_cached() -> pd.DataFrame:
    """Return the taxonomy table, fetching the workbook first when it is absent.

    If ``taxonomy_path`` is not yet a file, the workbook is requested through
    ``get_taxonomy_from_url``. When the file is present afterwards it is read
    with pandas, missing cells become empty strings, and four columns are
    renamed. Any exception is printed and whatever was loaded up to that
    point is returned (an empty DataFrame when nothing was loaded).
    """
    # Source column name -> column name used by the rest of the notebook
    column_aliases = {
        'English name': 'comName',
        'scientific name': 'sciName',
        'family': 'familySciName',
        'eBird species group': 'familyComName'
    }
    table = pd.DataFrame()

    try:
        cache_missing = not taxonomy_path.is_file()
        if cache_missing:
            url_base = 'https://www.birds.cornell.edu/clementschecklist/wp-content/uploads/2019/08'
            print('Retrieving taxonomy for cache...')
            get_taxonomy_from_url(f'{url_base}/{taxonomy_name}')

        # Checked again on purpose: the download above may have failed.
        if taxonomy_path.is_file():
            table = pd.read_excel(taxonomy_path, header=0).fillna('')
            table.rename(columns=column_aliases, inplace=True)
    except Exception as err:
        print(err)

    return table

In [None]:
def get_test_text(test_path: Path) -> str:
    """Read a UTF-8 text file and return its contents after ftfy clean-up.

    Args:
        test_path: File to read; its stem is printed as a progress message.

    Returns:
        The file's text as returned by ``ftfy.fix_text`` with the options
        listed below.
    """
    print(f'Processing {test_path.stem}')
    raw_text = test_path.read_text(encoding="utf-8")

    # Generally helpful to clean up the text at least a little bit
    cleanup_options = {
        'fix_encoding': True,
        'fix_line_breaks': True,
        'normalization': 'NFKC',
        'fix_latin_ligatures': True,
        'uncurl_quotes': True,
    }
    return ftfy.fix_text(raw_text, **cleanup_options)

In [None]:
def save_visualization(test_path: Path, html:str):
    """Write *html* to ``reports_path`` as ``spacy-<stem of test_path>.html``.

    Args:
        test_path: Path of the original input file; only its stem is used,
            to name the report.
        html: Markup to store, written as UTF-8.
    """
    report_file = reports_path.joinpath(f'spacy-{test_path.stem}.html')
    with report_file.open('w', encoding="utf-8") as handle:
        handle.write(html)

In [None]:
def create_taxon_patterns(nlp, values:pd.Series, label:str, taxon_patterns:list):
    """Append one token pattern per distinct non-empty value to *taxon_patterns*.

    Args:
        nlp: Object whose ``tokenizer`` attribute splits a string into tokens.
        values: Names to turn into patterns; duplicates and ``''`` are skipped.
        label: Entity label stored with every generated pattern.
        taxon_patterns: List extended in place; nothing is returned.
    """
    # Each name is split with nlp.tokenizer so the pattern's token boundaries
    # are the tokenizer's own; every token is matched on its lower-cased text.
    taxon_patterns.extend(
        {
            'label': label,
            'pattern': [{'LOWER': str(tok).lower()} for tok in nlp.tokenizer(name)],
        }
        for name in set(values.values)
        if name != ''
    )

In [None]:
def add_taxon_patterns(nlp, taxonomy):
    """Build the full list of entity patterns from five taxonomy columns.

    Args:
        nlp: Passed through to ``create_taxon_patterns`` for tokenization.
        taxonomy: Table exposing the attributes ``comName``, ``sciName``,
            ``order``, ``familyComName`` and ``familySciName``.

    Returns:
        A new list holding the patterns of all five columns, in that order.
    """
    print('Preparing taxon patterns')

    # (taxonomy attribute, entity label), processed top to bottom
    column_labels = (
        ('comName', 'CommonName'),
        ('sciName', 'ScientificName'),
        ('order', 'Order'),
        ('familyComName', 'FamilyCommon'),
        ('familySciName', 'FamilyScientific'),
    )

    collected = []
    for column, label in column_labels:
        create_taxon_patterns(nlp, getattr(taxonomy, column), label, collected)

    return collected

In [None]:
def get_entity_ruler_cached(nlp, taxonomy) -> EntityRuler:
    """Return an EntityRuler holding the taxon patterns, using the disk cache.

    If ``entity_ruler_path`` exists the ruler is loaded from it; otherwise the
    patterns are built from *taxonomy*, added to the ruler and written to
    that path. Any exception is printed and the ruler is returned in whatever
    state it had reached.

    Args:
        nlp: Pipeline object handed to ``EntityRuler`` and to the pattern builder.
        taxonomy: Taxonomy table passed to ``add_taxon_patterns``.
    """
    ruler = EntityRuler(nlp, validate=True)

    try:
        if entity_ruler_path.is_file():
            print('Loading EntityRuler from cache...')
            ruler.from_disk(entity_ruler_path)
        else:
            # No cache yet: build the patterns once and persist them.
            ruler.add_patterns(add_taxon_patterns(nlp, taxonomy))
            ruler.to_disk(entity_ruler_path)
    except Exception as err:
        print(err)

    return ruler

In [None]:
# Read and process text
def spacify_text(text, taxonomy):
    """Run the taxon entity ruler over *text* and return the resulting doc.

    Duplicate lines are dropped, the remaining text is lower-cased, and a
    blank English pipeline containing only the (cached) taxon EntityRuler
    processes it.

    Args:
        text: Raw input text with lines separated by ``'\\n'``.
        taxonomy: Taxonomy table used to build the ruler when no cache exists.

    Returns:
        The object produced by calling the pipeline on the processed text.
    """
    # Only process unique lines. dict.fromkeys keeps the first occurrence of
    # every line in its original position; de-duplicating through set()
    # returned the lines in an arbitrary order that changed between runs,
    # which scrambled the rendered document.
    unique_lines = dict.fromkeys(text.split('\n'))
    processed_text = '\n'.join(unique_lines)

    nlp = English()

    ruler = get_entity_ruler_cached(nlp, taxonomy)
    nlp.add_pipe(ruler)

    print('Processing text')
    doc = nlp(processed_text.lower())

    return doc

In [None]:
def create_visualization(docx, show_in_jupyter=True):
    """Render the entities of *docx* with displaCy using per-label gradients.

    Args:
        docx: Processed document whose entities are rendered.
        show_in_jupyter: Forwarded to ``displacy.render`` as ``jupyter``.

    Returns:
        Whatever ``displacy.render`` returns for the given ``jupyter`` flag.
    """
    # Colour references:
    # https://developer.mozilla.org/en-US/docs/Web/CSS/linear-gradient
    # https://cssgradient.io
    # https://htmlcolorcodes.com

    print('Creating visualization')

    def gradient(start, end):
        # Left-to-right CSS gradient between two hex colours.
        return f'linear-gradient(90deg, {start}, {end})'

    colors = {
        "COMMONNAME": gradient('#aa9cfc', '#fc9ce7'),        # purplish (original)
        'SCIENTIFICNAME': gradient('#9cfcea', '#3cd3e7'),    # aquaish
        'ORDER': gradient('#cdfc9c', '#5cfa45'),             # greenish
        'FAMILYCOMMON': gradient('#f9fc9c', '#fac945'),      # yellowish
        'FAMILYSCIENTIFIC': gradient('#fc9cde', '#ff5aa4'),  # fuchsiaish
    }
    # The labels to render are exactly the ones that have a colour.
    options = {"ents": list(colors), "colors": colors}

    return displacy.render([docx], style="ent", page=True,
                           jupyter=show_in_jupyter, options=options)

In [None]:
def show_species_and_families(docx):
    """Print the distinct species and family names found among ``docx.ents``.

    Entities labelled ``'CommonName'`` are listed as species and those
    labelled ``'FamilyCommon'`` as families; every other label is ignored.
    Each list is de-duplicated, sorted and printed comma-separated.

    Args:
        docx: Object whose ``ents`` items expose ``label_`` and ``text``.
    """
    # One bucket per label of interest, filled in a single pass.
    names_by_label = {'CommonName': set(), 'FamilyCommon': set()}
    for ent in docx.ents:
        bucket = names_by_label.get(ent.label_)
        if bucket is not None:
            bucket.add(ent.text)

    species_line = ', '.join(sorted(names_by_label['CommonName']))
    families_line = ', '.join(sorted(names_by_label['FamilyCommon']))

    print(f'Species: {species_line}')
    print(f'Families: {families_line}')

## Initialization

In [None]:
# Initializations

# Load the taxonomy table once at module level (this runs when the cell or
# module is executed, not only under __main__); get_taxonomy_cached() prints
# any error and returns an empty DataFrame if nothing could be loaded.
taxonomy = get_taxonomy_cached()

In [None]:
# Some other files to try
#     test_data_path = processed_data_path / 'raw-spacytest-small-article.txt'
#     test_data_path = processed_data_path / 'the-119th-christmas-bird-count-summary.txt'
#     test_data_path = processed_data_path / 'raw-pdf-txt-1-cl-CACR-CMHCBC_-_Check_List2018.txt'

# Main

In [None]:
# Script entry point: make sure the project folders exist, load one sample
# text, tag it with the taxon ruler, print the matches and save an HTML report.
if __name__ == '__main__':
    
    create_project_paths()
    
    # NOTE(review): this reads the sample from processed_data_path, while the
    # module-level test_data_path points at raw_data_path — confirm which
    # location is intended.
    test_data_path = processed_data_path / 'raw-spacytest-small.txt'
    text = get_test_text(test_data_path)
    
    docx = spacify_text(text, taxonomy)
    show_species_and_families(docx)
    
    # First render is requested with the jupyter flag on (inline display).
    html = create_visualization(docx, show_in_jupyter=True)

    # Second render, jupyter flag off: this is the value written to the
    # report file below.
    html = create_visualization(docx, show_in_jupyter=False)
    save_visualization(test_data_path, html)
    
    print('Done')