# Imports

In [1]:
import os
import pandas as pd
import spacy 
from spacy import displacy
import sys
import nltk
from nltk.corpus import stopwords
# Put the parent directory on sys.path (once) so the local `scripts`
# package imported just below can be resolved from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
from scripts.fuzzy_phrase_matcher import PhuzzyMatcher
from scripts.utils import fuzzy_matcher, fuzzy_matcher_stopwords

In [2]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# English stop words plus a few punctuation tokens, collected in one set.
# This set is later handed to the stop-word variant of the matcher.
stop_words = set(stopwords.words('english')) | {".", ",", "!", "(", ")"}

[nltk_data] Downloading package stopwords to /home/sm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load the list of animal names to fuzzy-match in the document

In [3]:
# Locate the CSV of animal names; the path is relative to the working directory.
my_path = os.path.abspath('../match_lists')
path = os.path.join(my_path, 'animal_names.csv')

# Only the "species" column is needed.
animals = pd.read_csv(path)["species"]

# Lower-case every name and order the list shortest-first.
animals_sorted = animals.str.lower().tolist()
animals_sorted.sort(key=len)

# De-duplicate; note that a set does not retain the length ordering.
animals_set = set(animals_sorted)


In [4]:
# Display the lower-cased, de-duplicated phrases that will be fuzzy-matched.
animals_set

{'allen’s swamp monkeys',
 'alpine ibex',
 'amazonian manatee',
 'american oystercatcher bird',
 'and',
 'asian elephant',
 'atlas beetle',
 'bengal tiger',
 'black widow spider',
 'blister beetle',
 'brazilian wandering spider',
 'broad-snouted caimans',
 'campbell’s dwarf hamster',
 'chinstrap penguin',
 'common kingfisher',
 'common palm civet'}

# Load a blank spaCy model and add PhuzzyMatcher to the pipeline

In [None]:
# Start from a blank English pipeline.
nlp = spacy.blank('en')
# Build the fuzzy matcher over the animal phrases.
# NOTE(review): 85 appears to be the fuzz-ratio threshold and "ANIMAL" the
# entity label -- confirm against scripts.fuzzy_phrase_matcher.
animal_tagger =  PhuzzyMatcher(nlp, animals_set, fuzzy_matcher, 85, "ANIMAL")
nlp.add_pipe(animal_tagger)
# Print the pipeline component names to confirm the matcher was added.
print(nlp.pipe_names)

# Detect misspelled animal names

In [None]:
%%time
# Run the pipeline on a sample sentence containing wrongly written animal names.
doc = nlp("The Atlas Beatle has always been a friend of the blisster beetles and the uncommon kingfisher got very jealous about it. Luckily, the mexican wandering spider got it right")
# Render the entities found in the doc inline in the notebook.
displacy.render(doc, style="ent")



# Change the fuzz ratio (85 → 70)

In [7]:
# Rebuild a blank English pipeline so the new matcher replaces the previous one.
nlp = spacy.blank('en')
# Same matcher as before, but with the numeric argument set to 70.
# NOTE(review): presumably a lower fuzz-ratio threshold, i.e. looser matching --
# confirm against scripts.fuzzy_phrase_matcher.
animal_tagger =  PhuzzyMatcher(nlp, animals_set, fuzzy_matcher, 70, "ANIMAL")
nlp.add_pipe(animal_tagger)
# Print the pipeline component names to confirm the matcher was added.
print(nlp.pipe_names)

['phuzzy_matcher']


In [9]:
%%time
# Re-run the same sample sentence through the pipeline built with ratio 70.
doc = nlp("The Atlas Beatle has always been a friend of the blisster beetles and the uncommon kingfisher got very jealous about it. Luckily, the mexican wandering spider got it right")
# Render the entities found in the doc inline in the notebook.
displacy.render(doc, style="ent")

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 27.8 ms


# PhuzzyMatcher with the stop-words option
### 'and' is not assigned 'ANIMAL' because it is removed from the doc before the fuzzy match

In [11]:
# Rebuild a blank English pipeline so the new matcher replaces the previous one.
nlp = spacy.blank('en')
# Use the stop-word-aware matching function and pass the stop-word set as an
# extra argument.
# NOTE(review): presumably tokens in stop_words are dropped before fuzzy
# matching -- confirm against scripts.utils.fuzzy_matcher_stopwords.
animal_tagger =  PhuzzyMatcher(nlp, animals_set, fuzzy_matcher_stopwords, 70, "ANIMAL", stop_words)
nlp.add_pipe(animal_tagger)
# Print the pipeline component names to confirm the matcher was added.
print(nlp.pipe_names)

['phuzzy_matcher']


In [12]:
%%time
# Re-run the same sample sentence through the stop-word-aware pipeline.
doc = nlp("The Atlas Beatle has always been a friend of the blisster beetles and the uncommon kingfisher got very jealous about it. Luckily, the mexican wandering spider got it right")
# Render the entities found in the doc inline in the notebook.
displacy.render(doc, style="ent")

CPU times: user 11.5 ms, sys: 0 ns, total: 11.5 ms
Wall time: 14.1 ms
