## download models
* https://spacy.io/usage/models

## everything about spacy
* https://spacy.io/usage/spacy-101 
* https://spacy.io/api/matcher

## add a pipeline

* how to use nlp.pipe and disable some component
* Multiprocessing: https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
* https://spacy.io/usage/processing-pipelines

## make a classifier

* https://spacy.io/universe/project/classyclassification





In [1]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])

doc = nlp("Hello, new world! Hello old world!")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)


In [4]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Angela Merkel
Barack Obama
Washington, D.C.


In [2]:
import spacy
from spacy_lookup import Entity

nlp = spacy.load("en_core_web_sm")
entity = Entity(keywords_list=['python', 'product manager', 'java platform'])
nlp.add_pipe(entity, last=True)

doc = nlp(u"I am a product manager for a java and python.")
assert doc._.has_entities == True
assert doc[0]._.is_entity == False
assert doc[3]._.entity_desc == 'product manager'
assert doc[3]._.is_entity == True

print([(token.text, token._.canonical) for token in doc if token._.is_entity])


ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <spacy_lookup.Entity object at 0x7f894c68d400> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

In [9]:
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
# keyword_processor.add_keyword(<unclean name>, <standardised name>)
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.', span_info=True)

In [24]:
keyword_processor = KeywordProcessor()
clean_names  = ['ORG', 'LOC', 'SF', "ORG", 'ORG1']
keyword_names = ['new york', 'new york', 'san francisco', 'new york', 'new york loc']
for keyword_name, clean_name in zip(keyword_names, clean_names):
    keyword_processor.add_keyword(keyword_name, clean_name)
keywords_found = keyword_processor.extract_keywords('I love san francisco and NY. new york loc is the best.', span_info=True)
print(keywords_found)


[('SF', 7, 20), ('ORG1', 29, 41)]


In [1]:
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher

nlp = English()
# nlp = spacy.load("en_core_web_sm")

ruler = nlp.add_pipe("entity_ruler")
patterns = [{"label": "ORG", "pattern": "Apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
ruler.add_patterns(patterns)

doc = nlp("en hello Apple is opening its first big office in San Francisco. again")
print([(ent.text, ent.label_, ent.start, ent.end, ent.start_char, ent.end_char) for ent in doc.ents])

# for token in doc:
#     print(token.text, token.pos_, token.dep_, token.ent_type_)
    

matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
radio_pattern = [{"OP": "*"}, {"LOWER": {"IN": ["en", "cute"]}},
                 {"OP": "*"}, {"ENT_TYPE": "ORG"}, {"OP": "*"}, {"ENT_TYPE": "GPE", "OP": "+"}]
matcher.add("RadioClas", [radio_pattern])

# doc = nlp("Hello, world! Hello world!")

matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)


[('Apple', 'ORG', 2, 3, 9, 14), ('San Francisco', 'GPE', 10, 12, 50, 63)]
6550644825991781785 Radio 0 11 en hello Apple is opening its first big office in San
6550644825991781785 Radio 0 12 en hello Apple is opening its first big office in San Francisco


In [1]:
from spacy.lang.pt import Portuguese

nlp = Portuguese()
doc = nlp("Tocar a música não deixe o samba morrer 123")

print(len(doc))
for token in doc:
    print(token.text, token.is_alpha, token.ent_type_, token.lemma_)

2022-07-27 16:15:33.688354: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:73:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-27 16:15:33.688838: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:d5:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-27 16:15:33.727083: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:73:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-27 16:15:33.727469: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not open file to read NUMA node: /sys/bus/pci/devices/0000:d5:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-27 16:15:33.727800: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:922] could not op

9
Tocar True  
a True  
música True  
não True  
deixe True  
o True  
samba True  
morrer True  
123 False  
