### Coreference Resolution & NER
* Using spaCy's own experimental coreference pipeline (`en_coreference_web_trf`)

In [8]:
!pip freeze > requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
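# my environment (fc)

# Python == 3.9.19
# spacy == 3.7.6 (Latest)
# spacy-transformers == 1.3.5
# NOTE(review): the `spacy validate` output further down marks models built for
# spaCy >=3.3.0,<3.5.0 as compatible, which is inconsistent with spacy == 3.7.6 --
# confirm which versions are actually installed in the kernel's environment.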

In [7]:
!pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl#egg=en_coreference_web_trf

Collecting en-coreference-web-trf==3.4.0a0
  Downloading https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl (490.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m490.3/490.3 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m


In [8]:
!pip install spacy-experimental==0.6.0

Collecting spacy-experimental==0.6.0
  Downloading spacy_experimental-0.6.0-cp39-cp39-macosx_10_9_x86_64.whl.metadata (12 kB)
Downloading spacy_experimental-0.6.0-cp39-cp39-macosx_10_9_x86_64.whl (740 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m740.8/740.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy-experimental
  Attempting uninstall: spacy-experimental
    Found existing installation: spacy-experimental 0.6.4
    Uninstalling spacy-experimental-0.6.4:
      Successfully uninstalled spacy-experimental-0.6.4
Successfully installed spacy-experimental-0.6.0


In [13]:
!pip install spacy-transformers



In [1]:
import spacy
import spacy_experimental
from spacy import displacy
from wasabi import msg

In [2]:
nlp = spacy.load("en_coreference_web_trf")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/Users/minjoo/opt/anaconda3/envs/ner/lib/python3.9/site-packages/spacy[0m

NAME                     SPACY            VERSION                              
en_core_web_sm           >=3.4.0,<3.5.0   [38;5;2m3.4.1[0m     [38;5;2m✔[0m
en_coreference_web_trf   >=3.3.0,<3.5.0   [38;5;2m3.4.0a0[0m   [38;5;2m✔[0m



In [4]:
# Input text for coreference resolution
text3 = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."

In [5]:
# Process the text with SpaCy
doc = nlp(text3)

In [6]:
# List every span group stored on the Doc; the coreference clusters appear
# under keys such as "coref_clusters_1" -> Coreference Resolution OK
msg.info("Found clusters")
for group_name, mentions in doc.spans.items():
    print(f"{group_name}: {mentions}")

[38;5;4mℹ Found clusters[0m
coref_clusters_1: [Google, the company]
coref_clusters_2: [Sebastian Thrun, him]


In [6]:
# spaCy does not ship a ready-made function for substituting coreferent mentions,
# so we define a lightweight helper that resolves the references in the text
def resolve_references(doc) -> str:
    """
    Replace every later mention of a coreference cluster with the text of the
    cluster's first mention.

    doc (Doc): The Doc object processed by the coref pipeline. Clusters are
        read from every `doc.spans` entry whose key starts with "coref_cluster".
    RETURNS (str): The Doc string with resolved references
    """
    # token.idx -> text to emit in place of that token (trailing whitespace included)
    token_mention_mapper = {}
    clusters = [
        val for key, val in doc.spans.items() if key.startswith("coref_cluster")
    ]

    for cluster in clusters:
        mentions = list(cluster)
        # An empty cluster has no first mention to substitute with
        if not mentions:
            continue
        first_mention = mentions[0]
        for mention_span in mentions[1:]:
            # An empty span covers no tokens, so there is nothing to replace
            if len(mention_span) == 0:
                continue
            # The replacement is emitted at the mention's first token, but it must be
            # followed by the whitespace that followed the mention's LAST token;
            # otherwise "the company." would turn into "Google ." with a stray space.
            token_mention_mapper[mention_span[0].idx] = (
                first_mention.text + mention_span[-1].whitespace_
            )
            # The remaining tokens of the mention are dropped from the output
            for token in mention_span[1:]:
                token_mention_mapper[token.idx] = ""

    # Tokens without a mapping keep their original text and whitespace
    return "".join(
        token_mention_mapper[token.idx]
        if token.idx in token_mention_mapper
        else token.text + token.whitespace_
        for token in doc
    )

In [9]:
# Print the input text with every later mention replaced by its cluster's first mention.
# NOTE(review): the saved output below ends in "outside of Google in Sebastian Thrun started ."
# whereas the clusters printed earlier ([Google, the company], [Sebastian Thrun, him])
# would give "outside of Google took Sebastian Thrun seriously." -- the output was
# presumably saved from a different session; re-run on a fresh kernel and confirm.
msg.info("Document with resolved references --> Looks Great!")
print(resolve_references(doc))

[38;5;4mℹ Document with resolved references --> Looks Great![0m
When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of Google in Sebastian Thrun started .


In [10]:
# Enumerate the components of the loaded pipeline. No NER tagger is listed,
# so this pipeline cannot be used directly for Named Entity Recognition.
msg.info("Pipeline components")
component_names = nlp.pipe_names
for position, component_name in enumerate(component_names):
    print(f"{position}: {component_name}")

[38;5;4mℹ Pipeline components[0m
0: sentencizer
1: transformer
2: coref
3: span_resolver
4: span_cleaner


In [12]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
# Load the small general-purpose English pipeline so its components can be listed.
# NOTE(review): the saved output is an E050 "Can't find model" error. This cell is
# numbered In [11] while the cell downloading en_core_web_sm is In [12], so it was
# presumably executed before the download -- re-run in order on a fresh kernel.
test_model = spacy.load('en_core_web_sm')

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [23]:
# List the components of the basic model for comparison with the coref pipeline
msg.info("if we check the components from the pipeline, we can see there's no transformer and coref in the basic model")
basic_component_names = test_model.pipe_names
for position, component_name in enumerate(basic_component_names):
    print(f"{position}: {component_name}")

[38;5;4mℹ if we check the components from the pipeline, we can see there's no
transformer and coref in the basic model[0m
0: tok2vec
1: tagger
2: parser
3: attribute_ruler
4: lemmatizer
5: ner


In [14]:
# So we are going to combine two different models

# This base model (nlp) can be anything from Spacy
!python -m spacy download en_core_web_md
# nlp = spacy.load("en_core_web_trf")
nlp = spacy.load("en_core_web_md")
# nlp = spacy.load("en_core_web_sm")

nlp_coref = spacy.load("en_coreference_web_trf")

# use replace_listeners for the coref components
nlp_coref.replace_listeners("transformer", "coref", ["model.tok2vec"])
nlp_coref.replace_listeners("transformer", "span_resolver", ["model.tok2vec"])

# we won't copy over the span cleaner
nlp.add_pipe("coref", source=nlp_coref)
nlp.add_pipe("span_resolver", source=nlp_coref)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')




<spacy_experimental.coref.span_resolver_component.SpanResolver at 0x7f892900fa60>

In [28]:
doc2= nlp(text3)

In [29]:
# List the components of the combined pipeline:
# "ner" is now present (index 5), followed by the sourced coref components.
msg.info("Pipeline components : Now we have NER on the 5th!")
combined_component_names = nlp.pipe_names
for position, component_name in enumerate(combined_component_names):
    print(f"{position}: {component_name}")

[38;5;4mℹ Pipeline components : Now we have NER on the 5th![0m
0: tok2vec
1: tagger
2: parser
3: attribute_ruler
4: lemmatizer
5: ner
6: coref
7: span_resolver


In [30]:
# Print the resolved text produced from the combined pipeline's Doc.
# NOTE(review): the saved output below ends in "outside of Google in Sebastian Thrun started ."
# rather than replacing only "the company" and "him" in the input sentence, so it does
# not look correct -- re-run on a fresh kernel and confirm before relying on it.
msg.info("Document with resolved references --> Still looking good with the new model")
print(resolve_references(doc2))

[38;5;4mℹ Document with resolved references --> Still looking good with the new
model[0m
When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of Google in Sebastian Thrun started .


In [31]:
resolved_doc = nlp(resolve_references(doc2))

In [33]:
msg.info("NER looking good too!")
displacy.render(resolved_doc, style="ent")

[38;5;4mℹ NER looking good too![0m
