**In the approach, we would like to use the Matcher function as a starter.**


In [38]:
# one of the tools that suggested by Spacy is to get wordnet, so that we can retrieve synonyms of "say"-type verb

# suggest by https://pypi.org/project/spacy-wordnet/

!pip install spacy-wordnet





In [39]:
import nltk
>>> nltk.download('wordnet')

import spacy
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hanzhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [40]:
nlp = spacy.load('en_core_web_sm')

In [41]:


# spacy 3.X

nlp.add_pipe("spacy_wordnet", after='tagger') 

# spacy 2.X

nlp.add_pipe(WordnetAnnotator(nlp, name="spacy_wordnet"), after='tagger')

wordnet_annotator = WordnetAnnotator()
nlp.add_pipe(wordnet_annotator, after='tagger')

# the website suggested the two models and we implemented them

ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <spacy_wordnet.wordnet_annotator.WordnetAnnotator object at 0x2a3ee8a90> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

In [42]:
# define the variable, we speculate that if we put more "quoting verbs" in the dict, it is going to create a bigger list
token = nlp('say')[0]


In [43]:
# The "extract" function: system sets and lemmas
token._.wordnet.synsets()
token._.wordnet.lemmas()



[Lemma('say.n.01.say'),
 Lemma('state.v.01.state'),
 Lemma('state.v.01.say'),
 Lemma('state.v.01.tell'),
 Lemma('allege.v.01.allege'),
 Lemma('allege.v.01.aver'),
 Lemma('allege.v.01.say'),
 Lemma('suppose.v.01.suppose'),
 Lemma('suppose.v.01.say'),
 Lemma('read.v.02.read'),
 Lemma('read.v.02.say'),
 Lemma('order.v.01.order'),
 Lemma('order.v.01.tell'),
 Lemma('order.v.01.enjoin'),
 Lemma('order.v.01.say'),
 Lemma('pronounce.v.01.pronounce'),
 Lemma('pronounce.v.01.articulate'),
 Lemma('pronounce.v.01.enounce'),
 Lemma('pronounce.v.01.sound_out'),
 Lemma('pronounce.v.01.enunciate'),
 Lemma('pronounce.v.01.say'),
 Lemma('say.v.07.say'),
 Lemma('say.v.08.say'),
 Lemma('say.v.09.say'),
 Lemma('say.v.10.say'),
 Lemma('say.v.11.say')]

In [44]:
token._.wordnet.wordnet_domains()
# to check which domain might fit better for finding synonyms

['literature',
 'book_keeping',
 'pedagogy',
 'factotum',
 'psychiatry',
 'person',
 'sociology',
 'philosophy',
 'publishing',
 'psychological_features',
 'play',
 'linguistics',
 'law',
 'commerce',
 'theatre',
 'theology',
 'philology',
 'psychology',
 'enterprise',
 'mathematics',
 'religion',
 'pure_science',
 'economy',
 'art',
 'tax',
 'quality',
 'telecommunication',
 'grammar',
 'roman_catholic',
 'literature',
 'pedagogy',
 'astronomy',
 'pharmacy',
 'heraldry',
 'politics',
 'philosophy',
 'psychological_features',
 'mythology',
 'school',
 'psychoanalysis',
 'number',
 'post',
 'law',
 'ethnology',
 'theology',
 'psychology',
 'religion',
 'archaeology',
 'paleontology',
 'paranormal',
 'history',
 'occultism',
 'roman_catholic',
 'hunting',
 'artisanship',
 'publishing',
 'music',
 'grammar',
 'acoustics',
 'linguistics',
 'anatomy',
 'mechanics',
 'roman_catholic',
 'roman_catholic',
 'mechanics']

In [45]:
# we selected the "linguistics" and "grammar" domains, we speculate that if we choose more domains, we will get more results
say_related_domains = ['linguistics', 'grammar']
enriched_sentence = []
sentence = nlp('She says that')

# For each token in the sentence
for token in sentence:
    # We get those synsets within the desired domains
    synsets = token._.wordnet.wordnet_synsets_for_domain(say_related_domains)
    if not synsets:
        enriched_sentence.append(token.text)
    else:
        lemmas_for_synset = [lemma for s in synsets for lemma in s.lemma_names()]
        # If we found a synset in the say_related_domains
        # we get the variants and add them to the enriched sentence
        enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset))))

# Let's see our enriched sentence
print(' '.join(enriched_sentence))

# the two domains only gave us these "quoting verbs" (there are more synonyms that are not listed)

She (tell|sound_out|say|pronounce|enounce|enunciate|state|articulate) that


**It seems that in spacy wordnet, the available synonyms are quite limited, thus we might want to create a Indir_speech_marker_list manully**


In [46]:
# The list only consist of some of the quoting verbs as it is manually made
Indir_speech_marker_list = ["say", "state", "announce", "sound_out" ,"enunciate", "tell", "pronounce", "articulate", "claim", "demonstrate", "mention", "according", "state", "suggest", "find", "note", "cite", "express", "imply", "describe", "acknowledge", "address", "urge", "refer"]

# Process the list of words with Spacy
IS_token_list = [token for word in Indir_speech_marker_list for token in nlp(word)]
print(IS_token_list)

[say, state, announce, sound_out, enunciate, tell, pronounce, articulate, claim, demonstrate, mention, according, state, suggest, find, note, cite, express, imply, describe, acknowledge, address, urge, refer]


In [47]:
#Create a Matcher

from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

#open one text file and try
with open ("A1_data/5c1548a31e67d78e2771624f.txt", "r", encoding='utf-8') as f: 
        text = f.read()



In [48]:
# we try to create a pattern which extract all the sentences that satisfy the following pattern
#   A "messager"--marked as ENT, someone who reiterate the information
# + A "say-type" word within a list of synonyms in the listed we made above


pattern = [{"POS": "PROPN", "OP": "*"}, # looking for a messager
          {"POS": "VERB", "LEMMA":{"IN": Indir_speech_marker_list}}, # find the lemma form of the verb that fits in the list
          {"IS_ALPHA":True, "OP": "*"},
          {"ORTH":{"IN": ["\.","\!", "\?"]}} # until we get to the sentence boundary
           ]
matcher.add ("PROPER_NOUNS", [pattern], greedy = 'LONGEST')


#Matcher start from the entity, pronoun identifier     

In [49]:
# seperate direct and indirect


doc = nlp(text)
matches = matcher(doc)
matches.sort(key = lambda x: [1])
matcher.add ("Indir", [pattern], greedy = 'LONGEST')


print(len(matches))
for match_id, start, end in matches:
    print(doc[start:end])

#    print(len(matches))
#    for match in matches:
   # print(match, doc[match[1]:match[2]])
    
    # oh no this is not working gg :( oops

0


In [50]:
matcher.add("Indir", [pattern])
doc = nlp(text)
matches_IS = matcher(doc)
print(len(Indir))

NameError: name 'Indir' is not defined