https://applied-language-technology.readthedocs.io/en/latest/notebooks/part_iii/02_pattern_matching.html

In [12]:
# Fetch the summary of the "Donald Trump" page as sample input text.
# NOTE(review): 'simple' presumably selects the Simple English Wikipedia — confirm.
# NOTE(review): `text` is reassigned to a hard-coded article in the next cell,
# so this value is only used if that cell is skipped.
import wikipedia
wikipedia.set_lang('simple')
dt = wikipedia.page("Donald Trump")
text = dt.summary

In [13]:
text = """
Powerful, low-frequency sound waves could be used to trigger rainfall in areas that suffer from drought, according to a study by researchers at Tsinghua University in Beijing.

In a weather manipulation experiment conducted on the Tibetan Plateau last year, the researchers, led by Professor Wang Guangqian from the university’s State Key Laboratory of Hydro-science and Engineering, said they recorded increases in rainfall of up to 17 per cent by pointing a giant loudspeaker at the sky.

The sound energy might have changed cloud physics, but the cause would require further investigation, researchers said in a peer-reviewed paper published in Scientia Sinica Technologica last week.

Unlike other rainmaking technologies, sound generation produced no chemical pollution and required no “airborne vehicles such as aircraft or rockets”, Wang said. “And there is the possibility of remote control with low cost.”

The experiment is likely to add fuel to the long-running debate in China on the feasibility and environmental impact of large-scale weather modification programmes.

Critics have accused Wang, who proposed the controversial Sky River project to increase rainfall across Tibet by intercepting wet air circulating over the plateau, of wasting taxpayers’ money. Others say that even if the sound stimulation method works, it would create noise pollution for the people and animals that live in the area.

In the study, the rainfall was 11 to 17 per cent higher in areas within the device’s effective range – a radius of about 500 metres from the sound generator – than outside it.

Despite the findings, a researcher from the Institute of Atmospheric Physics under the Chinese Academy of Sciences in Beijing who asked not to be named said Wang’s two-hour experiment would have to be replicated many times to gather more data.

While there has long been speculation that rainfall might be linked to sound – many civilisations perform rain dances in times of drought – the person said there were no physical theories to support the idea.

“The subject remains more of myth than science,” he said.
"""

In [15]:
# Build the fill-in-the-blank exercise from `text` and show it.
# present_perfect=True restates the function's default; the other tense flags keep their defaults.
# tense_exercise itself also prints the match count and one line per match.
processed_text = tense_exercise(text, present_perfect=True)
print(processed_text)

Number of blanks:  24
present_tense 17 18 suffer
past_tense 69 70 said
past_tense 71 72 recorded
present_perfect 95 97 have changed
past_tense 109 110 said
past_tense 132 133 produced
past_tense 137 138 required
past_tense 150 151 said
present_tense 155 156 is
present_tense 169 170 is
present_perfect 198 200 have accused
past_tense 203 204 proposed
present_tense 230 231 say
present_tense 238 239 works
present_tense 251 252 live
past_tense 263 264 was
past_tense 316 317 asked
past_tense 321 322 said
present_perfect 343 346 has long been
present_tense 357 358 perform
past_tense 367 368 said
past_tense 369 370 were
present_tense 382 383 remains
past_tense 391 392 said
Powerful, low - frequency sound waves could be used to trigger rainfall in areas that ________________ (suffer) from drought, according to a study by researchers at Tsinghua University in Beijing. 

 In a weather manipulation experiment conducted on the Tibetan Plateau last year, the researchers, led by Professor Wang Guangq

In [2]:
def get_greedy_matches(matches):
    """Resolve overlapping matches, keeping exactly one match per overlap group.

    Parameters
    ----------
    matches : list of (match_id, start, end) tuples
        `start`/`end` are token offsets forming a half-open span, i.e. the
        span is what `doc[start:end]` would slice, so `end` is exclusive.

    Returns
    -------
    list of (match_id, start, end) tuples
        One original match per group of overlapping spans, ordered by start
        offset. For each group the match that covers the whole group is
        preferred (the first such match in input order when several patterns
        matched the same span); if no single match covers the group, the
        longest member is kept instead of the group being dropped.
    """
    # sorted() is stable, so matches sharing a start offset keep input order.
    ordered = sorted(matches, key=lambda match: match[1])

    # Each group is [group_start, group_end, member_matches].
    groups = []
    for match in ordered:
        start, end = match[1], match[2]
        # Spans are half-open: a span that merely touches the previous group
        # (start == group_end) shares no token with it and starts a new group.
        if groups and start < groups[-1][1]:
            group = groups[-1]
            group[1] = max(group[1], end)
            group[2].append(match)
        else:
            groups.append([start, end, [match]])

    new_matches = []
    for group_start, group_end, members in groups:
        # Prefer the match spanning the entire group.
        chosen = next(
            (m for m in members if m[1] == group_start and m[2] == group_end),
            None,
        )
        if chosen is None:
            # Partial overlaps only (e.g. [0, 2) and [1, 4)): keep the longest
            # member; max() returns the earliest one on ties.
            chosen = max(members, key=lambda m: m[2] - m[1])
        new_matches.append(chosen)

    return new_matches

def untokenize(words):
    """
    Join a list of tokens back into readable text.

    The tokens are joined with single spaces and then a fixed sequence of
    literal and regex substitutions re-attaches quotes, brackets,
    punctuation and contractions to their neighbouring words. The result
    is stripped of leading and trailing whitespace.
    """
    # Literal fix-ups applied before the punctuation regexes, in order.
    before_punctuation = (
        ("`` ", '"'),
        (" ''", '"'),
        ('. . .', '...'),
        (" ( ", " ("),
        (" ) ", ") "),
    )
    # Literal fix-ups applied after the punctuation regexes, in order.
    after_punctuation = (
        (" '", "'"),
        (" n't", "n't"),
        ("can not", "cannot"),
        (" ` ", " '"),
    )

    result = ' '.join(words)
    for old, new in before_punctuation:
        result = result.replace(old, new)

    # Drop the space before punctuation followed by a space or quote ...
    result = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", result)
    # ... and before punctuation at the very end of the text.
    result = re.sub(r' ([.,:;?!%]+)$', r"\1", result)

    for old, new in after_punctuation:
        result = result.replace(old, new)

    return result.strip()

In [None]:
'''
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
Tenses: 
Present: VBP/VBZ
Present continuous: am/is/are/'m/'s/'re + (RB) + VBG
Past: VBD
Past continuous: was/were + (RB) + VBG
Future: will/wo/shall + (RB) + VB
Future II: am/is/are/'m/'s/'re + (RB) + going + to + VB
Future continuous: will/wo/'ll + (RB) + be + (RB) + VBG
Present perfect: have/has/'ve + (RB) + VBN
Present perfect continuous: have/has + (RB) + been + (RB) + VBG
Past perfect: had + (RB) + VBN
Past perfect continuous: had + (RB) + been + (RB) + VBG
Future perfect: will/wo/'ll + (RB) + have + (RB) + VBN
Future perfect continuous: will/wo + (RB) + have + (RB) + been + (RB) + VBG
Modal verbs: can/could/may/might/should/must/would/'d + (RB) + VB + (RB) + V(any)
(Actually include future tenses)
(*need to handle tenses myself)
'''

In [3]:
import spacy
from spacy.matcher import Matcher
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer
# from nltk import word_tokenize

def tense_exercise(text,present_tense=True,past_tense=True,future_tense=True,
                  present_continuous=True,past_continuous=True,future_continuous=False,
                  present_perfect=True,past_perfect=False,future_perfect=False,
                  present_perfect_continuous=False,past_perfect_continuous=False,
                  future_perfect_continuous=False):
    """Turn `text` into a fill-in-the-blank verb tense exercise.

    Parenthesised passages are stripped from `text`, the remainder is parsed
    with the spaCy `en_core_web_sm` pipeline, and verb groups are located with
    token patterns (one pattern set per tense). In every retained match the
    verb, auxiliary and particle tokens are replaced by a blank, and the last
    token of the match becomes a blank followed by a hint in parentheses: the
    lemma of the verb ("be" when the match contains no token tagged VERB),
    prefixed with "not " when the match contains a negation.

    Parameters
    ----------
    text : str
        Source text.
    present_tense, past_tense, ..., future_perfect_continuous : bool
        One flag per tense; matches of a tense whose flag is False are left
        untouched in the output.

    Returns
    -------
    str
        The detokenized text with blanks and hints inserted. When nothing
        matches, the detokenized text is returned without blanks.

    Side effects
    ------------
    Prints the number of blanks and one line per retained match
    (tense name, start token, end token, matched text).
    """
    tense_bool_dict = {"present_tense":present_tense,
                       "past_tense":past_tense,
                       "future_tense":future_tense,
                       "present_continuous":present_continuous,
                       "past_continuous":past_continuous,
                       "future_continuous":future_continuous,
                       "present_perfect":present_perfect,
                       "past_perfect":past_perfect,
                       "future_perfect":future_perfect,
                       "present_perfect_continuous":present_perfect_continuous,
                       "past_perfect_continuous":past_perfect_continuous,
                       "future_perfect_continuous":future_perfect_continuous}

    # Remove parentheses and the contents contained in them, so that the
    # only "(...)" left in the output are the hints added below.
    text = re.sub(r'\([^)]*\)', '', text)
    nlp = spacy.load("en_core_web_sm")

    # Word lists and reusable pattern fragments.
    not_list = ["not", "n't"]
    is_list = ["is", "am", "are", "'s", "'re", "ain't"]
    is_list_q = ["is", "am", "are"]
    do_list = ["do", "does"]
    was_list = ["was", "were"]
    will_list = ["will", "wo", "shall"]
    have_list = ["have", "'ve", "has", "'s"]
    adv_dict = {"TAG":"RB","OP":"*"}                 # any number of adverbs
    noun_dict = {"TAG":{"IN":["PRP","NN","NNP"]}}    # subject in a question

    blank = '_'*16

    matcher = Matcher(nlp.vocab)
    present_tense_pattern = [
        [{"TAG":{"IN":["VBP","VBZ"]}},adv_dict,{"TAG":"VB", "OP":"*"}],
        [{"LOWER":{"IN":do_list}},adv_dict,noun_dict,adv_dict,{"TAG":"VB"}],
        [{"LOWER":{"IN":is_list_q}},adv_dict,noun_dict],
    ]
    past_tense_pattern = [
        [{"LOWER":"did"},adv_dict,{"TAG":"VB"}],
        [{"TAG":"VBD"}],
        [{"LOWER":{"IN":was_list+["did"]}},adv_dict,{"TAG":"PRP"},adv_dict],
    ]
    future_tense_pattern = [[{"LOWER":{"IN":will_list}},adv_dict,{"TAG":"VB"}]]
    present_cont_pattern = [[{"LOWER":{"IN":is_list}},adv_dict,{"TAG":"VBG"}]]
    past_cont_pattern = [[{"LOWER":{"IN":was_list}},adv_dict,{"TAG":"VBG"}]]
    future_cont_pattern = [[{"LOWER":{"IN":will_list}},adv_dict,
                            {"LOWER":"be"},adv_dict,{"TAG":"VBG"}]]
    present_perfect_pattern = [[{"LOWER":{"IN":have_list}},
                                adv_dict,{"TAG":"VBN"}]]
    past_perfect_pattern = [[{"LOWER":"had"},adv_dict,{"TAG":"VBN"}]]
    future_perfect_pattern = [[{"LOWER":{"IN":will_list}},adv_dict,
                               {"LOWER":{"IN":have_list}},adv_dict,{"TAG":"VBN"}]]
    present_perfect_cont_pattern = [[{"LOWER":{"IN":have_list}},adv_dict,
                                     {"LOWER":"been"},adv_dict,
                                     {"TAG":"VBG"}]]
    past_perfect_cont_pattern = [[{"LOWER":"had"},adv_dict,{"LOWER":"been"},
                                  adv_dict,{"TAG":"VBG"}]]
    future_perfect_cont_pattern = [[{"LOWER":{"IN":will_list}},adv_dict,{"LOWER":"have"},
                                    adv_dict,{"LOWER":"been"},adv_dict,
                                    {"TAG":"VBG"}]]

    doc = nlp(text)
    # Registration order is kept as-is: it can influence which tense label is
    # reported when several patterns match the very same span.
    matcher.add("present_tense", present_tense_pattern, greedy='LONGEST')
    matcher.add("past_tense", past_tense_pattern, greedy='LONGEST')
    matcher.add("future_tense", future_tense_pattern)
    matcher.add("present_continuous", present_cont_pattern)
    matcher.add("past_continuous", past_cont_pattern)
    matcher.add("future_continuous", future_cont_pattern)
    matcher.add("present_perfect", present_perfect_pattern)
    matcher.add("past_perfect", past_perfect_pattern)
    matcher.add("future_perfect", future_perfect_pattern)
    matcher.add("present_perfect_continuous", present_perfect_cont_pattern)
    matcher.add("past_perfect_continuous", past_perfect_cont_pattern)
    matcher.add("future_perfect_continuous", future_perfect_cont_pattern)

    matches = matcher(doc)
    matches.sort(key=lambda x:x[1])

    # Overlapping matches are resolved by get_greedy_matches.
    matches = get_greedy_matches(matches)

    # Keep only the tenses the caller enabled. A new list is built on purpose:
    # removing items from the list being iterated skips the element that
    # follows each removal, which let disabled tenses slip through.
    matches = [match for match in matches
               if tense_bool_dict[nlp.vocab.strings[match[0]]]]

    # Reported after filtering so the count equals the blanks actually made.
    print("Number of blanks: ", len(matches))

    underline_i = set()     # indices of tokens replaced by a plain blank
    bracket_lemmas = {}     # index of a match's last token -> hint text

    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        print(string_id, start, end, span.text)

        lemma = ''
        with_not = False
        has_verb = False  # tracked per match, not per token
        for token in span:
            if str(token) in not_list:
                with_not = True
            if token.pos_ == 'VERB':
                has_verb = True
                lemma = token.lemma_  # the last VERB in the span wins
                underline_i.add(token.i)
            elif token.pos_ in ('AUX', 'PART'):
                underline_i.add(token.i)
            # any other part of speech (adverbs, subjects, ...) stays visible

        last_i = end - 1  # index of the last token of the span
        if not has_verb:
            # Only auxiliaries matched ("is", "was", "has long been", ...).
            lemma = 'be'
            underline_i.add(last_i)

        hint = 'not ' + str(lemma) if with_not else str(lemma)
        # setdefault keeps the first hint should two matches end on one token.
        bracket_lemmas.setdefault(last_i, hint)

    tokenized = [str(token) for token in doc]
    for i in range(len(tokenized)):
        if i in bracket_lemmas:
            tokenized[i] = blank + '(' + bracket_lemmas[i] + ')'
        elif i in underline_i:
            tokenized[i] = blank

    processed_text = untokenize(tokenized)

    # Combine the neighboring blanks
    processed_text = re.sub(r'(' + blank + ' |' + blank + ')+', blank+' ', processed_text)
    return processed_text





In [57]:
# Scratch experiment: a present-tense verb (VBZ/VBP), optional adverbs, then a
# token constrained by {"POS": "VERB", "OP": "!"}.
# NOTE(review): `adv_dict` and `nlp` are not defined in this cell; in the visible
# notebook `adv_dict` only exists as a local inside tense_exercise, so this cell
# presumably relies on leftover kernel state — confirm or define them here.
pattern = [[{"TAG":{"IN":['VBZ','VBP']}},adv_dict,{"POS":"VERB","OP":"!"}]]

text = "I have not done yet."
doc = nlp(text)
matcher2 = Matcher(nlp.vocab)
matcher2.add("pattern",pattern)
# help(matcher2)
# as_spans=True makes the matcher return Span objects (see the help() output below)
matches = matcher2(doc, as_spans=True)
# Leftover exploration: prints the Span API help; raises IndexError if nothing matched.
help(matches[0])
# for match_id, start, end in matches:
#     string_id = nlp.vocab.strings[match_id]  # Get string representation
#     span = doc[start:end]  # The matched span
#     print(string_id, start, end, span.text)

Help on Span object:

class Span(builtins.object)
 |  A slice from a Doc object.
 |  
 |  DOCS: https://spacy.io/api/span
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getitem__(...)
 |      Get a `Token` or a `Span` object
 |      
 |      i (int or tuple): The index of the token within the span, or slice of
 |          the span to get.
 |      RETURNS (Token or Span): The token at `span[i]`.
 |      
 |      DOCS: https://spacy.io/api/span#getitem
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __hash__(self, /)
 |      Return hash(self).
 |  
 |  __iter__(...)
 |      Iterate over `Token` objects.
 |      
 |      YIELDS (Token): A `Token` object.
 |      
 |      DOCS: https://spacy.io/api/span#iter
 |  
 |  __le__(self, value, /)
 |      Return self<=value.
 |  
 |  __len__(...)
 |      Get the number of tokens in the span.
 |      
 |      RETURN

In [22]:
# Inspect how the pipeline analyses each token of a short sample sentence.
nlp=spacy.load("en_core_web_sm")
doc = nlp("I take part in it.")

for token in doc:
    # Collect the token text, coarse POS, fine-grained tag, dependency label,
    # morphological features and lemma
    token_text = token.text
    token_pos = token.pos_
    token_tag = token.tag_
    token_dep = token.dep_
    token_morph = token.morph
    token_lemma = token.lemma_
    # Left-aligned fixed-width columns (12 characters for the text, 10 for the rest)
    print(f"{token_text:<12}{token_pos:<10}{token_tag:<10}{token_dep:<10}{token_lemma:<10}")
    # Morphological features go on their own line below the row
    print(token_morph)

I           PRON      PRP       nsubj     I         
Case=Nom|Number=Sing|Person=1|PronType=Prs
take        VERB      VBP       ROOT      take      
Tense=Pres|VerbForm=Fin
part        NOUN      NN        dobj      part      
Number=Sing
in          ADP       IN        prep      in        

it          PRON      PRP       pobj      it        
Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs
.           PUNCT     .         punct     .         
PunctType=Peri


In [105]:
# Look up the description of the "PART" label, one of the POS values checked in tense_exercise.
spacy.explain("PART")

'particle'