In [74]:
import wikipedia
# Select the 'simple' language edition before requesting any page.
wikipedia.set_lang('simple')
# Fetch the "Donald Trump" page; its summary is kept in `text` as the sample
# passage that later cells display and feed to the exercise generators.
dt = wikipedia.page("Donald Trump")
text = dt.summary

In [23]:
# Display the raw summary string to inspect the sample input.
text

'Donald John Trump (born June 14, 1946) is an American politician who was the 45th President of the United States from 2017 to 2021. Before becoming president, he was a businessman and television personality.\nTrump was the chairman and president of The Trump Organization. Much of his money was made in real estate in New York City, Las Vegas, and Atlantic City. He used to own the Miss Universe pageant. He was the star in his own reality show The Apprentice. In October 2019, Trump changed his official residency state from New York to Florida.In June 2015, Trump announced that he would run for President of the United States in the 2016 presidential election, although most people did not expect him to win. Starting mid-July, polls showed that Trump was the front-runner in the Republican field, ahead of Ted Cruz. This was true even after much criticism from his party due to his comments on illegal immigration, Muslims, and ISIS. His campaign gained support from mostly middle-class and rura

In [13]:
# Importing pyinflect appears to be what provides the `._.inflect` token
# extension used below (nothing else references the module) — TODO confirm.
import pyinflect
# NOTE(review): `nlp` is only created in a cell that appears later in this file,
# so this cell depends on hidden kernel state and fails on a fresh kernel.
doc = nlp("slowly")
for token in doc:
    # Request the form tagged 'JJ' for each token; the recorded output for
    # "slowly" is "slowly".
    print(token._.inflect('JJ'))

slowly


In [1]:
import spacy
from spacy.matcher import Matcher
from nltk.tokenize.treebank import TreebankWordDetokenizer
import re
import random

# Load the spaCy pipeline once at module level; every function in this cell
# calls this shared `nlp` object instead of loading its own.
nlp=spacy.load("en_core_web_sm")

def fill_in_the_blanks_prepositions(text):
    """Print a fill-in-the-blanks exercise with every ``IN``-tagged token blanked.

    The passage is parsed with the module-level ``nlp`` pipeline. Every token
    whose fine-grained tag is ``IN`` (the Penn Treebank tag for prepositions
    and subordinating conjunctions) is replaced by a run of underscores, and
    the removed words are printed first, lower-cased and shuffled, as a word
    bank.

    Args:
        text: The passage to turn into an exercise.

    Returns:
        None. The word bank, a blank separator and the gapped passage are
        written to standard output.
    """
    blank = '__________'

    doc = nlp(text)
    matcher = Matcher(nlp.vocab)
    matcher.add("preposition", [[{"TAG": "IN"}]])
    matches = matcher(doc, as_spans=True)

    # Word bank: shuffled so its order does not give away the answers.
    prep_list = [span.text.lower() for span in matches]
    random.shuffle(prep_list)

    blank_positions = {span.start for span in matches}

    # Rebuild the passage from spaCy's own per-token trailing whitespace rather
    # than a Treebank detokenizer: spaCy splits hyphenated words and emits
    # newline tokens, which the detokenizer would re-join with stray spaces
    # ("front - runner"). Using `whitespace_` keeps the original spacing exactly.
    processed_text = "".join(
        (blank + token.whitespace_) if token.i in blank_positions else token.text_with_ws
        for token in doc
    )

    print(*prep_list, sep=' ')
    print('\n')
    print(processed_text)


def fill_in_the_blanks_question_words(text):
    """Print a fill-in-the-blanks exercise with question words blanked out.

    The passage is parsed with the module-level ``nlp`` pipeline. Tokens tagged
    ``WRB``, ``WDT`` or ``WP`` are candidates, but only those inside a sentence
    that ends with ``?`` are blanked, so wh-words in statements are left
    alone. The removed words are printed first, capitalised and shuffled, as a
    word bank.

    Args:
        text: The passage to turn into an exercise.

    Returns:
        None. The word bank, a blank separator and the gapped passage are
        written to standard output.
    """
    blank = '__________'

    doc = nlp(text)
    matcher = Matcher(nlp.vocab)
    matcher.add("question_word", [[{"TAG": {"IN": ["WRB", "WDT", "WP"]}}]])
    matches = matcher(doc, as_spans=True)

    # Token ranges (start inclusive, end exclusive) of sentences that are
    # questions. Trailing whitespace is stripped first because a sentence span
    # may end in a whitespace token (e.g. a newline) that would hide the '?'.
    question_intervals = [
        (sent.start, sent.end)
        for sent in doc.sents
        if sent.text.rstrip().endswith('?')
    ]

    # Keep only the wh-words that fall inside one of those question sentences.
    real_question_matches = [
        span
        for span in matches
        if any(start <= span.start and span.end <= end
               for start, end in question_intervals)
    ]

    # Word bank: shuffled so its order does not give away the answers.
    qword_list = [span.text.capitalize() for span in real_question_matches]
    random.shuffle(qword_list)

    blank_positions = {span.start for span in real_question_matches}

    # Rebuild the passage from spaCy's own per-token trailing whitespace rather
    # than a Treebank detokenizer, which would insert stray spaces around
    # hyphens and newline tokens that spaCy splits out.
    processed_text = "".join(
        (blank + token.whitespace_) if token.i in blank_positions else token.text_with_ws
        for token in doc
    )

    print(*qword_list, sep=' ')
    print('\n')
    print(processed_text)
    
def fill_in_the_blanks_adjectives(text, comparative=True, superlative=True, adverbs=True):
    """Print every adjective/adverb match found in ``text``.

    This function does not build an exercise yet: it only lists the matches,
    one per line, as ``<label> <start> <end> <matched text>`` where start/end
    are token offsets into the parsed document.

    Args:
        text: The passage to analyse.
        comparative: When true, also register the comparative patterns
            (``JJR``, or an ``RBR`` adverb followed by ``JJ``/``RB``).
        superlative: When true, also register the superlative patterns
            ("the" + ``JJS``, or an ``RBS`` adverb followed by ``JJ``/``RB``).
        adverbs: When true, also register the adverb patterns; the adverb
            comparative/superlative groups additionally require the
            corresponding flag above.

    Returns:
        None. Matches are written to standard output.

    Note:
        The plain "Adverb" pattern matches any token with POS ``ADV``, so it
        still reports comparative/superlative adverbs even when those flags
        are off. NOTE(review): matches under different labels can overlap
        (``greedy="LONGEST"`` is passed per label) — confirm whether overlaps
        should be resolved before blanks are generated.
    """
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    adj_orig_pattern = [[{"TAG": "JJ"}]]
    adj_comp_pattern = [[{"TAG": "JJR"}],
                        [{"POS": "ADV", "TAG": "RBR"}, {"TAG": "JJ"}]]
    adj_sup_pattern = [[{"LOWER": "the"}, {"TAG": "JJS"}],
                       [{"POS": "ADV", "TAG": "RBS"}, {"TAG": "JJ"}]]
    adv_orig_pattern = [[{"POS": "ADV"}]]
    adv_comp_pattern = [[{"POS": "ADV", "TAG": "RBR"}],
                        [{"POS": "ADV", "TAG": "RBR"}, {"POS": "ADV", "TAG": "RB"}]]
    adv_sup_pattern = [[{"POS": "ADV", "TAG": "RBS"}],
                       [{"POS": "ADV", "TAG": "RBS"}, {"POS": "ADV", "TAG": "RB"}]]

    # (label, patterns, enabled), kept in the original registration order so
    # the default call (all flags true) registers exactly the same matchers.
    pattern_groups = [
        ("Adjective", adj_orig_pattern, True),
        ("Adjective (comparative)", adj_comp_pattern, comparative),
        ("Adjective (superlative)", adj_sup_pattern, superlative),
        ("Adverb", adv_orig_pattern, adverbs),
        ("Adverb (comparative)", adv_comp_pattern, adverbs and comparative),
        ("Adverb (superlative)", adv_sup_pattern, adverbs and superlative),
    ]
    for label, patterns, enabled in pattern_groups:
        if enabled:
            matcher.add(label, patterns, greedy="LONGEST")

    for match_id, start, end in matcher(doc):
        string_id = nlp.vocab.strings[match_id]  # hash -> label string
        span = doc[start:end]  # the matched tokens
        print(string_id, start, end, span.text)

#     matches = matcher(doc, as_spans=True)

    


In [81]:
# Run on the Wikipedia summary held in `text`: the recorded output below
# (token 12 "American", token 17 "45th", ...) is the match list for that
# passage, whereas an empty string would print nothing.
fill_in_the_blanks_adjectives(text)

Adjective 12 13 American
Adjective 17 18 45th
Adjective 52 53 Much
Adjective 59 60 real
Adjective 88 89 own
Adjective 101 102 official
Adjective 129 130 presidential
Adjective 153 154 front
Adjective 158 159 Republican
Adjective 168 169 true
Adjective 171 172 much
Adjective 181 182 illegal
Adjective 195 196 middle
Adjective 199 200 rural
Adjective 232 233 presumptive
Adjective 236 237 only
Adjective 268 269 close
Adjective 271 272 former
Adjective 283 284 electoral
Adjective 301 302 45th
Adjective 312 313 old
Adjective 381 382 public
Adjective 396 397 criminal
Adjective 424 425 third
Adjective 427 428 American
Adjective 453 454 former
Adjective 466 467 first
Adjective 501 502 large
Adjective 544 545 few
Adjective 575 576 second
Adjective 580 581 Trump
Adjective 582 583 only
Adjective 598 599 guilty
Adverb 161 162 ahead
Adverb 169 170 even
Adverb 194 195 mostly
Adverb 319 320 ever
Adverb 338 339 later
Adverb 483 484 However
Adverb 486 487 still
Adverb 506 507 also
Adverb 519 520 controv

In [64]:
# Sanity check for the question detector: record the token range
# [sent.start, sent.end] of every sentence whose text ends with '?'.
doc = nlp("I am from Hong Kong. I am not in Germany. Are you from Germany? Yes I am!")
questions_intervals = []
for sent in doc.sents:
    if not str(sent).endswith('?'):
        continue  # statement or exclamation: not a question
    questions_intervals.append([sent.start, sent.end])

questions_intervals


[[12, 17]]