In [1]:
from lm_utils import *
from generation_utils import *
import pandas as  pd

In [2]:
# Load the language model and its vocabulary once for the whole notebook.
# `load_lm` is not defined in this notebook, so it must come from one of the
# star imports above (presumably lm_utils — confirm).
lm, lm_vocabulary = load_lm()

In [3]:
# Read the RRC sentence table and keep only its first five rows.
rrc_data = pd.read_csv('../data/sentences_RRC.csv').head(5)

In [4]:
# Parallel lists: the i-th entry of `rrc_disambiguating` is the word at which
# the i-th sentence of `rrc_sentences` is split further down.
rrc_sentences = rrc_data["RRC"].tolist()
rrc_disambiguating = rrc_data["disambiguating"].tolist()

In [5]:
import spacy

# Shared spaCy pipeline for the notebook; `get_sentences` reads this global to
# split generated text into sentences.
nlp = spacy.load("en_core_web_sm")

In [17]:
# Run the spaCy pipeline on one example sentence and print, per token, its
# text, its `tag_` and its dependency label `dep_`.
sent = "Even though the girl phoned the instructor was very upset with her for missing a lesson. "
doc = nlp(sent)

for tok in doc:
    print(f"{tok.text} {tok.tag_} {tok.dep_}")

Even RB advmod
though IN mark
the DT det
girl NN nsubj
phoned VBD advcl
the DT det
instructor NN nsubj
was VBD ROOT
very RB advmod
upset JJ acomp
with IN prep
her PRP pobj
for IN prep
missing VBG pcomp
a DT det
lesson NN dobj
. . punct


In [45]:
def get_sentences(sent_portion, lm, lm_vocabulary):
    """Sample continuations of a sentence prefix and return their distinct first sentences.

    The prefix is encoded with ``lm_vocabulary``, given a leading dimension of
    size 1, and passed to ``generate`` with fixed sampling settings. Every
    returned sequence is decoded back to text and reduced to the first sentence
    found by the module-level spaCy pipeline ``nlp``.

    Args:
        sent_portion: Text of the sentence prefix to continue.
        lm: Language model object, forwarded unchanged to ``generate``.
        lm_vocabulary: Object exposing ``encode(text)`` and ``decode(sequence)``.

    Returns:
        A list of distinct sentence strings in order of first appearance,
        holding at most 11 entries. Sequences in which spaCy finds no sentence
        (for example an empty decoded string) are skipped instead of raising
        ``IndexError``.
    """
    max_sentences = 11  # upper bound on the number of sentences returned

    prefix_ids = torch.LongTensor(lm_vocabulary.encode(sent_portion)).unsqueeze(0)
    # The keyword settings below are passed straight through to `generate`
    # (defined outside this notebook); their exact effect is determined there.
    generated = generate(prefix_ids, lm, lm_vocabulary, do_sample=True,
                         repetition_penalty=2, num_return_sequences=10, temperature=2,
                         unknown_penalty=1000000, top_k=50, num_beams=5, max_length=20)

    sent_generated = []
    # Index-based access mirrors how the result of `generate` was consumed
    # originally; only `len()` and integer indexing are assumed of it.
    for i in range(len(generated)):
        decoded = lm_vocabulary.decode(generated[i])
        # Only the first sentence is needed, so pull a single item from the
        # sentence iterator; `None` signals that nothing was segmented.
        first_sentence = next(iter(nlp(decoded).sents), None)
        if first_sentence is None:
            continue
        text = first_sentence.text
        if text not in sent_generated:
            sent_generated.append(text)
    return sent_generated[:max_sentences]

# For every item, generate continuations of two prefixes of the sentence:
#   sent1 — the words before the disambiguating word
#   sent2 — the same prefix including the disambiguating word
# and collect them as (sent1, sent2) pairs.

# Id that the vocabulary assigns to the literal '<unk>'; it is the same for
# every item, so it is looked up once.
unk_id = lm_vocabulary.encode('<unk>')[0]

sentences_per_item = []
for sentence, split_word in zip(rrc_sentences, rrc_disambiguating):
    # Detach full stops so they become separate whitespace-delimited tokens.
    tokens = sentence.replace('.', ' .').split()

    # Items whose disambiguating word is not one of the tokens cannot be
    # split; skip them with a message rather than aborting the whole run.
    if split_word not in tokens:
        print(f"Skipping item: {split_word!r} not found in {sentence!r}")
        continue

    split_at = tokens.index(split_word)  # first occurrence
    sent_portion_1 = ' '.join(tokens[:split_at])
    sent_portion_2 = ' '.join(tokens[:split_at + 1])

    # Skip items whose longer prefix encodes to the '<unk>' id anywhere
    # (presumably out-of-vocabulary words — confirm against the vocabulary).
    # sent_portion_1 is a prefix of sent_portion_2, so one check covers both.
    if unk_id in lm_vocabulary.encode(sent_portion_2):
        continue

    sent1 = get_sentences(sent_portion_1, lm, lm_vocabulary)
    sent2 = get_sentences(sent_portion_2, lm, lm_vocabulary)
    print(sent1, sent2)
    sentences_per_item.append((sent1, sent2))

['The experienced waitress cooked the grilled chicken .', 'The experienced waitress cooked the grilled chicken sandwich at a breakfast set .', 'The experienced waitress cooked the grilled chicken as well .', 'The experienced waitress cooked the grilled chicken cake into her heart .', 'The experienced waitress cooked the grilled chicken sandwich to keep it safe overnight .', 'The experienced waitress cooked the grilled chicken dinner together .', 'The experienced waitress cooked the grilled chicken as soon as she had cooked .'] ['The experienced waitress cooked the grilled chicken sent to it every day for a week .', 'The experienced waitress cooked the grilled chicken sent to her head .', 'The experienced waitress cooked the grilled chicken sent to hospital .', 'The experienced waitress cooked the grilled chicken sent .', 'The experienced waitress cooked the grilled chicken sent on .', 'The experienced waitress cooked the grilled chicken sent during the ceremony .', 'The experienced wai

In [7]:
# Inspect the dependency parse of one example sentence: for each token print
# its text, dependency label, head text, head part of speech and children.
# The `nlp` pipeline loaded earlier in the notebook is reused here; loading
# "en_core_web_sm" a second time only repeated that work.
doc = nlp("The sleepy volunteers given the hot soup were prepared to pour food .")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
          list(token.children))

The det volunteers NOUN []
sleepy amod volunteers NOUN []
volunteers nsubj were AUX [The, sleepy, given]
given acl volunteers NOUN [soup]
the det soup NOUN []
hot amod soup NOUN []
soup dobj given VERB [the, hot]
were auxpass prepared ADJ [volunteers]
prepared ROOT prepared ADJ [were, pour, .]
to aux pour VERB []
pour xcomp prepared ADJ [to, food]
food dobj pour VERB []
. punct prepared ADJ []


In [55]:
import pickle as pkl

# Persist the generated (sent1, sent2) pairs. The context manager guarantees
# the file is flushed and closed even if pickling fails part-way.
with open('../data/generated.pkl', 'wb') as generated_file:
    pkl.dump(sentences_per_item, generated_file)