In [1]:
import spacy
from pathlib import Path
import pandas as pd
import numpy as np
from spacy import displacy
from functools import reduce

# Loading Writing Prompts Corpus

In [2]:
wp_path = "corpora/writingPrompts/"
wp_data = ["train", "test", "valid"]

wp_prompts = []
wp_stories = []
wp_tags = []
wp_masks = []
for name in wp_data: 
    wp_source_path = Path(wp_path) / f"{name}.wp_source"
    wp_target_path = Path(wp_path) / f"{name}.wp_target"

    with open(wp_source_path, 'r') as fd:
        wp_source = fd.readlines()

    for line in wp_source:
        wp_tags.append(line[:6])
        wp_prompts.append(line[7:])
    
    with open(wp_target_path, 'r') as fd:
        wp_target = fd.readlines()

    wp_stories.extend(wp_target)
    
    wp_masks.extend([name] * len(wp_source))

wp_masks_np = np.array(wp_masks)

In [3]:
# Later try to use spacy features for that.
# Undo the funky tokenization
replaces = ((' . ', '. '),
            (' .', '.'),
            (' ; ', '; '),
            (' : ', ': '),
            (' ! ', '! '),
            (' !', '!'),
            (' ? ', '? '),
            (' ?', '?'),
            (' *', '*'),
            (' , ', ', '),
            (' ’ ', '’'),
            (" 's", "'s"),
            (" n't", "n't"),
            (" N'T", "N'T"),
            (" 've", "'ve"),
            (" 'd", "'d"),
            ("`` ", "``"),
            (" ''", "''"),
            ("“ ", "“"),
            (" ”", "”"),
            (" ’re", "’re"),
            (" 're", "'re"),
            (" ’m", "’m"),            
            (" 'm", "'m"),
            ('<newline> ', '\n'),
            (' <newline>', '\n'),
            ('<newline>', '\n'),            
            (' \n', '\n'),
           )

wp_stories = [reduce(lambda a, kv: a.replace(*kv), replaces, story) for story in wp_stories]

In [4]:
story_len = [len(story) for story in wp_stories]

In [5]:
max_pos = story_len.index(max(story_len))
min_pos = story_len.index(min(story_len))
pick = story_len.index(800)

In [6]:
print(wp_stories[pick])

Samanta was in a dark coat: long, silky, and tight on womanly hips. My doorway was too drab for such a work of art. I was struck with want for a coat of paint. I swung my chair away.

“Fancy that, huh? I found you.” That ghost sang to my back. I didn’t turn.

“Lucky as usual. I told you, I don’t do that work nowadays.” It wasn’t fiction. I’d hung up that old coat, that old man. My guns, my cigars, that was history. Old gangs, young thugs, any and all could run this town. That optimistic young P.I. was kaput. I told Samantha so.

“What a buffoon.” Sam said, ruby lips all wrong in a smirk. “You can’t quit. Not any of us can quit.”

I didn’t say a word. Samantha slid away, but not without dropping a gift: familiar manila, that slanting, taunting handwriting in all black.*Do what you must.*





In [7]:
nlp = spacy.load("en_core_web_trf")

In [8]:
analysis = nlp.analyze_pipes(pretty=True)

[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   transformer       doc._.trf_data                                      False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

In [9]:
# Se precisar de um Tokenizer customizado...
#from spacy.attrs import ORTH, NORM
#from spacy.tokenizer import Tokenizer
#tokenizer = Tokenizer(nlp.vocab)
#case = [{ORTH: "do"}, {ORTH: "n't", NORM: "not"}]
#tokenizer.add_special_case("don't", case)

In [10]:
text = wp_stories[pick]

In [11]:
doc = nlp(text)

In [12]:
# for sent in doc.sents:
#     for token in sent:
#         print(f"{token.i} {token}", end=" ")
#     print()

In [13]:
sents = list(doc.sents)
chunks = list(doc.noun_chunks)

In [14]:
for token in sents[0]:
    print(token.i, token, token.lemma_, token.pos_, token.tag_, token.dep_)

0 Samanta Samanta PROPN NNP nsubj
1 was be AUX VBD ROOT
2 in in ADP IN prep
3 a a DET DT det
4 dark dark ADJ JJ amod
5 coat coat NOUN NN pobj
6 : : PUNCT : punct
7 long long ADJ JJ acomp
8 , , PUNCT , punct
9 silky silky ADJ JJ conj
10 , , PUNCT , punct
11 and and CCONJ CC cc
12 tight tight ADJ JJ conj
13 on on ADP IN prep
14 womanly womanly ADJ JJ amod
15 hips hip NOUN NNS pobj
16 . . PUNCT . punct


In [15]:
print(doc.text)

Samanta was in a dark coat: long, silky, and tight on womanly hips. My doorway was too drab for such a work of art. I was struck with want for a coat of paint. I swung my chair away.

“Fancy that, huh? I found you.” That ghost sang to my back. I didn’t turn.

“Lucky as usual. I told you, I don’t do that work nowadays.” It wasn’t fiction. I’d hung up that old coat, that old man. My guns, my cigars, that was history. Old gangs, young thugs, any and all could run this town. That optimistic young P.I. was kaput. I told Samantha so.

“What a buffoon.” Sam said, ruby lips all wrong in a smirk. “You can’t quit. Not any of us can quit.”

I didn’t say a word. Samantha slid away, but not without dropping a gift: familiar manila, that slanting, taunting handwriting in all black.*Do what you must.*





In [16]:
options = {"compact": True, "bg": "#09a3d5",
           "color": "white", "font": "Source Sans Pro"}
#displacy.render(sents, style="dep", options=options, jupyter=True)

In [17]:
fmt = "{:15} {:10} {:10} {:15} {}"
print(fmt.format("Text", "Root Text", "Root Dep", "Root Head Text", "Children of head"))
for chunk in chunks[:4]:
    print(fmt.format(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text, [child for child in chunk.root.head.children] ))

Text            Root Text  Root Dep   Root Head Text  Children of head
Samanta         Samanta    nsubj      was             [Samanta, in, :, long, .]
a dark coat     coat       pobj       in              [coat]
womanly hips    hips       pobj       on              [hips]
My doorway      doorway    nsubj      was             [doorway, drab, .]


In [20]:
trfdata = doc._.trf_data

In [27]:
trfdata.tensors[0].shape

(2, 150, 768)

In [28]:
trfdata.tensors[1].shape

(2, 768)

In [30]:
print(trfdata.tokens['input_texts'][0][:10])
print(trfdata.tokens['input_texts'][0][-10:])
print(trfdata.tokens['input_texts'][1][:10])

['<s>', 'Sam', 'anta', 'Ġwas', 'Ġin', 'Ġa', 'Ġdark', 'Ġcoat', ':', 'Ġlong']
['Ġyoung', 'Ġthugs', ',', 'Ġany', 'Ġand', 'Ġall', 'Ġcould', 'Ġrun', 'Ġthis', '</s>']
['<s>', 'hung', 'Ġup', 'Ġthat', 'Ġold', 'Ġcoat', ',', 'Ġthat', 'Ġold', 'Ġman']


In [19]:
doc[:10]

Samanta was in a dark coat: long, silky

In [40]:
trfdata.tokens['input_ids'][0][1]

tensor(21169)

In [137]:
# Show all entries that are like the given id
given_id = 14
token_locations = np.where(trfdata.wordpieces.input_ids[0] == given_id)[0]

In [138]:
tokens_on_doc = np.array(trfdata.wordpieces.strings[0])[token_locations]
tokens_on_doc

array(['Ġthat', 'Ġthat', 'Ġthat', 'Ġthat', 'Ġthat'], dtype='<U9')

In [142]:
len(trfdata.tokens['input_texts'][0])

150

In [83]:
print(tokens_on_doc[0])

Ċ


In [145]:
np.all(np.array(trfdata.wordpieces.strings) == np.array(trfdata.tokens['input_texts']))

True

In [147]:
np.all(trfdata.align.dataXd == trfdata.align.data[:,0])

True

In [160]:
aligns = trfdata.align.dataXd
lengths = trfdata.align.lengths
for pos,length in zip(aligns,lengths):
    print(doc[pos-1:pos+length-1], end=" ")

Samanta was was in a dark coat : long , silky, , and tight on womanly hips hips . My doorway was too drab for for such a work of art . I was struck with want for a coat of paint . I swung my chair away . 

“ “Fancy Fancy that that , huh ? I found you . ” That That ghost sang to my back . I did n’t turn.

 turn . 

“ “Lucky Lucky as as usual usual . I told you , I don’t do that n’t do that work nowadays .” ” It wasn’t fiction. n’t fiction . I’d hung I’d ’d hung hung up up that that old old coat coat, , that that old old man man. . My My guns My guns guns, , my my cigars cigars, , that that was was history history. . Old Old gangs said, gangs, , ruby , young ruby lips young thugs lips all thugs, all , wrong any in and all could run a all could run smirk could . run “ this You ca town. ca . n’t quit That quit. optimistic . young Not P.I. any was of kaput us can . can quit I quit told Samantha so. . Samantha ” so 

 . I 

 did “What n’t say What say a buffoon.” a buffoon word . . Samantha 

In [161]:
doc.text

'Samanta was in a dark coat: long, silky, and tight on womanly hips. My doorway was too drab for such a work of art. I was struck with want for a coat of paint. I swung my chair away.\n\n“Fancy that, huh? I found you.” That ghost sang to my back. I didn’t turn.\n\n“Lucky as usual. I told you, I don’t do that work nowadays.” It wasn’t fiction. I’d hung up that old coat, that old man. My guns, my cigars, that was history. Old gangs, young thugs, any and all could run this town. That optimistic young P.I. was kaput. I told Samantha so.\n\n“What a buffoon.” Sam said, ruby lips all wrong in a smirk. “You can’t quit. Not any of us can quit.”\n\nI didn’t say a word. Samantha slid away, but not without dropping a gift: familiar manila, that slanting, taunting handwriting in all black.*Do what you must.*\n\n\n'

In [151]:
trfdata.tensors[0].shape

(2, 150, 768)

In [166]:
np.all(trfdata.model_output['last_hidden_state'] == trfdata.tensors[0])

True

In [168]:
np.array(trfdata.wordpieces.strings).shape

(2, 150)

In [169]:
trfdata.tensors[0].shape

(2, 150, 768)

In [26]:
doc[sp.start:sp.end]

the chair

In [27]:
type(sents[0])

spacy.tokens.span.Span

In [28]:
# Extract chunks from the ith-sentence
ith = 1
sentence = sents[ith]
chunksents = []
for chunk in chunks:
    if chunk.start < sentence.start:
        continue
    if chunk.end > sentence.end:
        break
    else:
        print(sentence.start, chunk.start, chunk.end, sentence.end)
        chunksents.append(chunk)

print(sentence)
print(chunksents)

14 16 17 31
14 22 24 31
14 25 27 31
14 28 29 31
Four of them were mounted a foot below ceiling height, all screens showed employees working.
[them, ceiling height, all screens, employees]


In [30]:
sents[0]

He sat back in the chair, looking at the monitors behind me.

In [31]:
sents[1]

Four of them were mounted a foot below ceiling height, all screens showed employees working.

In [32]:
fmt = "{:10} {}"
for token in sents[1]:
    print(fmt.format(token.text, token.dep_))

Four       nsubjpass
of         prep
them       pobj
were       auxpass
mounted    ccomp
a          det
foot       npadvmod
below      prep
ceiling    compound
height     pobj
,          punct
all        det
screens    nsubj
showed     ROOT
employees  nsubj
working    ccomp
.          punct


In [33]:
for token in doc:
    if (token.i+1) < len(doc) and doc[token.i + 1].is_sent_start:
        print(token)

.
.
.
*
*
.
*
.
''
.
.
''
''
.
.
''
*
.
.
.
.
.
.
!
!
''
.
''
)
.
''
.
.
.
.
''
.
''
.
''
*
''
.
