In [1]:
from pytorch_transformers import BertTokenizer

In [2]:
exec(open("../deps/MASS/MASS-summarization/mass/bert_dictionary.py").read())

In [3]:
bpe = BertTokenizer.from_pretrained('bert-base-uncased')

In [112]:
wp_tokens = bpe.tokenize("The violent clashes that broke out between fans of football rivals Brondby and FC Copenhagen last month have now led to criminal charges against 44 men, Ritzau reported on Thursday. [SEP] And something else. [MASK]")

In [65]:
wp_tokens

['the',
 'violent',
 'clashes',
 'that',
 'broke',
 'out',
 'between',
 'fans',
 'of',
 'football',
 'rivals',
 'bro',
 '##nd',
 '##by',
 'and',
 'fc',
 'copenhagen',
 'last',
 'month',
 'have',
 'now',
 'led',
 'to',
 'criminal',
 'charges',
 'against',
 '44',
 'men',
 ',',
 'ri',
 '##tz',
 '##au',
 'reported',
 'on',
 'thursday',
 '.']

In [113]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The violent clashes that broke out between fans of football rivals Brondby and FC Copenhagen last month have now led to criminal charges against 44 men, Ritzau reported on Thursday. [SEP] And something else. [MASK]")

spacy_tokens = [token.text.lower() for token in doc]

spacy_tokens

['the',
 'violent',
 'clashes',
 'that',
 'broke',
 'out',
 'between',
 'fans',
 'of',
 'football',
 'rivals',
 'brondby',
 'and',
 'fc',
 'copenhagen',
 'last',
 'month',
 'have',
 'now',
 'led',
 'to',
 'criminal',
 'charges',
 'against',
 '44',
 'men',
 ',',
 'ritzau',
 'reported',
 'on',
 'thursday',
 '.',
 '[',
 'sep',
 ']',
 'and',
 'something',
 'else',
 '.',
 '[',
 'mask',
 ']']

In [64]:
nlp(' '.join([clean_wp_token(wp) for wp in wp_tokens]))

the violent clashes that broke out between fans of football rivals bro nd by and fc copenhagen last month have now led to criminal charges against 44 men , ri tz au reported on thursday .

In [102]:
def clean_wp_token(token):
    return token.replace("##", "", 1).strip()

def flatten_list(nested):
    """Flatten a nested list."""
    flat = []
    for x in nested:
        flat.extend(x)
    return flat

In [103]:
import numpy as np
def align_word_pieces(spacy_tokens, wp_tokens, retry=True):
    """Align tokens against word-piece tokens. The alignment is returned as a
    list of lists. If alignment[3] == [4, 5, 6], that means that spacy_tokens[3]
    aligns against 3 tokens: wp_tokens[4], wp_tokens[5] and wp_tokens[6].
    All spaCy tokens must align against at least one element of wp_tokens.
    """
    spacy_tokens = list(spacy_tokens)
    wp_tokens = list(wp_tokens)
    if not wp_tokens:
        return [[] for _ in spacy_tokens]
    elif not spacy_tokens:
        return []
    # Check alignment
    spacy_string = "".join(spacy_tokens).lower()
    wp_string = "".join(wp_tokens).lower()
    if not spacy_string and not wp_string:
        return None
    if spacy_string != wp_string:
        if retry:
            # Flag to control whether to apply a fallback strategy when we
            # don't align, of making more aggressive replacements. It's not
            # clear whether this will lead to better or worse results than the
            # ultimate fallback strategy, of calling the sub-tokenizer on the
            # spaCy tokens. Probably trying harder to get alignment is good:
            # the ultimate fallback actually *changes what wordpieces we
            # return*, so we get (potentially) different results out of the
            # transformer. The more aggressive alignment can only change how we
            # map those transformer features to tokens.
            spacy_tokens = [alpha_re.sub("", t) for t in spacy_tokens]
            wp_tokens = [alpha_re.sub("", t) for t in wp_tokens]
            spacy_string = "".join(spacy_tokens).lower()
            wp_string = "".join(wp_tokens).lower()
            if spacy_string == wp_string:
                return _align(spacy_tokens, wp_tokens)
        # If either we're not trying the fallback alignment, or the fallback
        # fails, we return None. This tells the wordpiecer to align by
        # calling the sub-tokenizer on the spaCy tokens.
        return None
    output = _align(spacy_tokens, wp_tokens)
    if len(set(flatten_list(output))) != len(wp_tokens):
        return None
    return output


def _align(seq1, seq2):
    # Map character positions to tokens
    map1 = _get_char_map(seq1)
    map2 = _get_char_map(seq2)
    # For each token in seq1, get the set of tokens in seq2
    # that share at least one character with that token.
    alignment = [set() for _ in seq1]
    unaligned = set(range(len(seq2)))
    for char_position in range(map1.shape[0]):
        i = map1[char_position]
        j = map2[char_position]
        alignment[i].add(j)
        if j in unaligned:
            unaligned.remove(j)
    # Sort, make list
    output = [sorted(list(s)) for s in alignment]
    # Expand alignment to adjacent unaligned tokens of seq2
    for indices in output:
        if indices:
            while indices[0] >= 1 and indices[0] - 1 in unaligned:
                indices.insert(0, indices[0] - 1)
            last = len(seq2) - 1
            while indices[-1] < last and indices[-1] + 1 in unaligned:
                indices.append(indices[-1] + 1)
    return output

def _get_char_map(seq):
    char_map = np.zeros((sum(len(token) for token in seq),), dtype="i")
    offset = 0
    for i, token in enumerate(seq):
        for j in range(len(token)):
            char_map[offset + j] = i
        offset += len(token)
    return char_map


In [117]:
    def align_tokens(doc, wp_tokens, *, offset=0):
        spacy_tokens = [w.text.rstrip().lower() for w in doc]
        new_wp_tokens = [clean_wp_token(t) for t in wp_tokens]
        assert len(wp_tokens) == len(new_wp_tokens)
        align = align_word_pieces(spacy_tokens, new_wp_tokens, retry=False)
        for indices in align:
            for i in range(len(indices)):
                indices[i] += offset
        return wp_tokens, align

In [128]:
_, alignments = align_tokens(doc, wp_tokens)

In [120]:
for alignment, i in len(alignments:
    print(alignment)

ValueError: too many values to unpack (expected 2)

In [126]:
doc[1].ent_type

0

In [129]:
alignments

[[0],
 [1],
 [2],
 [3],
 [4],
 [5],
 [6],
 [7],
 [8],
 [9],
 [10],
 [11, 12, 13],
 [14],
 [15],
 [16],
 [17],
 [18],
 [19],
 [20],
 [21],
 [22],
 [23],
 [24],
 [25],
 [26],
 [27],
 [28],
 [29, 30, 31],
 [32],
 [33],
 [34],
 [35],
 [36],
 [36],
 [36],
 [37],
 [38],
 [39],
 [40],
 [41],
 [41],
 [41]]

In [68]:
d = BertDictionary.load_from_file('../datasets/cnndm/dict.src.txt')

In [74]:
idx = [d.index(token) for token in wp_tokens]
d.string(idx, ' ##')

'the violent clashes that broke out between fans of football rivals brondby and fc copenhagen last month have now led to criminal charges against 44 men , ritzau reported on thursday .'

In [111]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("the violent clashes that broke out between fans of football rivals brondby and fc copenhagen last month have now led to criminal charges against 44 men , ritzau reported on thursday . google was not amused at all and wanted to sue the united states of america. [SEP]")

for token in doc:
    print(token.text, token.ent_iob, token.ent_type_)

the 2 
violent 2 
clashes 2 
that 2 
broke 2 
out 2 
between 2 
fans 2 
of 2 
football 2 
rivals 2 
brondby 2 
and 2 
fc 2 
copenhagen 2 
last 3 DATE
month 1 DATE
have 2 
now 2 
led 2 
to 2 
criminal 2 
charges 2 
against 2 
44 3 CARDINAL
men 2 
, 2 
ritzau 2 
reported 2 
on 2 
thursday 3 DATE
. 2 
google 2 
was 2 
not 2 
amused 2 
at 2 
all 2 
and 2 
wanted 2 
to 2 
sue 2 
the 2 
united 2 
states 2 
of 2 
america 2 
. 2 
[ 2 
SEP 3 ORG
] 2 


In [60]:
ENTITY_TYPES = {
    'PERSON': 1,
}


In [61]:
ENTITY_TYPES['PERSON']

1

In [88]:
probs = {w.prob: w.orth for w in nlp.vocab}
usually_titled = [w for w in nlp.vocab if w.is_title and probs.get(w.lower, -10000) < probs.get(w.orth, -10000)]

In [85]:
usually_titled

[]

In [89]:
for w in nlp.vocab:
    print(w.orth)

17780520906925867008
2112642640949226496
9616619598791593984
8872574631799229440
881660621787532292
16946338003326290949
9753589711503188996
16863710723590747144
520659760401972234
4956802221589171210
16935887339986883598
9928036205409584144
8532415787641010193
13253649979635857425
16455024788568498194
957646191493309460
12493136043797776404
68268922348150806
13215729906341848089
10065896487860850714
12579384389446384672
10239237003504588839
9135761513799055405
93523526525271091
14224400212521872438
1635925671746308151
6580322711908478009
16030846251984615483
17253057192755562556
15944149891796232253
16011641530158155841
3411606890003347522
3439263888874234947
4499179194882955333
11015993923340501061
14348061120257369157
9747367433533540424
14803285143452237901
9720490137265826897
7859011591137717335
11030505694439315546
16318918034475841628
8448345182024841309
8818141632269523039
14182082159500669027
7040491189314629732
1147041772600080486
11667289587015813222
2934195947809526888
1677

In [1]:
import torch

In [2]:
a = torch.randn(5, 10, dtype=torch.double)

In [3]:
a

tensor([[ 2.1162, -0.6741,  1.1532,  0.2633, -1.0991,  1.3231,  0.8337,  0.4193,
         -0.7102, -1.6132],
        [ 0.6882, -0.8861,  0.7463,  0.5322, -0.9053,  2.6173, -2.0726, -0.0166,
         -1.6143,  1.1444],
        [ 0.2064, -0.4998, -0.5279,  0.6413,  0.9586, -0.2006, -0.3313, -0.8618,
         -0.0975, -0.2609],
        [ 0.0966,  1.1799,  0.1677, -0.4408,  1.0367,  0.5645, -0.3921, -0.2090,
          0.3517,  0.3711],
        [-0.5567, -1.2093,  0.5622,  0.8603,  3.3869, -1.5603,  0.1894,  0.4290,
          0.3514,  0.6443]], dtype=torch.float64)

In [4]:
a[:, -1:]

tensor([[-1.6132],
        [ 1.1444],
        [-0.2609],
        [ 0.3711],
        [ 0.6443]], dtype=torch.float64)

In [8]:
a[:, -1:]

tensor([[-1.6132],
        [ 1.1444],
        [-0.2609],
        [ 0.3711],
        [ 0.6443]], dtype=torch.float64)

In [19]:
for i in range(len(a)):
    print(a[i])

tensor([ 2.1162, -0.6741,  1.1532,  0.2633, -1.0991,  1.3231,  0.8337,  0.4193,
        -0.7102, -1.6132], dtype=torch.float64)
tensor([ 0.6882, -0.8861,  0.7463,  0.5322, -0.9053,  2.6173, -2.0726, -0.0166,
        -1.6143,  1.1444], dtype=torch.float64)
tensor([ 0.2064, -0.4998, -0.5279,  0.6413,  0.9586, -0.2006, -0.3313, -0.8618,
        -0.0975, -0.2609], dtype=torch.float64)
tensor([ 0.0966,  1.1799,  0.1677, -0.4408,  1.0367,  0.5645, -0.3921, -0.2090,
         0.3517,  0.3711], dtype=torch.float64)
tensor([-0.5567, -1.2093,  0.5622,  0.8603,  3.3869, -1.5603,  0.1894,  0.4290,
         0.3514,  0.6443], dtype=torch.float64)


In [22]:
a[:,1] = 0

In [23]:
a

tensor([[ 2.1162,  0.0000,  1.1532,  0.2633, -1.0991,  1.3231,  0.8337,  0.4193,
         -0.7102, -1.6132],
        [ 0.6882,  0.0000,  0.7463,  0.5322, -0.9053,  2.6173, -2.0726, -0.0166,
         -1.6143,  1.1444],
        [ 0.2064,  0.0000, -0.5279,  0.6413,  0.9586, -0.2006, -0.3313, -0.8618,
         -0.0975, -0.2609],
        [ 0.0966,  0.0000,  0.1677, -0.4408,  1.0367,  0.5645, -0.3921, -0.2090,
          0.3517,  0.3711],
        [-0.5567,  0.0000,  0.5622,  0.8603,  3.3869, -1.5603,  0.1894,  0.4290,
          0.3514,  0.6443]], dtype=torch.float64)

In [25]:
a.shape[1]

10