In [0]:
# !pip install transformers > /dev/null

import torch
from torch import nn
import numpy as np
import collections
# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(1)

from transformers import BertTokenizer

# Record the PyTorch version this notebook was run with.
print(torch.__version__)
# Use 'torch.FloatTensor' as the default tensor type.
# NOTE(review): newer PyTorch releases reportedly deprecate this call in favour of
# torch.set_default_dtype — confirm against the installed version.
torch.set_default_tensor_type('torch.FloatTensor')

1.4.0


# 文本的表示

人可以理解象形文字，而计算机只能理解二进制。把文本表示成计算机能够处理的形式，需要以下步骤：

1. 分词
2. 映射

映射有各种方式，例如
1. 词袋模型
2. tf-idf
3. word2vec
4. bert

In [0]:
class Vocab(object):
    """Bidirectional mapping between tokens and integer indices.

    Args:
        tokens: Corpus as an iterable of token sequences (e.g. a list of
            sentences, each a list of tokens). It is handed to
            ``count_corpus``, which iterates over every element, so a flat
            list of strings is counted character by character.
        min_freq: Tokens occurring fewer than ``min_freq`` times are left out
            of the vocabulary and map to ``unk`` on lookup.
        use_special_tokens: When true, indices 0-3 are reserved for
            ``'<pad>'``, ``'<s>'``, ``'</s>'`` and ``'<unk>'`` and exposed as
            ``self.pad``, ``self.bos``, ``self.eos`` and ``self.unk``.
            Otherwise only ``'<unk>'`` is reserved, at index 0.

    Attributes:
        token_freq: List of ``(token, count)`` pairs in first-seen order.
        idx_to_token: List mapping an index to its token.
        token_to_idx: Dict mapping a token to its index.
    """

    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = count_corpus(tokens)
        self.token_freq = list(counter.items())
        if use_special_tokens:
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            # The padding slot needs its own string: reusing '</s>' here made
            # index 0 indistinguishable from eos when converting back to tokens.
            reserved = ['<pad>', '<s>', '</s>', '<unk>']
        else:
            self.unk = 0
            reserved = ['<unk>']
        # Counter keys are already unique, so only a clash with a reserved
        # token (e.g. a literal '<unk>' in the corpus) has to be filtered out.
        self.idx_to_token = reserved + [
            token for token, freq in self.token_freq
            if freq >= min_freq and token not in reserved
        ]
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)
        }

    def __len__(self):
        """Return the vocabulary size, reserved tokens included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a (possibly nested) list/tuple of indices, to tokens.

        Raises:
            IndexError: If an index is outside the vocabulary.
        """
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        # Recurse so nested inputs are handled the same way as in __getitem__.
        return [self.to_tokens(index) for index in indices]


def count_corpus(sentences):
    """Count how often each token occurs across all sentences.

    Args:
        sentences: Iterable of iterables of tokens. Every element is iterated,
            so a plain string element contributes its individual characters.

    Returns:
        A ``collections.Counter`` keyed by token, in first-seen order.
    """
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens)

In [0]:
# NOTE(review): `tokens` is only assigned in a later cell (`tokens = lines[0].split(' ')`),
# so on a fresh kernel this cell raises NameError unless that cell has been run first.
# `tokens` is a flat list of words; count_corpus iterates over every element, so each
# word is broken into characters and the vocabulary built here is character-level.
voc = Vocab(tokens)

In [0]:
voc.to_tokens(10)

'('

In [0]:
txt = """
The Time Traveller (for so it will be convenient to speak of him) was expounding a recondite matter to us. His pale grey eyes shone and twinkled, and his usually pale face was flushed and animated. The fire burnt brightly, and the soft radiance of the incandescent lights in the lilies of silver caught the bubbles that flashed and passed in our glasses. Our chairs, being his patents, embraced and caressed us rather than submitted to be sat upon, and there was that luxurious after-dinner atmosphere, when thought runs gracefully free of the trammels of precision. And he put it to us in this way—marking the points with a lean forefinger—as we sat and lazily admired his earnestness over this new paradox (as we thought it) and his fecundity.
“You must follow me carefully. I shall have to controvert one or two ideas that are almost universally accepted. The geometry, for instance, they taught you at school is founded on a misconception.”
“Is not that rather a large thing to expect us to begin upon?” said Filby, an argumentative person with red hair.
“I do not mean to ask you to accept anything without reasonable ground for it. You will soon admit as much as I need from you. You know of course that a mathematical line, a line of thickness nil, has no real existence. They taught you that? Neither has a mathematical plane. These things are mere abstractions.”
"""


In [0]:
# `txt` starts and ends with a newline (right after / before the triple quotes),
# so the first and last pieces of the split are empty strings; [1:-1] drops them.
lines = txt.split("\n")[1:-1]

In [0]:
len(lines)

4

In [0]:
lines[0]

'The Time Traveller (for so it will be convenient to speak of him) was expounding a recondite matter to us. His pale grey eyes shone and twinkled, and his usually pale face was flushed and animated. The fire burnt brightly, and the soft radiance of the incandescent lights in the lilies of silver caught the bubbles that flashed and passed in our glasses. Our chairs, being his patents, embraced and caressed us rather than submitted to be sat upon, and there was that luxurious after-dinner atmosphere, when thought runs gracefully free of the trammels of precision. And he put it to us in this way—marking the points with a lean forefinger—as we sat and lazily admired his earnestness over this new paradox (as we thought it) and his fecundity.'

In [0]:
# Naive tokenization of the first paragraph: split on single spaces only, so
# punctuation stays attached to the neighbouring word (e.g. '(for').
tokens = lines[0].split(' ')

In [0]:
print(tokens[:10])

['The', 'Time', 'Traveller', '(for', 'so', 'it', 'will', 'be', 'convenient', 'to']


In [0]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first
# cell next to the other imports so dependencies are visible up front.
import spacy
# Load the 'en_core_web_sm' pipeline.
# NOTE(review): presumably the model package must be installed beforehand — confirm setup steps.
nlp = spacy.load('en_core_web_sm')

In [0]:
# Example sentence containing an abbreviation ("Mr.") and a contraction ("doesn't").
s = "Mr. Chen doesn't agree with my suggestion."

# Run the spaCy pipeline on the sentence and print the text of each token.
doc = nlp(s)
print([t.text for t in doc])

['Mr.', 'Chen', 'does', "n't", 'agree', 'with', 'my', 'suggestion', '.']


In [0]:
# Load the tokenizer for the pretrained 'bert-base-uncased' checkpoint
# (the progress bar in the recorded output indicates a download on first use).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
# Tokenize the same sentence `s` with the BERT tokenizer, for comparison with
# the spaCy tokens printed earlier.
print(tokenizer.tokenize(s))

['mr', '.', 'chen', 'doesn', "'", 't', 'agree', 'with', 'my', 'suggestion', '.']


# 语言模型

词与词之间是相互关联的：根据上下文（context）可以预测被遮盖（mask）的词，根据当前的句子可以预测下一个句子。

## n元语法

n元语法假设当前词的出现只和它前面的 n-1 个词相关，即 n-1 阶马尔可夫链。

- 1元语法（unigram）：每个词相互独立
- 2元语法（bigram）：当前词只依赖前1个词
- 3元语法（trigram）：当前词只依赖前2个词

一个长度为4的序列，上述3种表述为

$\begin{aligned}
P(w_1, w_2, w_3, w_4) &=  P(w_1) P(w_2) P(w_3) P(w_4) ,\\
P(w_1, w_2, w_3, w_4) &=  P(w_1) P(w_2 \mid w_1) P(w_3 \mid w_2) P(w_4 \mid w_3) ,\\
P(w_1, w_2, w_3, w_4) &=  P(w_1) P(w_2 \mid w_1) P(w_3 \mid w_1, w_2) P(w_4 \mid w_2, w_3) .
\end{aligned}$

n 太小，模型不够准确；n 太大，计算和存储的复杂度又会很高。