In [1]:
import torch
import torch.nn as nn
import os
import re
from tqdm import tqdm

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:

with open('text.txt','r',encoding='utf-8') as f:
  text = f.read()

In [3]:
# 输入的文本
text = """People who truly loved once are far more likely to love again.
Difficult circumstances serve as a textbook of life for people.
The best preparation for tomorrow is doing your best today.
The reason why a great man is great is that he resolves to be a great man.
The shortest way to do many things is to only one thing at a time.
Only they who fulfill their duties in everyday matters will fulfill them on great occasions.
I go all out to deal with the ordinary life.
I can stand up once again on my own.
Never underestimate your power to change yourself."""

def processText(text):
    text = text.lower()
    remove_chars = '[·’!"\#$%&\'()＃！（）*+,-./:;<=>?\@，：?￥★、…．＞【】［］《》？“”‘’\[\\]^_`{|}~]+' # 去除标点符号
    text= re.sub(remove_chars, "", text)
    text = text.split()
    return text


text = processText(text)


In [4]:

# 输入是1*V的矩阵，V是词汇表的大小
EMDEDDING_DIM = 100  # 词向量维度, 词向量的维度一般是50, 100, 200, 300维, W1矩阵的列数

word = set(text)
word_size = len(word) # 词汇表的大小

# 有了这两个映射，我们才能通过输出层预测结果
word_to_ix = {word: ix for ix, word in enumerate(word)}
ix_to_word = {ix: word for ix, word in enumerate(word)}


![img](https://img-blog.csdnimg.cn/img_convert/466542a287c87cb2813cdad5cfc46fc9.png)

In [5]:
# 定义一个函数，把文本转换成索引的形式
def make_context_vector(context, word_to_ix): # context是上下文单词列表，word_to_ix是单词到索引的映射
    idxs = [word_to_ix[w] for w in context] # 将上下文单词转换成索引的形式
    res = torch.tensor(idxs, dtype=torch.long)
    return  res # 返回的是tensor


In [6]:
# 定义训练数据
data = [] # 定义一个列表，用来存储训练数据
for i in range(2, len(text) - 2):
    context = [text[i - 2], text[i - 1], # 根据上下文预测目标词汇，窗口大小为4
               text[i + 1], text[i + 2]]
    target = text[i]
    data.append((context, target))


In [7]:
# 定义模型
class CBOW(torch.nn.Module):
    def __init__(self, word_size, embedding_dim):         # wordsize是词汇表的大小，embedding_dim是词向量的维度
        super(CBOW, self).__init__()

        self.embeddings = nn.Embedding(word_size, embedding_dim) # 定义词向量矩阵
        self.linear1 = nn.Linear(embedding_dim, 128) # 定义第一个线性层
        self.activation_function1 = nn.ReLU() # 定义激活函数，这里用的是ReLU
        self.linear2 = nn.Linear(128, word_size) #  定义第二个线性层
        self.activation_function2 = nn.LogSoftmax(dim=-1) # 定义激活函数，这里用的是LogSoftmax，LogSoftmax可以理解为归一化的概率, dim=-1表示最后一维, 这里是列

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1, -1) # 将上下文词向量求和，然后转换成1*V的形式
        out = self.linear1(embeds) # 1*V 乘以 V*128 得到 1*128
        out = self.activation_function1(out) # 1*128 经过激活函数,
        out = self.linear2(out) # 1*128 乘以 128*V 得到 1*V
        out = self.activation_function2(out) # 1*V 经过激活函数, 得到归一化的概率
        return out

    def get_word_emdedding(self, word):
        word = torch.tensor([word_to_ix[word]]) # 将单词转换成索引的形式, 例如"people"转换成tensor([0])
        return self.embeddings(word).view(1, -1) # 返回词向量


In [8]:
# 初始化模型
model = CBOW(word_size, EMDEDDING_DIM) # 词汇表的大小是word_size, 词向量的维度是EMDEDDING_DIM

loss_function = nn.NLLLoss() # 定义损失函数，这里用的是NLLLoss，负对数似然损失函数,NLLLoss的输入是一个对数概率向量和一个目标标签, 它不会为我们计算对数概率, 需要我们自己计算好对数概率
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [9]:

# 开始训练
epochs = tqdm(range(100))
for epoch in epochs:
    total_loss = 0

    for context, target in data:
        context_vector = make_context_vector(context, word_to_ix)

        log_probs = model(context_vector)

        total_loss += loss_function(log_probs, torch.tensor([word_to_ix[target]]))
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
    epochs.set_postfix(epoch=epoch,total_loss=total_loss.item())


100%|██████████| 100/100 [00:02<00:00, 35.73it/s, epoch=99, total_loss=9.61]


In [10]:

# 预测
context1 = ['preparation', 'for', 'is', 'doing']
context_vector1 = make_context_vector(context1, word_to_ix)
a = model(context_vector1)
print(word_to_ix['reason'])
context2 = ['the', 'reason', 'a', 'great']
context_vector2 = make_context_vector(context2, word_to_ix)
b = model(context_vector2)

print(f'文本数据: {" ".join(text)}\n')
print(f'预测1: {context1}\n')
print(f'预测结果: {ix_to_word[torch.argmax(a[0]).item()]}') # torch.argmax(a[0])返回的是a[0]中最大值的索引, torch.argmax(a[0]).item()返回的是a[0]中最大值的索引的值
print('\n')
print(f'预测2: {context2}\n')
print(f'预测结果: {ix_to_word[torch.argmax(b[0]).item()]}')

40
文本数据: people who truly loved once are far more likely to love again difficult circumstances serve as a textbook of life for people the best preparation for tomorrow is doing your best today the reason why a great man is great is that he resolves to be a great man the shortest way to do many things is to only one thing at a time only they who fulfill their duties in everyday matters will fulfill them on great occasions i go all out to deal with the ordinary life i can stand up once again on my own never underestimate your power to change yourself

预测1: ['preparation', 'for', 'is', 'doing']

预测结果: tomorrow


预测2: ['the', 'reason', 'a', 'great']

预测结果: why
