In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("./drive/My Drive/workspaces")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
# 使用GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SOS_token = 0
EOS_token = 1

In [3]:
# 主要用于储存单词与id的映射
class Vocabulary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {0: "<SOS>", 1: "<EOS>", -1: "<unk>"}
        self.idx = 2 # Count SOS and EOS

    # 记录word和id之间的映射
    def add_word(self, word):
        if not word in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
            
    # 将句子进行分词，添加每个单词与id的映射
    def add_sentence(self, sentence):
        for word in sentence.split():
            self.add_word(word)
    
    # 得到某个单词的id
    def __call__(self, word):
        if not word in self.word2idx:
            return -1
        return self.word2idx[word]
    
    # vaocabulary的容量
    def __len__(self):
        return self.idx

In [4]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        # to do
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.maxlen = 10
        #output_size就是输出语言的所有单次的数量，hidden_size就是GRU网络隐藏层的节点数
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        #Linear全连接层，将gru隐藏层与输出层做关联，输入的目标语言单次个数为output_size
        self.out = nn.Linear(hidden_size, output_size)
        #通过Logsoftmax函数，就把最大值的索引作为生成目标单次的ID
        self.softmax = nn.LogSoftmax(dim=1)
    #输入：当前的seq_input, 以及上一时刻的hidden
    #输出：对应语言的output id，当前时刻的hidden
    def forward(self, seq_input, hidden):
        # to do
        #输入一个单词id， Embedding将它转化为词向量（hidden_size）
        output = self.embedding(seq_input).view(1,1,-1)
        output= F.relu(output)
        output,hidden = self.gru(output, hidden)
        #print(output[0])
        #通过全连接层 + softmax层， 找到对应的输出词的ID
        output = self.softmax(self.out(output[0]))
        return output, hidden

    # pre_hidden即公式中所谓的固定C向量
    def sample(self, pre_hidden):
        # to do
        #定义input
        inputs = torch.tensor([SOS_token],device=device)
        hidden = pre_hidden
        #输出结果记录res
        res = [SOS_token]
        #循环编码
        for i in range(self.maxlen):
          #self = self.forword
          #输入： 当前的输入和上一时刻隐藏层状态
          #输出： 对应语言的output，当前时刻hidden
          output,hidden = self(inputs, hidden)
          #获取最大值的索引作为生成单词的id
          topv, topi = output.topk(1)
          #判断是否为结束符
          if topi.item() == EOS_token:
            res.append(EOS_token)
            break
          else:
            res.append(topi.item())
          #将生成的topi 作为下一时刻的输入
          inputs = topi.squeeze().detach()
        return res

        


In [5]:
class EncoderRNN(nn.Module):
    # 在构造函数内定义了一个Embedding层和一GRU层，
    def __init__(self, input_size, hidden_size):
        # to do
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        #Enbedding层定义传入了一个input_size和一个hidden_size
        #input_size输入语言的所有单词个数
        #hidden_size就是LSMT/GRU网络隐藏层的节点数
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    # 前向传播
    def forward(self, input, hidden):
        # to do
        #输入一个单词id，Embedding将它转为词向量（hidden_size）
        #pytorch中的GRU输入（seq_len,batch,input_size）
        embedded = self.embedding(input).view(1,1,self.hidden_size)
        #将enbedding向量作为GRU的输入，最终得到一个输出和隐藏层的状态
        output, hidden = self.gru(embedded, hidden)
        #在这个简单的seq2seq中，encoder编码器只需要输入隐藏层即可
        return hidden
    
    # 最终执行函数
    def sample(self,seq_list):
        # to do
        word_inds = torch.LongTensor(seq_list).to(device)
        #得到初始化h0
        h =self.initHidden()
        for word_tensor in word_inds:
          #前向传播需要两个参数，一个是输入，一个是前一时刻的hidden
          h = self.forward(word_tensor, h)
        return h

    # 初始化第一层的h0，随机生成一个
    def initHidden(self):
        # to do
        return torch.zeros(1,1,self.hidden_size, device=device)



In [34]:
# 处理句子，将句子转换成Tensor
def sentence2tensor(lang, sentence):
    indexes = [lang(word) for word in sentence.split()]
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

# 将(input, target)的pair都转换成Tensor
def pair2tensor(pair):
    input_tensor = sentence2tensor(lan1, pair[0])
    target_tensor = sentence2tensor(lan2, pair[1])
    return (input_tensor, target_tensor)
# 定义句子和Vocabulary类
lan1 = Vocabulary() #英文
lan2 = Vocabulary() #中文
'''
data = [['Hi .', '嗨 。'],
        ['Hi .', '你 好 。'],
        ['Run .', '跑'],
        ['Wait !', '等等 ！'],
        ['Hello !', '你好 。'],
        ['I try .', '让 我 来 。'],
        ['I won !', '我 赢 了 。'],
        ['I am OK .', '我 沒事 。']]
'''
data= []
import jieba
import re

with open('cmn.txt', 'r') as file_to_read:
  while True:
    lines = file_to_read.readline() # 整行读取数据
    lan = []
    if not lines:
      break
      pass
    lines = lines.replace('.',' .').replace('?',' ?').replace('!', ' !')
    #lan = re.split('[.?!。？！]', lines)
    split_list = re.split('[\t]', lines)
    lan.append(str(split_list[0]))
    lan.append(' '.join(jieba.lcut(split_list[1])))
    data.append(lan)

print(data[:200])

#data = data[17000:23000]

for i,j in data:
    lan1.add_sentence(i)
    lan2.add_sentence(j)
print(len(lan1))
print(len(lan2))

[['Hi .', '嗨 。'], ['Hi .', '你好 。'], ['Run .', '你 用 跑 的 。'], ['Wait !', '等等 ！'], ['Wait !', '等 一下 ！'], ['Hello !', '你好 。'], ['I try .', '让 我 来 。'], ['I won !', '我 赢 了 。'], ['Oh no !', '不会 吧 。'], ['Cheers !', '乾杯   !'], ['Got it ?', '你 懂 了 吗 ？'], ['He ran .', '他 跑 了 。'], ['Hop in .', '跳进来 。'], ['I quit .', '我 退出 。'], ["I'm OK .", '我 沒事 。'], ['Listen .', '听 着 。'], ['No way !', '不 可能 ！'], ['No way !', '没门 ！'], ['Really ?', '你 确定 ？'], ['Try it .', '试试 吧 。'], ['We try .', '我们 来 试试 。'], ['Why me ?', '为什么 是 我 ？'], ['Ask Tom .', '去 问 汤姆 。'], ['Awesome !', '好棒 ！'], ['Be calm .', '冷静 点 。'], ['Be fair .', '公平 点 。'], ['Be kind .', '友善 点 。'], ['Be nice .', '和 气点 。'], ['Be nice .', '友善 点 。'], ['Call me .', '联系 我 。'], ['Call us .', '联系 我们 。'], ['Come in .', '进来 。'], ['Get Tom .', '找到 汤姆 。'], ['Get out !', '滾 出去 ！'], ['Get out !', '出去 ！'], ['Go away !', '走開 ！'], ['Go away !', '滾 ！'], ['Go away .', '走開 ！'], ['Go home .', '回家吧 。'], ['Goodbye !', '再见 ！'], ['Goodbye !', '告辞 ！'], ['Hang on !', '坚持 。'], ['Ha

In [46]:
# 定义Encoder和Decoder以及训练的一些参数
import random
learning_rate = 0.001
hidden_size = 512

# 将Encoder, Decoder放到GPU
encoder = EncoderRNN(len(lan1), hidden_size).to(device)
decoder = DecoderRNN(hidden_size, len(lan2)).to(device)
# 网络参数 = Encoder参数 + Decoder参数
params = list(encoder.parameters()) + list(decoder.parameters())
# 定义优化器
optimizer = optim.Adam(params, lr=learning_rate)
loss = 0
# NLLLoss = Negative Log Likelihood Loss
criterion = nn.NLLLoss()
# 一共训练多次轮
turns = 10000
print_every = 200
print_loss_total = 0
# 将数据random choice，然后转换成 Tensor
#training_pairs = [pair2tensor(random.choice(data)) for pair in range(turns)]
training_pairs = [pair2tensor(random.choice(data)) for pair in range(int(len(data)*0.9))]

#print(training_pairs)

# 训练过程
for turn in range(turns):
    optimizer.zero_grad()
    loss = 0
    
    x, y = training_pairs[turn]
    input_length = x.size(0)
    target_length = y.size(0)
    # 初始化Encoder中的h0
    h = encoder.initHidden()
    # 对input进行Encoder
    for i in range(input_length):
        h = encoder(x[i],h)
    # Decoder的一个input <sos>
    decoder_input = torch.LongTensor([SOS_token]).to(device)
    
    for i in range(target_length):
        decoder_output, h = decoder(decoder_input, h)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        #计算loss 预测的decoder_output， 真实值y[i]
        loss += criterion(decoder_output, y[i])
        if decoder_input.item() == EOS_token:break
                
    print_loss_total += loss.item()/target_length
    if (turn+1) % print_every == 0 :
        print("loss:{loss:,.4f}".format(loss=print_loss_total/print_every))
        print_loss_total = 0
        
    loss.backward()
    optimizer.step()



loss:5.3036
loss:5.2826
loss:5.2434
loss:5.0593
loss:5.0534
loss:5.0207
loss:4.9904
loss:4.9080
loss:5.0489
loss:4.7266
loss:4.8838
loss:4.8984
loss:4.8424
loss:4.7008
loss:4.9527
loss:4.8370
loss:4.9296
loss:4.6780
loss:4.7004
loss:4.8277
loss:4.7595
loss:4.7596
loss:4.7172
loss:4.5225
loss:4.8812
loss:4.9112
loss:4.7676
loss:4.6990
loss:4.7316
loss:4.2663
loss:4.5004
loss:4.7606
loss:4.5055
loss:4.5443
loss:4.4663
loss:4.4303
loss:4.4698
loss:4.6098
loss:4.3028
loss:4.5600
loss:4.3028
loss:4.5433
loss:4.4860
loss:4.3472
loss:4.4569
loss:4.3465
loss:4.1756
loss:4.7599
loss:4.4549
loss:4.3583


In [59]:
# 测试函数
def translate(s):
    t = [lan1(i) for i in s.split()]
    t.append(EOS_token)
    print(t)
    f = encoder.sample(t)   # 编码 
    s = decoder.sample(f)   # 解码
    r = [lan2.idx2word[i] for i in s]    # 根据id得到单词
    return ' '.join(r) # 生成句子
print(translate('Where is my dad ?'))
print(translate('That\'s very handy'))
print(translate('This desk is mine .'))
print(translate('I can .'))


[288, 229, 354, 1034, 16, 1]
<SOS> 我 哪裡 哪裡 哪裡 的 的 ？ <EOS>
[372, 537, 1384, 1]
<SOS> 这 不是 。 。 <EOS>
[378, 1397, 229, 374, 3, 1]
<SOS> 这 是 是 的 。 <EOS>
[8, 239, 3, 1]
<SOS> 我 可以 。 。 <EOS>


从结果上看我觉得seq2seq效果并不理想，是不是可以理解为如果加上attention等机制之后，效果会有所提升？