# 任务五：基于神经网络的语言模型

## 处理数据集

In [1]:
import torch
import os
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import collections
import os
import random
import time
import torch.nn.functional as F
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from tqdm import tqdm
import torch.nn as nn

from nltk import word_tokenize
import numpy as np
import sys
sys.path.append('..')
device=torch.device('cuda'if torch.cuda.is_available() else 'cpu')
data_path=r'E:\NLP_jupyternotebook\Fudan_NLP_beginner\datasets\poetryFromTang.txt'

#### 加载数据

In [2]:
class poetry():
    def read_file(self,data_path):
        tokens=[]
        samples=[]
        with open(data_path,"r",encoding='utf8') as f:
            for line in f:
                line=line.strip('\n')
                #print(line)
                if line=='':
                    if len(tokens)!=0:
                        tokens=['<START>']+tokens+['<EOP>']
                        samples.append(tokens)
                        tokens=[]
                else:
                    content=list(line)
                    tokens=tokens+content
        samples.append(tokens)
        return samples
tangpoetry=poetry()
samples=tangpoetry.read_file(data_path)
samples=sorted(samples,key=lambda x:-len(x))

In [3]:
lengths=[len(x) for x in samples]
print('总共有:%d首诗歌，总字数为%d，每首诗歌的长度分别为(包含标点符号):\n'%(len(samples),sum(lengths)),lengths)

总共有:164首诗歌，总字数为16103，每首诗歌的长度分别为(包含标点符号):
 [1109, 894, 602, 409, 338, 323, 322, 314, 290, 290, 284, 260, 247, 226, 218, 206, 194, 171, 168, 164, 162, 162, 158, 154, 150, 146, 146, 137, 135, 135, 134, 131, 130, 130, 122, 120, 107, 106, 103, 98, 96, 95, 90, 85, 83, 82, 82, 82, 76, 75, 74, 72, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 61, 60, 58, 58, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 35, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 26, 26, 18]


观察发现，诗歌的长度差异太大了，如果用动态填充的话，就必须按照诗歌长度进行排序，这样才能保证不至于一个短的诗歌和一个长的诗歌组装成一个Batch然后短的填充几倍于自身长度的<pad>标签

In [4]:
def get_vocab(data):
    counter=collections.Counter([tk for st in data for tk in st])
    return Vocab.Vocab(counter)
vocab=get_vocab(samples)
print("总字典长度",len(vocab))

总字典长度 2518


In [5]:
print(vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001B5EFCC5648>>, {'<unk>': 0, '<pad>': 1, '，': 2, '。': 3, '<EOP>': 4, '<START>': 5, '不': 6, '人': 7, '有': 8, '何': 9, '天': 10, '无': 11, '来': 12, '云': 13, '风': 14, '山': 15, '日': 16, '生': 17, '一': 18, '君': 19, '知': 20, '如': 21, '时': 22, '为': 23, '子': 24, '长': 25, '青': 26, '中': 27, '白': 28, '未': 29, '水': 30, '自': 31, '高': 32, '上': 33, '春': 34, '见': 35, '花': 36, '江': 37, '下': 38, '清': 39, '马': 40, '万': 41, '之': 42, '今': 43, '可': 44, '我': 45, '相': 46, '此': 47, '千': 48, '门': 49, '与': 50, '已': 51, '开': 52, '月': 53, '王': 54, '空': 55, '金': 56, '东': 57, '亦': 58, '心': 59, '里': 60, '复': 61, '是': 62, '歌': 63, '深': 64, '年': 65, '林': 66, '流': 67, '海': 68, '秋': 69, '行': 70, '酒': 71, '雨': 72, '主': 73, '南': 74, '去': 75, '同': 76, '明': 77, '在': 78, '寒': 79, '归': 80, '闻': 81, '飞': 82, '龙': 83, '事': 84, '出': 85, '前': 86, '夜': 87, '所': 88, '谁': 89, '道': 90, '以': 91, '公': 92, '大': 93, '思': 94, '落': 95, '黄': 96, '多': 97, 

#### 创建数据迭代器

In [6]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader  
from torch.nn.utils.rnn import pad_sequence
class MyData(Dataset):
    def __init__(self,data,vocab):
        self.data=data
        self.vocab=vocab
        self.x=[[vocab.stoi[w] if w in vocab.stoi else vocab.stoi['<unk>'] for w in item[:-1]]for item in data]
        self.y=[[vocab.stoi[w] if w in vocab.stoi else vocab.stoi['<unk>'] for w in item[1:]]for item in data]
    def __getitem__(self,item):
        return torch.Tensor(self.x[item]),torch.Tensor(self.y[item])
    def __len__(self):
        return len(self.data)
def idx_to_char(data,vocab):
    data=data.numpy().tolist()
    return[vocab.itos[int(idx)]for idx in data]

In [7]:
tangshi_set=MyData(samples,vocab)
print(tangshi_set[0][0],'\n',tangshi_set[0][1])

tensor([ 5., 57., 15.,  ..., 33., 70.,  3.]) 
 tensor([ 57.,  15., 151.,  ...,  70.,   3.,   4.])


In [8]:
print(idx_to_char(tangshi_set[0][0],vocab),'\n',idx_to_char(tangshi_set[0][1],vocab))

['<START>', '东', '山', '气', '鸿', '濛', '，', '宫', '殿', '居', '上', '头', '。', '君', '来', '必', '十', '月', '，', '树', '羽', '临', '九', '州', '。', '阴', '火', '煮', '玉', '泉', '，', '喷', '薄', '涨', '岩', '幽', '。', '有', '时', '浴', '赤', '日', '，', '光', '抱', '空', '中', '楼', '。', '阆', '风', '入', '辙', '迹', '，', '旷', '原', '延', '冥', '搜', '。', '沸', '天', '万', '乘', '动', '，', '观', '水', '百', '丈', '湫', '。', '幽', '灵', '斯', '可', '佳', '，', '王', '命', '官', '属', '休', '。', '初', '闻', '龙', '用', '壮', '，', '擘', '石', '摧', '林', '丘', '。', '中', '夜', '窟', '宅', '改', '，', '移', '因', '风', '雨', '秋', '。', '倒', '悬', '瑶', '池', '影', '，', '屈', '注', '苍', '江', '流', '。', '味', '如', '甘', '露', '浆', '，', '挥', '弄', '滑', '且', '柔', '。', '翠', '旗', '澹', '偃', '蹇', '，', '云', '车', '纷', '少', '留', '。', '箫', '鼓', '荡', '四', '溟', '，', '异', '香', '泱', '漭', '浮', '。', '鲛', '人', '献', '微', '绡', '，', '曾', '祝', '沈', '豪', '牛', '。', '百', '祥', '奔', '盛', '明', '，', '古', '先', '莫', '能', '俦', '。', '坡', '陀', '金', '虾', '蟆', '，', '出', '见', '盖', '有', '由', '。', '至', '尊', '顾', '之', '笑', '，'

In [9]:
def collate_fn(batch):
    x=[x[0]for x in batch]
    y=[x[1]for x in batch]
    lengths=[len(item[0])for item in batch]
    x=pad_sequence(x,padding_value=vocab.stoi['<pad>'])
    y=pad_sequence(y,padding_value=vocab.stoi['<pad>'])
    #print(type(x),type(y))
    return x.transpose(0,1),y.transpose(0,1),lengths

In [10]:
train_size=int(len(tangshi_set)*0.9)
test_size=len(tangshi_set)-train_size
train_set, test_set = torch.utils.data.random_split(tangshi_set, [train_size, test_size])
train_set=sorted(train_set,key=lambda x:-len(x[0]))
test_set=sorted(test_set,key=lambda x:-len(x[0]))
train_iter=DataLoader(dataset=train_set,batch_size=2,collate_fn=collate_fn)
test_iter=DataLoader(dataset=test_set,batch_size=2,collate_fn=collate_fn)
print([len(x[0])for x in train_set],[len(x[0])for x in test_set])

[893, 408, 337, 322, 321, 313, 289, 289, 283, 259, 246, 225, 217, 205, 193, 170, 163, 161, 157, 153, 149, 145, 145, 136, 134, 134, 130, 129, 121, 119, 106, 105, 102, 97, 95, 89, 84, 82, 81, 81, 75, 74, 73, 71, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 60, 59, 57, 57, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 17] [1108, 601, 167, 161, 133, 129, 94, 81, 65, 49, 49, 49, 49, 33, 33, 25, 25]


In [11]:
print('-'*60,'测试','-'*60)
for x,y,length in train_iter:
    #print('x shape',x.shape,'y shape',y.shape,length)
    #print(idx_to_char(x[0],vocab))
    #print(idx_to_char(y[0],vocab))
    #print(idx_to_char(x[1],vocab))
    #print(idx_to_char(y[1],vocab))
    #print(idx_to_char(x[3],vocab),x.shape,y.shape,length)
    print(length)
    break
    


------------------------------------------------------------ 测试 ------------------------------------------------------------
[893, 408]


In [12]:
print('-'*60,'测试','-'*60)
for x,y,length in test_iter:
    print('x shape',x.shape,'y shape',y.shape,length)
    print(idx_to_char(x[0],vocab))
    #print(idx_to_char(y[0],vocab))
    print(idx_to_char(x[1],vocab))
    #print(idx_to_char(y[1],vocab))
    break

------------------------------------------------------------ 测试 ------------------------------------------------------------
x shape torch.Size([2, 1108]) y shape torch.Size([2, 1108]) [1108, 601]
['<START>', '东', '山', '气', '鸿', '濛', '，', '宫', '殿', '居', '上', '头', '。', '君', '来', '必', '十', '月', '，', '树', '羽', '临', '九', '州', '。', '阴', '火', '煮', '玉', '泉', '，', '喷', '薄', '涨', '岩', '幽', '。', '有', '时', '浴', '赤', '日', '，', '光', '抱', '空', '中', '楼', '。', '阆', '风', '入', '辙', '迹', '，', '旷', '原', '延', '冥', '搜', '。', '沸', '天', '万', '乘', '动', '，', '观', '水', '百', '丈', '湫', '。', '幽', '灵', '斯', '可', '佳', '，', '王', '命', '官', '属', '休', '。', '初', '闻', '龙', '用', '壮', '，', '擘', '石', '摧', '林', '丘', '。', '中', '夜', '窟', '宅', '改', '，', '移', '因', '风', '雨', '秋', '。', '倒', '悬', '瑶', '池', '影', '，', '屈', '注', '苍', '江', '流', '。', '味', '如', '甘', '露', '浆', '，', '挥', '弄', '滑', '且', '柔', '。', '翠', '旗', '澹', '偃', '蹇', '，', '云', '车', '纷', '少', '留', '。', '箫', '鼓', '荡', '四', '溟', '，', '异', '香', '泱', '漭', '浮', '。', '鲛', '人', '

## Model&Trainning

In [13]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from tqdm import tqdm
import math

In [105]:
from torch.nn import init
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence
class Poetry(nn.Module):
    def __init__(self,vocab_size,input_dim,hidden_dim,device,layer_num):
        super(Poetry,self).__init__()
        self.input_dim=input_dim
        self.hidden_dim=hidden_dim
        self.device=device
        self.layer_num=layer_num
        self.embeddings=nn.Embedding(vocab_size,self.input_dim)
        self.lstm=nn.LSTM(input_dim,hidden_dim,num_layers=self.layer_num)
        self.linear=nn.Linear(hidden_dim,vocab_size)
        self.init_weights()
    def init_weights(self):
        initrange=0.1
        self.embeddings.weight.data.uniform_(-initrange,initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange,initrange)
    def forward(self,inputs):
        #inputs [batch,seq_len]
        batch,seq_len=inputs.shape
        inputs=inputs.long().transpose(1, 0).contiguous() #[seq_len,batch]
        embeds=self.embeddings(inputs)
        #print(embeds.shape)\
        """
        if batch>1:
            packed_embed=pack_padded_sequence(input=embeds,lengths=lengths)
            packed_output,hidden=self.lstm(packed_embed,hidden) 
            output,_=pad_packed_sequence(packed_output,padding_value=vocab.stoi['<pad>'])  #[seq_len,batch,hidden_dim]
        else:
            output,_=self.lstm(inputs)
        """
        output,_=self.lstm(embeds)
        #print('lstm output shape',output.shape)
        #print('output shape',output.shape)
        output=F.relu(self.linear(output.view(seq_len*batch,-1)))  #[seq_len*batch,vocab_size]
        return output

In [110]:
input_dim,hidden_dim,layer_num,weight_decay,lr,batch_size,num_epochs=128,128,2,1e-4,1e-4,4,250
train_iter=DataLoader(dataset=train_set,batch_size=batch_size,collate_fn=collate_fn)
test_iter=DataLoader(dataset=test_set,batch_size=batch_size,collate_fn=collate_fn)
#input_dim=256,hidden_dim=256,device=device,layer_num=2,lr=0.01,epoch_num=10,weight_dacay=1e-4
model=Poetry(len(vocab),input_dim=input_dim,hidden_dim=hidden_dim,device=device,layer_num=layer_num)
loss=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr)

In [111]:
def train(data_iter,test_iter,model,optimizer,loss,vocab,num_epochs):
    model=model.to(device)
    print('train on ',device)
    #hidden=model.init_hidden(2,2)
    for epoch in range(num_epochs):
        model.train()
        l_sum,n,train_acc_sum,start=0.0,0,0.0,time.time()
        for X,y,length in train_iter:
            #print(X,X.shape)
            X=X.to(device) #[batch,seq_len]
            y=y.to(device) #[batch,seq_len]
            X=X.long()
            y=y.long()
            optimizer.zero_grad()
            y=y.transpose(1,0).contiguous().view(-1)    #[seq_len*batch]
            output=model(X) #[seq_len*batch,vocab_size]
            #print('output shape',output.shape,'y shape',y.shape)
            l=loss(output,y)
            l.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)
            optimizer.step()
            l_sum+=l
        if (epoch+1)%20==0:
            test_loss=evaluate(model,test_iter)
            try:
                perplexity=math.exp(test_loss)
            except OverflowError:
                perplexity=float('inf')
            print('epoch %d,perplexity %f,l_sum %.2f,time %.2f sec'%(epoch+1,perplexity,l_sum,time.time()-start))


In [112]:
def evaluate(model,data_iter):
    model.eval()
    total_loss=0.
    total_count=0.
    loss=nn.CrossEntropyLoss()
    with torch.no_grad():
        for data,target,length in data_iter:
            data=data.long()
            target=target.long()
            data=data.to(device)    #[batch,seq_len]
            target=target.to(device)#[batch,seq_len]
            target=target.transpose(1,0).contiguous().view(-1)
            with torch.no_grad():
                output=model(data)#[seq_len*batch,vocab_size]
            l_test=loss(output,target)
            total_count+=np.multiply(*data.size())
            total_loss+=l_test.item()*np.multiply(*data.size())
    loss=total_loss/total_count
    return loss

In [113]:
train(train_iter,test_iter,model,optimizer,loss,vocab,num_epochs)

train on  cuda
epoch 20,perplexity 207.841626,l_sum 249.42,time 0.66 sec
epoch 40,perplexity 176.577606,l_sum 248.62,time 0.60 sec
epoch 60,perplexity 166.013004,l_sum 247.53,time 0.59 sec
epoch 80,perplexity 139.521936,l_sum 246.33,time 0.61 sec
epoch 100,perplexity 71.336388,l_sum 241.16,time 0.60 sec
epoch 120,perplexity 50.486976,l_sum 231.45,time 0.59 sec
epoch 140,perplexity 42.754248,l_sum 226.40,time 0.60 sec
epoch 160,perplexity 39.120439,l_sum 224.83,time 0.62 sec
epoch 180,perplexity 39.448221,l_sum 221.93,time 0.59 sec
epoch 200,perplexity 40.085843,l_sum 219.95,time 0.59 sec
epoch 220,perplexity 41.719248,l_sum 217.83,time 0.60 sec
epoch 240,perplexity 42.245241,l_sum 214.94,time 0.60 sec


在180epoch后困惑度开始上升，训练集的loss依然在下降，推测原因是训练集过拟合，由于训练集过小，导致训练次数不能够大，也有没有使用dropout层的原因