In [36]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

# To keep experiment results reproducible, every random number generator
# (Python, NumPy, PyTorch CPU and, when present, PyTorch CUDA) is seeded with
# the same fixed value.
SEED = 53113
USE_CUDA = torch.cuda.is_available()

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    torch.cuda.manual_seed(SEED)

# Hyper-parameters shared by the cells below.
BATCH_SIZE = 32          # number of parallel text streams per batch
EMBEDDING_SIZE = 100     # word-embedding dimension
HIDDEN_SIZE = 100        # LSTM hidden-state dimension
MAX_VOCAB_SIZE = 50000   # cap passed to TEXT.build_vocab(max_size=...)



In [37]:
# Text field shared by all three splits; lower=True requests lower-casing.
TEXT = torchtext.data.Field(lower=True)

# Load the train / validation / test files from the current directory as
# language-modeling datasets that all use the TEXT field.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path='.',
    train="text8.train.txt",
    validation="text8.dev.txt",
    test="text8.test.txt",
    text_field=TEXT,
)


In [38]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

In [39]:
# Build the vocabulary from the training split only, capped at MAX_VOCAB_SIZE.
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)

In [40]:
# One BPTT iterator per split: each batch carries .text and .target tensors
# covering bptt_len=50 time steps for BATCH_SIZE parallel streams.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    bptt_len=50,
    repeat=False,
    shuffle=True,
)

In [41]:
# Keep the iterator in `it` so that later cells can keep pulling consecutive
# batches from the same pass over the training data.
it = iter(train_iter)
# First batch of the pass; inspected in the following cells.
batch = next(it)

In [42]:
# Display the batch summary (its repr lists the sizes of .text and .target).
batch


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]

In [43]:
# Decode column 0 of the batch (one text stream) back into words. The target
# is the same stream shifted one word ahead of the input.
input_words = [TEXT.vocab.itos[idx] for idx in batch.text[:, 0].data.cpu()]
target_words = [TEXT.vocab.itos[idx] for idx in batch.target[:, 0].data.cpu()]

print(' '.join(input_words))
print()
print(' '.join(target_words))


anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the

originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization


In [44]:
# Pull five more batches from the same iterator and decode column 0 of each,
# to check that consecutive batches continue the same text stream.
for i in range(5):
    batch = next(it)
    input_words = [TEXT.vocab.itos[idx] for idx in batch.text[:, 0].data.cpu()]
    target_words = [TEXT.vocab.itos[idx] for idx in batch.target[:, 0].data.cpu()]
    print(i)
    print(' '.join(input_words))
    print()
    print(' '.join(target_words))
    

0
organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing

of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations
1
interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or <unk> but rather a harmonious anti authoritarian society in place of what are regarded

of what this means anarchism also refers to rela

In [75]:
import torch.nn as nn

class RNNModel(nn.Module):
    """Single-layer LSTM language model: embedding -> LSTM -> linear projection.

    Args:
        vocab_size: number of embedding rows and width of the output logits.
        embed_size: dimension of the word embeddings fed to the LSTM.
        hidden_size: dimension of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size

    def forward(self, text, hidden):
        """Compute vocabulary logits for every position of ``text``.

        Args:
            text: token indices of shape (seq_length, batch_size).
            hidden: ``(h, c)`` tuple, each of shape (1, batch_size, hidden_size).

        Returns:
            ``(out_vocab, hidden)``: logits of shape
            (seq_length, batch_size, vocab_size) and the updated ``(h, c)`` tuple.
        """
        emb = self.embed(text)
        # output: (seq_length, batch_size, hidden_size)
        output, hidden = self.lstm(emb, hidden)
        # Flatten time and batch so the linear layer sees one row per token,
        # then restore the (seq_length, batch_size, vocab_size) layout.
        out_vocab = self.linear(output.view(-1, output.shape[2]))
        out_vocab = out_vocab.view(output.size(0), output.size(1), out_vocab.size(-1))
        return out_vocab, hidden

    def init_hidden(self, bsz, requires_grad=True):
        """Return a zero ``(h, c)`` state for a batch of ``bsz`` sequences.

        The tensors are created from a model parameter so they share its dtype
        and device. ``requires_grad`` is forwarded to both tensors; previously
        it was ignored and both were always created with ``requires_grad=True``.
        """
        weight = next(self.parameters())
        return (weight.new_zeros((1, bsz, self.hidden_size), requires_grad=requires_grad),
                weight.new_zeros((1, bsz, self.hidden_size), requires_grad=requires_grad))

In [76]:
# Size the model from the built vocabulary and the configured dimensions.
model = RNNModel(
    vocab_size=len(TEXT.vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
)

# nn.Module.to moves the parameters in place and returns the same module,
# so no re-assignment is needed.
if USE_CUDA:
    model.to(device)



In [77]:
def repackage_hidden(h):
    """Detach hidden state(s) from the graph that produced them.

    A tensor is returned detached; any other iterable (e.g. the LSTM's
    ``(h, c)`` tuple) is processed recursively and returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [85]:
# Cross-entropy over the vocabulary logits, optimized with Adam.
learning_rate = 0.001
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Each scheduler.step() multiplies the learning rate by gamma (here 0.5).
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

In [79]:
# Size of the vocabulary built from the training split; used below to flatten
# the model's logits to (tokens, VOCAB_SIZE) before computing the loss.
VOCAB_SIZE = len(TEXT.vocab)

In [86]:
def evaluate(model, data):
    """Return the token-weighted mean loss of ``model`` over ``data``.

    Relies on the module-level ``loss_fn``, ``VOCAB_SIZE``, ``BATCH_SIZE`` and
    ``repackage_hidden``.

    Args:
        model: model exposing ``init_hidden`` and a ``(text, hidden)`` forward.
        data: iterable of batches exposing ``.text`` and ``.target``.

    Returns:
        Sum over batches of (batch loss * tokens in batch), divided by the
        total number of tokens seen.

    The model is switched to eval mode for the pass and back to train mode
    before returning.
    """
    model.eval()
    total_loss = 0.
    total_count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for batch in data:
            # Distinct local names so the ``data`` argument is not shadowed.
            text, target = batch.text, batch.target
            hidden = repackage_hidden(hidden)
            output, hidden = model(text, hidden)

            # logits: (tokens, VOCAB_SIZE); targets: (tokens,)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            num_tokens = text.numel()
            # Accumulate across batches. Plain assignment here used to discard
            # every batch but the last one.
            total_loss += loss.item() * num_tokens
            total_count += num_tokens

    loss = total_loss / total_count
    model.train()

    return loss

In [None]:
NUM_EPOCHS = 2
GRAD_CLIP = 5.0   # max gradient norm passed to clip_grad_norm_
val_losses = []   # validation loss recorded at every evaluation point

for epoch in range(NUM_EPOCHS):
    model.train()
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target
        # Carry the hidden state over from the previous batch, but detach it
        # so gradients do not flow back into earlier batches.
        hidden = repackage_hidden(hidden)

        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        # logits: (tokens, VOCAB_SIZE); targets: (tokens,)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if i % 100 == 0:
            print("loss", loss.item())

        # Validate only every 10000 iterations (including iteration 0).
        if i % 10000 != 0:
            continue

        val_loss = evaluate(model, val_iter)
        improved = not val_losses or val_loss < min(val_losses)
        if improved:
            # Checkpoint whenever validation loss reaches a new minimum.
            torch.save(model.state_dict(), 'lm.pth')
            print("best model saved to lm.pth")
        else:
            # No improvement: decay the learning rate.
            scheduler.step()

        val_losses.append(val_loss)

loss 5.4609575271606445
best model saved to lm.pth
loss 5.506231307983398
loss 5.3612494468688965
loss 5.777894973754883
