In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torch.autograd import Variable
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from nltk.corpus import gutenberg
from string import punctuation
import nltk

In [2]:
torch.manual_seed(16)

<torch._C.Generator at 0x7f0c98032290>

In [3]:
class MyDataset(Dataset):
    def __init__(self, data_path, label_path):
        
        self.data_path = data_path
        self.label_path = label_path
        
        with open(self.data_path, 'r') as f:
            sents = f.read().splitlines()
        
        
        self.sents = [sent.split() for sent in sents]
        
        self.word2id = dict()
        i = 1
        for sent in self.sents:
            for word in sent:
                if word not in self.word2id:
                    self.word2id[word] = i
                    i+=1
        self.word2id['<pad>'] = 0
        self.id2word = {v:k for (k,v) in self.word2id.items()}
        
        self.vocab_size = len(self.word2id)
        
        self.sents = [[self.word2id[word] for word in sent] for sent in self.sents]
        
        self.sents = sequence.pad_sequences(self.sents, maxlen=32, padding="post")
                
        
        with open(self.label_path, 'r') as f:
            labels = f.read().splitlines()
        
        self.labels = [list(map(int,label.split())) for label in labels]
        self.labels = sequence.pad_sequences(self.labels, maxlen=32, padding="post", value=3)
        
        
    def __getitem__(self, index):
        
        return {'data':self.sents[index], 'label':self.labels[index]}
    
    def __len__(self):
        return len(self.labels)

In [5]:
traindataset = MyDataset(data_path ='./data/traindata.txt', label_path ='./data/trainlabel.txt')

In [9]:
with open('./data/traindata.txt', 'r') as f:
    traindata = f.read().splitlines()
with open('./data/trainlabel.txt', 'r') as f:
    trainlabel = f.read().splitlines()

len(traindata), len(trainlabel)

(617, 617)

In [10]:
with open('./data/testdata.txt', 'r') as f:
    testdata = f.read().splitlines()
with open('./data/testlabel.txt', 'r') as f:
    testlabel = f.read().splitlines()

len(testdata), len(testlabel)

(545, 545)

In [13]:
data = traindata + testdata
label = trainlabel + testlabel

In [14]:
len(data), len(label)

(1162, 1162)

In [21]:
with open('./data/small_traintext.txt','w') as f:
    for i in data[:1000]:
        f.write(i)
        f.write('\n')
with open('./data/small_trainlabel.txt','w') as f:
    for i in label[:1000]:
        f.write(i)
        f.write('\n')

In [22]:
with open('./data/small_validtext.txt','w') as f:
    for i in data[1000:]:
        f.write(i)
        f.write('\n')
with open('./data/small_validlabel.txt','w') as f:
    for i in label[1000:]:
        f.write(i)
        f.write('\n')

## testing with batchsize dataset

In [46]:
def dataset_batch_iter(dataset, batch_size):
    b_words = []
    b_labels = []
    for data in dataset:
        b_words.append(data['data'])
        b_labels.append(data['label'])
        
        if len(b_words) == batch_size:
            yield {'data':np.array(b_words, dtype=int), 'label':np.array(b_labels, dtype=int)}
            b_words, b_labels = [], []
    

In [47]:
for batch, data in enumerate(dataset_batch_iter(dataset, 2)):
    print(batch)
    print(data['data'])
    print(data['label'])
    break
    

0
[[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17  9 18  2 13 14 19 20
  21 22 23 24 25  0  0  0]
 [26 27  4 28 29 30 31 32 33 34 35 36 37 38 39 40 30 41 42 43 44 45 36 44
  46 45 47 48 49 50  0  0]]
[[0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 2 3 3 3]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 2 3 3]]


## Model

In [48]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size,
                 hidden_dim, n_layers):
        
        super(RNNModel, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.output_size = output_size
        self.embedding_size = embedding_size
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_size)
        
        
        self.rnn = nn.RNN(embedding_size, hidden_dim, n_layers, batch_first=True)   
        
        self.fc = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x, hidden):
        embedded = self.embedding(x)
        
        
        output, hidden = self.rnn(embedded, hidden)
        
        output = self.fc(output)
        prob = self.softmax(output)
        
        
        return prob, hidden
    
    def init_hidden(self, batch_size):
       
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [49]:
vocab_size = dataset.vocab_size
embedding_size = 16
num_categories = 4
hidden_dim = 16
n_layers = 1
learning_rate = 0.001
batch_size = 2
length = 32

In [50]:
model = RNNModel(vocab_size, embedding_size, num_categories, hidden_dim, n_layers)

In [51]:
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [52]:
nll_loss = nn.NLLLoss()

In [53]:
def test(model, test_dataset, batch_size = 1):
    model.eval()
    test_loss = 0.
    hidden = model.init_hidden(batch_size)
    
    for batch, data in enumerate(dataset_batch_iter(test_dataset, batch_size)):
        input_tensor = torch.Tensor(data['data']).type(torch.LongTensor)
        target_tensor = torch.Tensor(data['label']).type(torch.LongTensor)
    
        output, hidden = model(input_tensor, hidden)
        prediction = output.argmax(dim = -1)
        
        loss = nll_loss(output.view(-1, num_categories), target_tensor.view(-1))
            
        test_loss += loss.item()
        
        accuracy = torch.sum(prediction == target_tensor)
        accuracy = accuracy/(batch_size*length)
    return test_loss, accuracy

In [54]:
def train_epoch(model, train_dataset, batch_size):
    # training
    model.train()
    hidden = model.init_hidden(batch_size)
    train_loss = 0.
    for batch, data in enumerate(dataset_batch_iter(train_dataset, batch_size)):
        input_tensor = torch.Tensor(data['data']).type(torch.LongTensor)
        target_tensor = torch.Tensor(data['label']).type(torch.LongTensor)
        
        optimizer.zero_grad()
        output, hidden = model(input_tensor, hidden)
        hidden = Variable(hidden.data, requires_grad=True)

        loss = nll_loss(output.view(-1, num_categories), target_tensor.view(-1))
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    return train_loss

In [55]:
def infere(model, input_tensor):
    # infering
    model.eval()
    batch_size = input_tensor.shape[0]
    hidden = model.init_hidden(batch_size)
    output, _ = model(input_tensor, hidden)
    prediction = output.argmax(dim=-1)
    return prediction

In [56]:
def train(epochs, train_dataset, test_dataset):
    
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_dataset, batch_size)
        
        if epoch % 10 == 0:
            test_loss, accuracy = test(model, test_dataset, batch_size)
            print(f"Epoch {epoch} --train loss {train_loss} -- test loss {test_loss}-- test acc {accuracy}")

In [58]:
def restore(tokens, punct):
    id2word = dataset.id2word
    convert = {0:'', 1:',', 2:'.', 3:''}
    seq = [id2word[token]+convert[punct[i]] for i, token in enumerate(tokens)]
    seq = ' '.join(seq)
    return seq

In [57]:
train(101, dataset, dataset)

Epoch 0 --train loss 993.0128664970398 -- test loss 971.1010429859161-- test acc 0.5625
Epoch 10 --train loss 921.256861448288 -- test loss 917.9510951042175-- test acc 0.75
Epoch 20 --train loss 892.2851436138153 -- test loss 889.129175901413-- test acc 0.796875
Epoch 30 --train loss 870.1857359409332 -- test loss 868.4203298091888-- test acc 0.859375
Epoch 40 --train loss 855.3676483631134 -- test loss 853.4807364940643-- test acc 0.875
Epoch 50 --train loss 845.1314046382904 -- test loss 843.9620831012726-- test acc 0.890625
Epoch 60 --train loss 837.943708896637 -- test loss 840.8105971813202-- test acc 0.921875
Epoch 70 --train loss 834.7703297138214 -- test loss 832.6245355606079-- test acc 0.90625
Epoch 80 --train loss 829.7893669605255 -- test loss 829.5500228404999-- test acc 0.96875
Epoch 90 --train loss 827.6537466049194 -- test loss 828.8166174888611-- test acc 0.984375
Epoch 100 --train loss 826.2723476886749 -- test loss 825.8633854389191-- test acc 0.984375


In [62]:
for batch, data in enumerate(dataset_batch_iter(dataset, 1)):
    data, label = data['data'], data['label']
    tokens = list(data[0])
    label = list(label[0])
    data = torch.Tensor(data).type(torch.LongTensor)
    pred = infere(model, data)
    pred = np.array(pred[0])
    print("true sent:")
    print(restore(tokens, label))
    print("--------")
    print("prediction:")
    print(restore(tokens, pred))
    break

true sent:
do đó, khi các doanh_nghiệp tiếp_cận sự cải_tiến, họ có xu_hướng đưa những người tốt nhất của họ vào đó, những người đã thể_hiện sở_trường để nhận được kết_quả. <pad> <pad> <pad>
--------
prediction:
do đó, khi các doanh_nghiệp tiếp_cận sự cải_tiến, họ có xu_hướng đưa những người tốt nhất của họ vào đó, những người đã thể_hiện sở_trường để nhận được kết_quả. <pad> <pad> <pad>
