In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torch.autograd import Variable
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from nltk.corpus import gutenberg
from string import punctuation
import nltk

In [2]:
torch.manual_seed(16)

<torch._C.Generator at 0x7ff2e00902b0>

In [14]:
class MyDataset(Dataset):
    def __init__(self, data_path, label_path):
        
        self.data_path = data_path
        self.label_path = label_path
        
        with open(self.data_path, 'r') as f:
            sents = f.read().splitlines()
        
        tokenizer = text.Tokenizer()
        tokenizer.fit_on_texts(sents)    
        self.word2id = tokenizer.word_index
        self.word2id['<pad>'] = 0
        self.id2word = {v:k for k, v in self.word2id.items()}
        self.vocab_size = len(self.word2id)
        
        print(self.word2id)
        
        self.sents = [sent.split() for sent in sents]
        self.sents = [[self.word2id[word] for word in sent] for sent in self.sents]
        
        self.sents = sequence.pad_sequences(self.sents, maxlen=32, padding="post")
                
        
        with open(self.label_path, 'r') as f:
            labels = f.read().splitlines()
        
        self.labels = [list(map(int,label.split())) for label in labels]
        self.labels = sequence.pad_sequences(self.labels, maxlen=32, padding="post", value=3)
        
        
    def __getitem__(self, index):
        
        return {'data':self.sents[index], 'label':self.labels[index]}
    
    def __len__(self):
        return len(self.labels)

In [55]:
dataset = MyDataset(data_path ='./train.txt', label_path ='./label.txt')

{'i': 1, 'hygge': 2, 'you': 3, 'day': 4, 'like': 5, 'miss': 6, 'how': 7, 'is': 8, 'your': 9, 'good': 10, '<pad>': 0}


In [56]:
len(dataset)

4

In [57]:
dataset[0]

{'data': array([1, 5, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 'label': array([0, 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)}

## testing with batchsize dataset

In [58]:
def dataset_batch_iter(dataset, batch_size):
    b_words = []
    b_labels = []
    for data in dataset:
        b_words.append(data['data'])
        b_labels.append(data['label'])
        
        if len(b_words) == batch_size:
            yield {'data':np.array(b_words, dtype=int), 'label':np.array(b_labels, dtype=int)}
            b_words, b_labels = [], []
    

In [59]:
for batch, data in enumerate(dataset_batch_iter(dataset, 2)):
    print(batch)
    print(data['data'])
    print(data['label'])
    break
    

0
[[1 5 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 6 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
 [0 0 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]]


## Model

In [60]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size,
                 hidden_dim, n_layers):
        
        super(RNNModel, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.output_size = output_size
        self.embedding_size = embedding_size
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_size)
        
        
        self.rnn = nn.RNN(embedding_size, hidden_dim, n_layers, batch_first=True)   
        
        self.fc = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x, hidden):
        embedded = self.embedding(x)
        
        
        output, hidden = self.rnn(embedded, hidden)
        
        output = self.fc(output)
        prob = self.softmax(output)
        
        
        return prob, hidden
    
    def init_hidden(self, batch_size):
       
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [61]:
vocab_size = dataset.vocab_size
embedding_size = 16
num_categories = 4
hidden_dim = 16
n_layers = 1
learning_rate = 0.001
batch_size = 2
length = 32

In [62]:
model = RNNModel(vocab_size, embedding_size, num_categories, hidden_dim, n_layers)

In [63]:
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [64]:
nll_loss = nn.NLLLoss()

In [86]:
def test(model, test_dataset, batch_size = 1):
    model.eval()
    test_loss = 0.
    hidden = model.init_hidden(batch_size)
    
    for batch, data in enumerate(dataset_batch_iter(test_dataset, batch_size)):
        input_tensor = torch.Tensor(data['data']).type(torch.LongTensor)
        target_tensor = torch.Tensor(data['label']).type(torch.LongTensor)
    
        output, hidden = model(input_tensor, hidden)
        prediction = output.argmax(dim = -1)
        
        loss = nll_loss(output.view(-1, num_categories), target_tensor.view(-1))
            
        test_loss += loss.item()
        
        accuracy = torch.sum(prediction == target_tensor)
        accuracy = accuracy/(batch_size*length)
    return test_loss, accuracy

In [87]:
def train_epoch(model, train_dataset, batch_size):
    # training
    model.train()
    hidden = model.init_hidden(batch_size)
    train_loss = 0.
    for batch, data in enumerate(dataset_batch_iter(train_dataset, batch_size)):
        input_tensor = torch.Tensor(data['data']).type(torch.LongTensor)
        target_tensor = torch.Tensor(data['label']).type(torch.LongTensor)
        
        optimizer.zero_grad()
        output, hidden = model(input_tensor, hidden)
        hidden = Variable(hidden.data, requires_grad=True)

        loss = nll_loss(output.view(-1, num_categories), target_tensor.view(-1))
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    return train_loss

In [89]:
def infere(model, input_tensor):
    # infering
    model.eval()
    batch_size = input_tensor.shape[0]
    hidden = model.init_hidden(batch_size)
    output, _ = model(input_tensor, hidden)
    prediction = output.argmax(dim=-1)
    return prediction

In [90]:
def train(epochs, train_dataset, test_dataset):
    
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_dataset, batch_size)
        
        if epoch % 10 == 0:
            test_loss, accuracy = test(model, test_dataset, batch_size)
            print(f"Epoch {epoch} --train loss {train_loss} -- test loss {test_loss}-- test acc {accuracy}")

In [91]:
train(101, dataset, dataset)

Epoch 0 --train loss 5.948636054992676 -- test loss 5.948574542999268-- test acc 0.984375
Epoch 10 --train loss 5.9481964111328125 -- test loss 5.948201894760132-- test acc 0.984375
Epoch 20 --train loss 5.948175430297852 -- test loss 5.948178052902222-- test acc 0.984375
Epoch 30 --train loss 5.948169946670532 -- test loss 5.948172092437744-- test acc 0.984375
Epoch 40 --train loss 5.948162078857422 -- test loss 5.948161363601685-- test acc 0.984375
Epoch 50 --train loss 5.948155641555786 -- test loss 5.948159217834473-- test acc 0.984375
Epoch 60 --train loss 5.948153734207153 -- test loss 5.948153018951416-- test acc 0.984375
Epoch 70 --train loss 5.9481518268585205 -- test loss 5.948151111602783-- test acc 0.984375
Epoch 80 --train loss 5.948145389556885 -- test loss 5.948149681091309-- test acc 0.984375
Epoch 90 --train loss 5.94814395904541 -- test loss 5.948143482208252-- test acc 0.984375
Epoch 100 --train loss 5.948137998580933 -- test loss 5.948137521743774-- test acc 0.98437

In [92]:
prediction = infere(model, input_tensor)

In [93]:
prediction.shape

torch.Size([2, 32])

In [94]:
print(prediction)

tensor([[1, 0, 0, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3],
        [0, 1, 0, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3]])


In [95]:
target_tensor

tensor([[0, 0, 0, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3],
        [0, 1, 0, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3]])