In [2]:
from tqdm import tqdm
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import nltk
nltk.download('cess_cat')

[nltk_data] Downloading package cess_cat to /root/nltk_data...
[nltk_data]   Unzipping corpora/cess_cat.zip.


True

In [3]:
from nltk.corpus import cess_cat as corpus

# Flatten the corpus into one long token stream in which every sentence is
# wrapped in explicit boundary markers.
words = []
# Tokens that are dropped from the stream wherever they occur.
words_to_remove = ['*0*', '-Fpa-', '-Fpt-']
# To debug or train quickly, iterate over corpus.sents()[:1000] instead.
for sentence in tqdm(corpus.sents()):
    # The final token of each sentence is replaced by '</s>'
    # (presumably it is the closing punctuation mark -- TODO confirm).
    kept = [tok for tok in sentence[:-1] if tok not in words_to_remove]
    words.append('<s>')
    words.extend(kept)
    words.append('</s>')


100%|██████████| 17104/17104 [00:13<00:00, 1230.79it/s]


In [4]:
from torch.utils.data import Dataset, DataLoader

class FixedWindow(Dataset):
    """Sliding fixed-length windows over a token stream for next-word prediction.

    Each window of ``length_window`` consecutive tokens is split into a context
    (the ids of the first ``length_window - 1`` tokens) and a target (the id of
    the last token of the window).

    Attributes:
        length_window: Total window size (context tokens plus one target).
        word2id: Mapping from token string to integer id.
        id2word: Inverse mapping, from integer id to token string.
        vocabulary_size: Number of distinct tokens in ``words``.
        ids: The whole token stream encoded as a list of integer ids.
    """

    def __init__(self, words, length_window):
        """Build the vocabulary and encode the token stream.

        Args:
            words: Sequence of token strings, in corpus order.
            length_window: Number of tokens per window, target included.
        """
        self.length_window = length_window
        # Sort the vocabulary so that ids are reproducible: the iteration order
        # of a set of strings changes between interpreter runs (hash
        # randomisation), which would silently pair a reloaded checkpoint with
        # a different word <-> id mapping.
        vocabulary = sorted(set(words))
        self.word2id = {w: i for i, w in enumerate(vocabulary)}
        self.id2word = {i: w for i, w in enumerate(vocabulary)}
        self.vocabulary_size = len(vocabulary)
        self.ids = [self.word2id[w] for w in words]

    def __len__(self):
        """Return the number of complete windows in the token stream."""
        # A stream of N tokens holds N - length_window + 1 complete windows;
        # clamp at zero for streams shorter than one window.
        return max(0, len(self.ids) - self.length_window + 1)

    def __getitem__(self, idx):
        """Return ``(context_ids, target_id)`` tensors for window ``idx``.

        Args:
            idx: Window index; negative values count from the end.

        Returns:
            A tuple of a 1-D tensor with ``length_window - 1`` context ids and
            a 0-D tensor with the id of the token that follows them.

        Raises:
            IndexError: If ``idx`` does not address a complete window.
        """
        num_windows = len(self)
        if idx < 0:
            idx += num_windows
        if not 0 <= idx < num_windows:
            raise IndexError('window index out of range')
        target_pos = idx + self.length_window - 1
        first_ids = self.ids[idx:target_pos]
        last_id = self.ids[target_pos]
        return torch.tensor(first_ids), torch.tensor(last_id)


# Each window holds `length_window` tokens: the leading ones are the context
# and the last one is the word to predict.
length_window = 5
dataset = FixedWindow(words, length_window)

# Smoke test: fetch one (context, target) pair.
x, y = dataset[10]

batch_size = 1000  # use 5 when debugging
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)  # shuffle=False when debugging

# Dump the first four batches, both as raw ids and decoded back to words.
for nbatch, (X, y) in enumerate(dataloader):
    print('batch {}'.format(nbatch))
    print('X = {}'.format(X))
    print('y = {}'.format(y))
    for context_ids, target_id in zip(X.tolist(), y.tolist()):
        decoded_context = [dataset.id2word[i] for i in context_ids]
        print(decoded_context, end=' ')
        print(dataset.id2word[target_id])
    if nbatch == 3:
        break


batch 0
X = tensor([[18263, 37255, 32300,  4300],
        [29100, 21007, 17871, 27653],
        [30793, 35077, 30399, 28282],
        ...,
        [ 4300, 12803,  2917,  8856],
        [35801, 27653, 23583, 38578],
        [ 4300, 22223, 37244, 21007]])
y = tensor([11592, 23583, 32300, 26225, 30049, 25849, 22419, 32300,  5147, 31450,
        36566,  2360, 11592, 38765, 32300, 32300,  2964, 10251, 23452, 21573,
        25849, 23583, 38925, 27653, 33125, 21007,   770, 25849,  4300,  5147,
        24943, 21007, 21007, 21007,  1802,  7152,  1179,  7912, 23583,  9884,
        27653, 30801, 12495, 10251, 21808, 30323, 11568, 25237, 25849, 23747,
        26421, 21663,  2636, 17028, 17355, 23583, 17473, 23583, 30821, 16052,
        12456, 21502,  4619,   770, 21808, 32300,  6577, 16385, 19056, 27653,
        27108, 25983, 30926, 32066, 32556, 29846, 13290, 30021, 18274, 38404,
        23583, 22937,  7300, 11102, 35918,  2917, 27653, 29405, 11028, 16369,
         1482, 24781, 25849, 16052,   77

In [11]:
class NNLM(nn.Module):
    """Feed-forward neural language model over a fixed number of context words.

    The context embeddings are concatenated into one vector ``x`` and the
    output scores are ``bias + hidden3(x) + hidden2(tanh(ones + hidden1(x)))``,
    i.e. a tanh hidden path plus a direct linear path from the embeddings.
    The scores are returned unnormalised (no softmax is applied here).

    Args:
        num_classes: Size of the embedding table and of the output layer.
        dim_input: Number of context word ids per example.
        dim_hidden: Width of the tanh hidden layer.
        dim_embedding: Size of each word embedding.
    """

    def __init__(self, num_classes, dim_input, dim_hidden, dim_embedding):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_hidden = dim_hidden
        self.dim_embedding = dim_embedding

        # Width of the concatenated context embeddings.
        concat_width = dim_input * dim_embedding

        # Look-up table turning word ids into embeddings.
        self.embeddings = nn.Embedding(num_classes, dim_embedding)
        # Hidden path: concatenated embeddings -> hidden units; `ones` is its
        # learnable bias, initialised to 1.
        self.hidden1 = nn.Linear(concat_width, dim_hidden, bias=False)
        self.ones = nn.Parameter(torch.ones(dim_hidden))
        # Hidden units -> output scores.
        self.hidden2 = nn.Linear(dim_hidden, num_classes, bias=False)
        # Direct path: concatenated embeddings -> output scores.
        self.hidden3 = nn.Linear(concat_width, num_classes, bias=False)
        # Learnable output bias, initialised to 1.
        self.bias = nn.Parameter(torch.ones(num_classes))

    def forward(self, X):
        """Return one score per class for each row of context word ids in ``X``."""
        # Concatenate the embeddings of each example into a single row.
        flat = self.embeddings(X).view(-1, self.dim_input * self.dim_embedding)
        hidden = torch.tanh(self.hidden1(flat) + self.ones)
        direct = self.hidden3(flat) + self.bias
        return direct + self.hidden2(hidden)


# --- Model dimensions ---
num_classes = dataset.vocabulary_size  # one output score per vocabulary entry
dim_input = length_window - 1          # context words fed to the model per example
dim_embedding = 32
dim_hidden = 50

# --- Optimisation settings ---
learning_rate = 1e-3
num_epochs = 60

# --- Run control ---
path = 'NNLM.pt'  # checkpoint written after training, read back when do_train is False
do_train = True
do_test = True

# --- Model, loss and optimizer ---
model = NNLM(num_classes, dim_input, dim_hidden, dim_embedding)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
# To get a GPU: in the top menu go to Runtime -> Change runtime type and set
# Hardware accelerator to GPU.
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)  # shows which device was selected
model = model.to(device)

cuda:0


In [13]:
from torch.cuda.random import device_count
# Train the model and save its weights, or restore previously saved weights.
if do_train:
    size = len(dataloader.dataset)
    # Switch to training mode explicitly: the generation cell calls
    # model.eval(), so re-running this cell afterwards would otherwise train
    # a model that is still in eval mode.
    model.train()
    for epoch in range(num_epochs):
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)

            # Forward pass and loss.
            pred = model(X)
            loss = loss_fn(pred, y)

            # Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Report progress every 100 batches.
            if batch % 100 == 0:
                current = batch * batch_size
                print('Epoch {} loss: {:>7f}  [{:>5d}/{:>5d}]'
                    .format(epoch+1, loss.item(), current, size))

    # NOTE: only the weights are stored; the word <-> id mapping of `dataset`
    # must be the same one when the checkpoint is loaded again.
    torch.save({'model_state_dict': model.state_dict()}, path)
else:
    # map_location lets a checkpoint saved from a GPU be restored on whatever
    # device is in use now (e.g. a CPU-only machine).
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

Epoch 1 loss: 10.863346  [    0/509115]
Epoch 1 loss: 7.100326  [100000/509115]
Epoch 1 loss: 6.709040  [200000/509115]
Epoch 1 loss: 6.421080  [300000/509115]
Epoch 1 loss: 6.210479  [400000/509115]
Epoch 1 loss: 6.266020  [500000/509115]
Epoch 2 loss: 5.700747  [    0/509115]
Epoch 2 loss: 5.724219  [100000/509115]
Epoch 2 loss: 5.642992  [200000/509115]
Epoch 2 loss: 5.635747  [300000/509115]
Epoch 2 loss: 5.818430  [400000/509115]
Epoch 2 loss: 5.393681  [500000/509115]
Epoch 3 loss: 5.005529  [    0/509115]
Epoch 3 loss: 5.022326  [100000/509115]
Epoch 3 loss: 5.185536  [200000/509115]
Epoch 3 loss: 5.042937  [300000/509115]
Epoch 3 loss: 5.038917  [400000/509115]
Epoch 3 loss: 4.947414  [500000/509115]
Epoch 4 loss: 4.616914  [    0/509115]
Epoch 4 loss: 4.566894  [100000/509115]
Epoch 4 loss: 4.880812  [200000/509115]
Epoch 4 loss: 4.810590  [300000/509115]
Epoch 4 loss: 4.743593  [400000/509115]
Epoch 4 loss: 4.783231  [500000/509115]
Epoch 5 loss: 4.373054  [    0/509115]
Epoc

In [15]:
# Generate text by repeatedly sampling the next word from the model.
if do_test:
    num_sentences = 5    # stop after this many '</s>' tokens have been sampled...
    max_num_words = 100  # ...or once this many tokens exist, whichever comes first

    # Seed context; it must contain exactly as many words as the model reads.
    generated_words = ['<s>', 'El', 'dia', 'que']
    assert len(generated_words)==dim_input # length_window-1
    nsent = 0

    model.eval()
    with torch.no_grad():
        while nsent < num_sentences and len(generated_words) < max_num_words:
            # Encode the most recent dim_input words as a batch of one example.
            context = generated_words[-dim_input:]
            context_ids = torch.tensor([dataset.word2id[w] for w in context])
            pred = model(context_ids.unsqueeze(0).to(device))

            # Sample from the predicted distribution (rather than taking
            # torch.argmax(pred)).
            probs = pred.softmax(dim=1)
            output_id = torch.multinomial(probs, 1)
            output_word = dataset.id2word[output_id.item()]
            generated_words.append(output_word)
            if output_word == '</s>':
                nsent += 1

    # Detokenise: turn sentence markers into full stops, re-attach elided
    # articles, tidy commas and replace underscores by spaces.
    generated_text = ' '.join(generated_words)
    for marker, replacement in ((' </s> <s>', '.'), ('<s> ', ''), (' </s>', '.')):
        generated_text = generated_text.replace(marker, replacement)
    for s in [' l\' ',' s\' ',' d\' ',]:
        generated_text = generated_text.replace(s, s[:-1])
    for old, new in ((' , ', ', '), ('_', ' ')):
        generated_text = generated_text.replace(old, new)
    print(generated_text)

El dia que cada dia se porta el judici per facilitar la participació ", va declarar ahir en l'extinció sinó que es va fer fugir del cotxe el 99. ETA qüestions continua entrevistes a l'antic principal indemnització que jo podem defensar tot la seva obra, ell ha reconegut que amb el CPC " compartim la seva voluntat de dialogar amb els futbolistes. Aquests Lleida, que s'aniran convocant alternativament a l'art de Catalunya, com ara la que presentava esforç en horari del cementiri dels conductors - : " La dirigir es basa
