In [1]:
import os
import pickle

import torch
import torch.nn as nn
import torch.utils.data as tud
from tqdm.notebook import tqdm

from seq2seq import Seq2SeqEmbeddingsConcatFullTeacherForcing, Seq2Seq

In [2]:
# Root directory holding the vocabularies and the "char/" tensor files.
# The NSP_DATA_DIR environment variable overrides the original location so the
# notebook can run on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build paths with plain string concatenation (folder + "chars.vocab").
folder = os.path.join(
    os.environ.get("NSP_DATA_DIR", "/home/jarobyte/scratch/inaoe/nsp/"), ""
)

In [3]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [4]:
# Load the character-level and word-level vocabularies from disk.
# The files are opened in `with` blocks so the handles are closed right after
# unpickling instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point `folder` at vocabulary files you trust.
with open(folder + "chars.vocab", "rb") as chars_file:
    chars_vocab = pickle.load(chars_file)
with open(folder + "words.vocab", "rb") as words_file:
    words_vocab = pickle.load(words_file)

# model

In [5]:
# Load the training source tensor directly onto the selected device.
# map_location remaps the stored tensor at load time, so a file that was
# saved from CUDA memory still loads when `device` falls back to the CPU.
source = torch.load(folder + "char/source_train.pt", map_location=device)
source.shape

torch.Size([4499, 130])

In [6]:
# Number of non-zero entries in the source tensor
# (index 0 is presumably the <PAD> id -- TODO confirm against chars_vocab).
source.ne(0).sum()

tensor(273719, device='cuda:0')

In [7]:
# Load the training target tensor directly onto the selected device.
# map_location remaps the stored tensor at load time, so a file that was
# saved from CUDA memory still loads when `device` falls back to the CPU.
target = torch.load(folder + "char/target_train.pt", map_location=device)
target.shape

torch.Size([4499, 130])

In [8]:
# Number of non-zero entries in the target tensor
# (index 0 is presumably the <PAD> id -- TODO confirm against chars_vocab).
target.ne(0).sum()

tensor(288726, device='cuda:0')

In [9]:
# Pair each source row with its target row so they are indexed together.
dataset = tud.TensorDataset(source, target)
print(len(dataset))

# Hold out a fixed number of examples for development and derive the training
# size from the dataset length, so the split stays valid if the data changes.
dev_size = 499
# A seeded generator makes the train/dev membership identical on every
# Restart & Run All; without it every run evaluates on a different dev set.
split_seed = 0
train, dev = tud.random_split(
    dataset,
    [len(dataset) - dev_size, dev_size],
    generator=torch.Generator().manual_seed(split_seed),
)
print(len(train))
print(len(dev))

4499
4000
499


In [10]:
# Count of non-zero target entries among the rows assigned to the train split.
# tensors[1] is the second tensor given to TensorDataset, i.e. the targets.
train.dataset.tensors[1][train.indices].ne(0).sum()

tensor(255828, device='cuda:0')

In [11]:
# Fraction of non-zero target entries in the train split
# (mean of the 0/1 mask after casting it to float).
train.dataset.tensors[1][train.indices].ne(0).float().mean()

tensor(0.4920, device='cuda:0')

In [12]:
# Count of non-zero target entries among the rows assigned to the dev split.
# tensors[1] is the second tensor given to TensorDataset, i.e. the targets.
dev.dataset.tensors[1][dev.indices].ne(0).sum()

tensor(32898, device='cuda:0')

In [13]:
# Fraction of non-zero target entries in the dev split
# (mean of the 0/1 mask after casting it to float).
dev.dataset.tensors[1][dev.indices].ne(0).float().mean()

tensor(0.5071, device='cuda:0')

In [25]:
# --- Model size -------------------------------------------------------------
encoder_embedding_dim, decoder_embedding_dim = 128, 128
encoder_hidden_dim, decoder_hidden_dim = 128, 128
encoder_layers, decoder_layers = 1, 1
dropout = 0.5

# --- Optimisation settings --------------------------------------------------
epochs = 2000
lr = 1e-3
weight_decay = 0.0

# --- Data loaders -----------------------------------------------------------
# Only the training loader shuffles; the dev loader keeps a fixed order.
train_loader = tud.DataLoader(train, batch_size=2000, shuffle=True)
dev_loader = tud.DataLoader(dev, batch_size=500)

# Release cached GPU memory (e.g. left over from a previous run of this cell)
# before the new network is created.
torch.cuda.empty_cache()

# --- Model ------------------------------------------------------------------
# The same character vocabulary is passed for both the first and second
# vocabulary arguments.
net = Seq2SeqEmbeddingsConcatFullTeacherForcing(
    chars_vocab["char2i"],
    chars_vocab["char2i"],
    encoder_embedding_dim,
    decoder_embedding_dim,
    encoder_hidden_dim,
    encoder_layers,
    decoder_hidden_dim,
    decoder_layers,
    dropout,
)
net.to(device)

# fit returns three histories, unpacked here as the loss and the train/dev
# accuracies.
losses, train_accuracies, dev_accuracies = net.fit(
    train_loader, dev_loader, epochs, lr, weight_decay
)

This net has 410,150 parameters.
Training process:
epoch | train loss | train accuracy | dev accuracy | non-pad train accuracy | non-pad dev accuracy


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2001.0), HTML(value='')))

    0    3.101           51.20           49.67              0.00                    0.00           
  200    0.882           73.93           72.70             46.58                   45.75           
  400    0.745           78.21           76.72             55.34                   53.75           
  600    0.682           80.33           78.38             59.70                   57.04           
  800    0.641           81.89           79.57             62.89                   59.41           
 1000    0.601           83.30           80.40             65.79                   61.06           
 1200    0.577           84.30           80.83             67.83                   61.91           
 1400    0.555           85.09           81.08             69.45                   62.42           
 1600    0.531           85.85           81.36             71.01                   62.96           
 1800    0.516           86.46           81.42             72.26                   63.08           


In [26]:
# Load the held-out test tensors directly onto the selected device.
# map_location remaps the stored tensors at load time, so files saved from
# CUDA memory still load when `device` falls back to the CPU.
source_test = torch.load(folder + "char/source_test.pt", map_location=device)
target_test = torch.load(folder + "char/target_test.pt", map_location=device)
# Bare `.shape` expressions in the middle of a cell are never displayed, so the
# shapes are printed explicitly.
print(source_test.shape, target_test.shape)

test_loader = tud.DataLoader(
    tud.TensorDataset(source_test, target_test), batch_size=500
)

# Switch to evaluation mode before scoring so dropout is disabled.
# NOTE(review): compute_accuracy may already do this internally; the call is
# idempotent, so it is safe either way.
net.eval()
net.compute_accuracy(test_loader)

(78.8983606557377, 61.39751087119508)

In [27]:
# Evaluation mode for inference.
net.eval()

# Three toy words, repeated three times.
text = ["hola", "adios", "juan"] * 3
# Encode each word as character ids wrapped in the ids 1 and 2
# (presumably <START> and <END> -- TODO confirm against chars_vocab).
text = [
    torch.tensor([1, *(chars_vocab["char2i"][ch] for ch in word), 2])
    for word in text
]
# pad_sequence stacks the sequences along dim 1 by default (time-major),
# padding the shorter ones with zeros.
text = nn.utils.rnn.pad_sequence(text, batch_first=False).to(device)
print(text.size())

inference = net.inference(text)
# Transpose so each row is one output sequence, then map ids back to
# characters, drop padding tokens, and show spaces as dashes.
[
    "".join(chars_vocab["i2char"][idx] for idx in row)
    .replace("<PAD>", "")
    .replace(" ", "-")
    for row in inference.T.tolist()
]

torch.Size([7, 9])


['<START>talles-de-la-construcción-de-las-desarrollo-de-la-sociedad-de-madrid<END>',
 '<START>además-de-la-respigunación-descubresores-descubres-descubradoras<END>',
 '<START>todo-de-la-respincultad-de-constitución-mundo-de-la-mundo<END>',
 '<START>talles-de-la-construcción-de-las-desarrollo-de-la-sociedad-de-madrid<END>',
 '<START>además-de-la-respigunación-descubresores-descubres-descubradoras<END>',
 '<START>todo-de-la-respincultad-de-constitución-mundo-de-la-mundo<END>',
 '<START>talles-de-la-construcción-de-las-desarrollo-de-la-sociedad-de-madrid<END>',
 '<START>además-de-la-respigunación-descubresores-descubres-descubradoras<END>',
 '<START>todo-de-la-respincultad-de-constitución-mundo-de-la-mundo<END>']

In [28]:
# First ten rows of the training source tensor, decoded back to text:
# ids -> characters, padding tokens removed, spaces shown as dashes.
text = source[:10]
[
    "".join(chars_vocab["i2char"][idx] for idx in row)
    .replace("<PAD>", "")
    .replace(" ", "-")
    for row in text.tolist()
]

['<START>in-ther-gatalalatical-of-here<END>',
 '<START>the-lasi-the-megico-pari-munbo<END>',
 '<START>isin-embargo-notehiseon-a-qestianol-importanty<END>',
 '<START>is-thi-racually-lone-o-auropea<END>',
 "<START>monten-imental-alphormas-unless-yo're-in-mexico<END>",
 '<START>certificalteteo-mas-in-casoliclosposea<END>',
 '<START>a-salways-look-as-a-le-liker<END>',
 '<START>inlamade-relus-casos-simply-imethaphoric-commant<END>',
 '<START>fonde-te-lenasionis-so-nispalinfrancia-oli-sefe<END>',
 '<START>instita-tonational-the-statistic-achegraphia-am-for-matica-in-ahe<END>']

In [29]:
# First ten rows of the training target tensor, decoded back to text:
# ids -> characters, padding tokens removed, spaces shown as dashes.
text = target[:10]
[
    "".join(chars_vocab["i2char"][idx] for idx in row)
    .replace("<PAD>", "")
    .replace(" ", "-")
    for row in text.tolist()
]

['<START>en-el-caso-de-la-psicología<END>',
 '<START>de-la-ciudad-de-méxico-para-el-mundo<END>',
 '<START>y-sin-embargo-no-deja-de-ser-una-cuestión-muy-importante<END>',
 '<START>el-derecho-de-la-unión-europea<END>',
 '<START>mantenimiento-de-alfombras-en-la-ciudad-de-méxico<END>',
 '<START>certificados-de-idiomas-en-caso-de-que-los-posea<END>',
 '<START>eso-es-lo-que-se-refleja<END>',
 '<START>en-la-mayoría-de-los-casos-se-emplea-metafóricamente<END>',
 '<START>fondo-de-las-naciones-unidas-para-la-infancia-unicef<END>',
 '<START>instituto-nacional-de-estadística-geografía-e-informática-inegi<END>']

In [30]:
# Evaluation mode for inference.
net.eval()

# The first ten training sources, transposed to the same time-major layout
# used for the toy example (sequence positions along dim 0).
text = source[:10].transpose(0, 1)
inference = net.inference(text)

# Transpose back so each row is one output sequence, then decode:
# ids -> characters, padding tokens removed, spaces shown as dashes.
[
    "".join(chars_vocab["i2char"][idx] for idx in row)
    .replace("<PAD>", "")
    .replace(" ", "-")
    for row in inference.T.tolist()
]

['<START>en-el-caso-de-la-calidad-de-calificación<END>',
 '<START>de-la-condición-de-méxico-de-méxico<END>',
 '<START>es-el-medio-asignar-el-estado-de-mismo-no-se-encuentra-en-este-estado<END>',
 '<START>es-la-producción-de-la-unión<END>',
 '<START>mundo-de-la-mesa-de-la-mesa-de-la-comunidad-mexicana<END>',
 '<START>características-de-los-consumos-de-persona<END>',
 '<START>es-la-sociedad-de-la-seguridad<END>',
 '<START>en-la-mesa-de-los-casos-de-la-comprema-de-manancia<END>',
 '<START>por-las-necesidades-de-la-participación-de-la-asista<END>',
 '<START>instituto-de-la-comisión-de-la-información-de-la-cambiante<END>']