In [1]:
import torch
from tqdm.notebook import tqdm
from pprint import pprint
import importlib
import sys
sys.path.append("../../")
import seq2seq
import re
from nltk.lm import Vocabulary

In [13]:
# source
#
# Read the source side of the corpus (German, judging by the printed sample),
# tokenise every line, build the source vocabulary and map each line through it.

with open("wmt14-de-en.src", encoding = "utf-8") as file:
    source = file.readlines()  # one string per line; the trailing "\n" is kept
print("source lines", len(source))

# A token is either a run of ASCII letters or any other single character
# (spaces, digits, punctuation and the newline each become their own token).
# NOTE(review): non-ASCII letters such as ö, ü, ß are emitted as one-character
# tokens and split the surrounding word ('gr', 'ö', 'ß', 'erem') — confirm this is intended.
source_tokens = [re.findall(r"[a-zA-Z]+|[^a-zA-Z]", text) for text in source]

# Flatten with a comprehension instead of sum(lists, start = []): sum() copies the
# growing accumulator on every addition, which is quadratic in the token count.
source_vocabulary = Vocabulary([token for line in source_tokens for token in line])
print("source_vocabulary", len(source_vocabulary))

# Map every tokenised line through the vocabulary; each line becomes a tuple of tokens.
source_tokens = [source_vocabulary.lookup(line) for line in source_tokens]

print(source_tokens[:5])

source lines 3003
source_vocabulary 13281
[('Schulen', ' ', 'werden', ' ', 'zu', ' ', 'gr', 'ö', 'ß', 'erem', ' ', 'Fokus', ' ', 'auf', ' ', 'Mathematik', ',', ' ', 'Rechtschreibung', ' ', 'und', ' ', 'Grammatik', ' ', 'angehalten', '\n'), ('In', ' ', 'Kursen', ' ', 'zu', ' ', 'englischer', ' ', 'Literatur', ' ', 'm', 'ü', 'ssen', ' ', 'Sch', 'ü', 'ler', ' ', 'k', 'ü', 'nftig', ' ', 'mindestens', ' ', 'ein', ' ', 'St', 'ü', 'ck', ' ', 'von', ' ', 'Shakespeare', ',', ' ', 'einen', ' ', 'Roman', ' ', 'des', ' ', '1', '9', '.', ' ', 'Jahrhunderts', ',', ' ', 'romantische', ' ', 'Lyrik', ' ', 'und', ' ', 'zeitgen', 'ö', 'ssische', ' ', 'britische', ' ', 'Romane', ' ', 'ab', ' ', '1', '9', '1', '4', ' ', 'behandeln', '.', '\n'), ('In', ' ', 'die', ' ', 'Pr', 'ü', 'fung', ' ', 'finden', ' ', 'auch', ' ', '„', 'ungesehene', ' ', 'Texte', '"', ' ', 'Eingang', ',', ' ', 'um', ' ', 'zu', ' ', 'breiterem', ' ', 'Lesen', ' ', 'anzuregen', '.', '\n'), ('Der', ' ', 'kombinierte', ' ', 'Kurs', ' ', '

In [3]:
# target
#
# Read the reference side of the corpus (English, judging by the printed sample),
# tokenise every line, build the target vocabulary and map each line through it.

with open("wmt14-de-en.ref", encoding = "utf-8") as file:
    target = file.readlines()  # one string per line; the trailing "\n" is kept
print("target lines", len(target))

# A token is either a run of ASCII letters or any other single character
# (spaces, digits, punctuation and the newline each become their own token).
target_tokens = [re.findall(r"[a-zA-Z]+|[^a-zA-Z]", text) for text in target]

# Flatten with a comprehension instead of sum(lists, start = []): sum() copies the
# growing accumulator on every addition, which is quadratic in the token count.
target_vocabulary = Vocabulary([token for line in target_tokens for token in line])
print("target_vocabulary", len(target_vocabulary))

# Map every tokenised line through the vocabulary; each line becomes a tuple of tokens.
target_tokens = [target_vocabulary.lookup(line) for line in target_tokens]
print(len(target_tokens))

print(target_tokens[:5])

target lines 3003
target_vocabulary 9943
3003
[('Schools', ' ', 'urged', ' ', 'to', ' ', 'focus', ' ', 'more', ' ', 'on', ' ', 'maths', ',', ' ', 'spelling', ' ', 'and', ' ', 'grammar', '\n'), ('English', ' ', 'literature', ' ', 'courses', ' ', 'will', ' ', 'require', ' ', 'pupils', ' ', 'to', ' ', 'study', ' ', 'at', ' ', 'least', ' ', 'one', ' ', 'Shakespeare', ' ', 'play', ',', ' ', 'a', ' ', '1', '9', 'th', ' ', 'century', ' ', 'novel', ',', ' ', 'Romantic', ' ', 'poetry', ' ', 'and', ' ', 'contemporary', ' ', 'British', ' ', 'fiction', ' ', 'from', ' ', '1', '9', '1', '4', ' ', 'onwards', '.', '\n'), ('The', ' ', 'exam', ' ', 'will', ' ', 'also', ' ', 'feature', ' ', '"', 'unseen', ' ', 'texts', '"', ' ', 'to', ' ', 'encourage', ' ', 'wider', ' ', 'reading', ';', '\n'), ('A', ' ', 'combined', ' ', 'English', ' ', 'literature', ' ', 'and', ' ', 'language', ' ', 'course', ' ', 'will', ' ', 'be', ' ', 'scrapped', '.', '\n'), ('From', ' ', '2', '0', '1', '5', ',', ' ', 'pupils', ' ', 

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

device

device(type='cuda')

In [5]:
# Re-import the local seq2seq module so edits made to it since the first import take effect.
importlib.reload(seq2seq)

# Build the Transformer over the two vocabularies and move it to the selected device.
# Alternative kept for reference: seq2seq.LSTM(source_vocabulary, target_vocabulary)
net = seq2seq.Transformer(
    source_vocabulary,
    target_vocabulary,
    max_sequence_length = 170,
    embedding_dimension = 128,
    feedforward_dimension = 512,
    attention_heads = 8,
)
net.to(device)

# Encode the tokenised corpus as tensors. The target side is encoded with net.out2i
# (presumably the output token -> index mapping — confirm in seq2seq).
source_tensor = net.text2tensor(source_tokens)
print("source", source_tensor.shape)

target_tensor = net.text2tensor(target_tokens, vocabulary = net.out2i)
print("target", target_tensor.shape)

# Train; the returned per-epoch log is kept in `performance`. "model.pt" and
# "model.arch" are the files the decoding cells further down reload.
performance = net.fit(
    source_tensor,
    target_tensor,
    save_path = "model.pt",
    batch_size = 10,
    epochs = 20,
)
net.save_architecture("model.arch")

Model: Transformer
Tokens in the input vocabulary: 13,281
Tokens in the output vocabulary: 9,943
Max sequence length: 170
Embedding dimension: 128
Feedforward dimension: 512
Encoder layers: 2
Decoder layers: 2
Attention heads: 8
Activation: relu
Dropout: 0.0
Trainable parameters: 5,204,442

source torch.Size([3003, 157])
target torch.Size([3003, 161])


  0%|          | 0/20 [00:00<?, ?it/s]

Training started
Epochs: 20
Learning rate: 0.0001
Weight decay: 0
Epoch | Train                 | Minutes
      | Loss     | Error Rate |
---------------------------------------


  0%|          | 0/301 [00:00<?, ?it/s]

    1 |   5.2672 |     88.074 |     0.7


  0%|          | 0/301 [00:00<?, ?it/s]

    2 |   3.8758 |     86.236 |     1.5


  0%|          | 0/301 [00:00<?, ?it/s]

    3 |   3.6879 |     86.162 |     2.2


  0%|          | 0/301 [00:00<?, ?it/s]

    4 |   3.6073 |     86.107 |     2.9


  0%|          | 0/301 [00:00<?, ?it/s]

    5 |   3.5505 |     86.043 |     3.6


  0%|          | 0/301 [00:00<?, ?it/s]

    6 |   3.5122 |     85.993 |     4.3


  0%|          | 0/301 [00:00<?, ?it/s]

    7 |   3.4703 |     85.939 |     5.1


  0%|          | 0/301 [00:00<?, ?it/s]

    8 |   3.4305 |     85.867 |     5.9


  0%|          | 0/301 [00:00<?, ?it/s]

    9 |   3.3868 |     85.813 |     6.6


  0%|          | 0/301 [00:00<?, ?it/s]

   10 |   3.3409 |     85.732 |     7.4


  0%|          | 0/301 [00:00<?, ?it/s]

   11 |   3.2968 |     85.657 |     8.2


  0%|          | 0/301 [00:00<?, ?it/s]

   12 |   3.2518 |     85.586 |     9.0


  0%|          | 0/301 [00:00<?, ?it/s]

   13 |   3.2001 |     85.491 |     9.8


  0%|          | 0/301 [00:00<?, ?it/s]

   14 |   3.1494 |     85.406 |    10.5


  0%|          | 0/301 [00:00<?, ?it/s]

   15 |   3.0971 |     85.309 |    11.3


  0%|          | 0/301 [00:00<?, ?it/s]

   16 |   3.0425 |     85.220 |    12.1


  0%|          | 0/301 [00:00<?, ?it/s]

   17 |   2.9886 |     85.113 |    12.9


  0%|          | 0/301 [00:00<?, ?it/s]

   18 |   2.9320 |     84.999 |    13.7


  0%|          | 0/301 [00:00<?, ?it/s]

   19 |   2.8740 |     84.889 |    14.4


  0%|          | 0/301 [00:00<?, ?it/s]

   20 |   2.8162 |     84.781 |    15.2


In [6]:
# Training log returned by net.fit: one row per epoch with the training loss, error rate
# and elapsed minutes, plus the optimiser settings and model description repeated on
# every row, which makes the table useful for reproducibility.

performance

Unnamed: 0,epoch,train_loss,train_error_rate,training_minutes,learning_rate,weight_decay,model,max_sequence_length,embedding_dimension,feedforward_dimension,encoder_layers,decoder_layers,attention_heads,activation,dropout,parameters
0,1,5.267163,88.074426,0.735744,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
1,2,3.87579,86.236472,1.45377,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
2,3,3.687935,86.161547,2.162948,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
3,4,3.607276,86.106602,2.875041,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
4,5,3.550507,86.043332,3.587308,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
5,6,3.512205,85.992549,4.321624,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
6,7,3.470341,85.939477,5.089064,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
7,8,3.43046,85.867466,5.86678,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
8,9,3.386772,85.813353,6.643522,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442
9,10,3.340865,85.731768,7.422444,0.0001,0,Transformer,170,128,512,2,2,8,relu,0.0,5204442


In [7]:
# The input used for testing: the first five rows of source_tensor, turned back into
# text so the decoded outputs below can be compared against them.
# net.i2in is presumably the input-side index -> token mapping — confirm in seq2seq.

net.tensor2text(source_tensor[:5], vocabulary = net.i2in)

['<START>Schulen werden zu größerem Fokus auf Mathematik, Rechtschreibung und Grammatik angehalten\n<END>',
 '<START>In Kursen zu englischer Literatur müssen Schüler künftig mindestens ein Stück von Shakespeare, einen Roman des 19. Jahrhunderts, romantische Lyrik und zeitgenössische britische Romane ab 1914 behandeln.\n<END>',
 '<START>In die Prüfung finden auch „ungesehene Texte" Eingang, um zu breiterem Lesen anzuregen.\n<END>',
 '<START>Der kombinierte Kurs aus englischer Literatur und Sprache wird abgeschafft.\n<END>',
 '<START>Ab 2015 müssen Schüler eine eigenständige GCSE-Prüfung für Sprache ablegen, wobei es starke Anreize dafür gibt, englische Literatur als separate Qualifikation zu wählen.\n<END>']

# predict

In [8]:
# Rebuild the model from the saved architecture file and restore the trained weights.
importlib.reload(seq2seq)  # pick up any edits to the local seq2seq module
net = seq2seq.load_architecture("model.arch")
# map_location remaps the stored tensors onto the device selected in this session, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
net.load_state_dict(torch.load("model.pt", map_location = device))
net.to(device)

# Run predict on the first five source rows with both progress bars switched off.
idx, log_probabilities = net.predict(source_tensor[:5], main_progress_bar = False, progress_bar = 0)

net.tensor2text(idx)

Model: Transformer
Tokens in the input vocabulary: 13,281
Tokens in the output vocabulary: 9,943
Max sequence length: 170
Embedding dimension: 128
Feedforward dimension: 512
Encoder layers: 2
Decoder layers: 2
Attention heads: 8
Activation: relu
Dropout: 0.0
Trainable parameters: 5,204,442



['<START>The \n<END>',
 '<START>In addition, 2, the 19770 and to the 170 and 1977777777777770 and to 10 a and the of 177.\n<END>',
 '<START>Mr said: " said to \n<END>',
 '<START>The firefighters is is and be be to the \n<END>',
 '<START>In 2001, the case of the town of the 10150012001501, to to a of to of to the of a <END>']

# greedy_search

In [9]:
# Rebuild the model from the saved architecture file and restore the trained weights.
importlib.reload(seq2seq)  # pick up any edits to the local seq2seq module
net = seq2seq.load_architecture("model.arch")
# map_location remaps the stored tensors onto the device selected in this session, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
net.load_state_dict(torch.load("model.pt", map_location = device))
net.to(device)

# Greedy decoding of the first five source rows; one log-probability is returned per row.
indexes, log_probabilities = net.greedy_search(source_tensor[:5], progress_bar = False)

print(log_probabilities)
net.tensor2text(indexes)

Model: Transformer
Tokens in the input vocabulary: 13,281
Tokens in the output vocabulary: 9,943
Max sequence length: 170
Embedding dimension: 128
Feedforward dimension: 512
Encoder layers: 2
Decoder layers: 2
Attention heads: 8
Activation: relu
Dropout: 0.0
Trainable parameters: 5,204,442

tensor([-29.2347, -34.9740, -28.5281, -32.5123, -28.4796], device='cuda:0')


['<START>The \n<END>',
 '<START>The 19-year-old of the 19 of a ',
 '<START>He said the "The to \n<END>',
 '<START>The firefighters is and be a team of the tournament ',
 '<START>The 2010, is in the country of the ']

# sample

In [10]:
# Rebuild the model from the saved architecture file and restore the trained weights.
importlib.reload(seq2seq)  # pick up any edits to the local seq2seq module
net = seq2seq.load_architecture("model.arch")
# map_location remaps the stored tensors onto the device selected in this session, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
net.load_state_dict(torch.load("model.pt", map_location = device))
net.to(device)

# Sampling is stochastic: seed PyTorch's generators so re-running this cell gives the
# same samples. Assumes net.sample draws from torch's global RNG — TODO confirm.
torch.manual_seed(0)
indexes, log_probabilities = net.sample(source_tensor[:5], progress_bar = False)

print(log_probabilities)
net.tensor2text(indexes)

Model: Transformer
Tokens in the input vocabulary: 13,281
Tokens in the output vocabulary: 9,943
Max sequence length: 170
Embedding dimension: 128
Feedforward dimension: 512
Encoder layers: 2
Decoder layers: 2
Attention heads: 8
Activation: relu
Dropout: 0.0
Trainable parameters: 5,204,442

tensor([-79.1392, -84.9124, -81.9127, -63.3095, -59.9535], device='cuda:0')


['<START>The Winter meet Boeing and drawn nurseries it its-Hare ',
 '<START>Two half groundwater9987-every-outlets is laws some ',
 '<START>Uli day "head she first with took The declare from',
 '<START>The friendly waits and released Treubel own week and deeper ',
 '<START>One Health said Asia only Thursday in been a Vegetarian ']

# beam_search

In [11]:
# Rebuild the model from the saved architecture file and restore the trained weights.
importlib.reload(seq2seq)  # pick up any edits to the local seq2seq module
net = seq2seq.load_architecture("model.arch")
# map_location remaps the stored tensors onto the device selected in this session, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
net.load_state_dict(torch.load("model.pt", map_location = device))
net.to(device)

# Beam search over the first five source rows; several candidates come back per row,
# each with its own log-probability.
indexes, log_probabilities = net.beam_search(source_tensor[:5], progress_bar = 0)

print(log_probabilities)
# Decode each row's group of candidates separately, giving one list of strings per input.
[net.tensor2text(t) for t in indexes]

Model: Transformer
Tokens in the input vocabulary: 13,281
Tokens in the output vocabulary: 9,943
Max sequence length: 170
Embedding dimension: 128
Feedforward dimension: 512
Encoder layers: 2
Decoder layers: 2
Attention heads: 8
Activation: relu
Dropout: 0.0
Trainable parameters: 5,204,442

tensor([[-29.0091, -29.7633, -29.9004, -30.0021, -30.2870],
        [-29.7181, -29.8997, -30.2440, -32.3113, -32.3713],
        [-23.7280, -24.6271, -25.4958, -25.5079, -25.5574],
        [-28.5049, -28.6433, -28.6560, -28.8708, -29.2933],
        [-30.2971, -30.6782, -31.4014, -31.4419, -31.4628]], device='cuda:0')


[['<START>The \n<END>',
  '<START>The \n<END>',
  '<START>The \n<END>',
  '<START>The \n<END>',
  '<START>The \n<END>'],
 ['<START>In addition, 2, the 19770 of the ',
  '<START>In addition, 2, the 19770 of a ',
  '<START>In addition, 2, the 19770 and to ',
  '<START>In addition, 2, the 1977 and to the',
  '<START>In addition, 2, the 1977 and to 1'],
 ['<START>Mr said: " said to \n<END>',
  '<START>Mr said: " said to \n<END>',
  '<START>Mr said: " said to \n<END>',
  '<START>Mr said: " said to \n<END>',
  '<START>Mr said: " said to \n<END>'],
 ['<START>The firefighters is is and be of the of and ',
  '<START>The firefighters is is and be be to the of ',
  '<START>The firefighters is is and be be to the \n<END>',
  '<START>The firefighters is is and be be to the and ',
  '<START>The firefighters is is and be be to the tournament '],
 ['<START>In 2001, the case of the town of ',
  '<START>In 2012, the was of the town of ',
  '<START>In 2012, the was of the of the ',
  '<START>In 2001, the