In [1]:
import os
import re
import sys
sys.path.append('D:/projects/Torch/Translator vol.2')

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sacrebleu.metrics import BLEU
from rouge import Rouge
from tabulate import tabulate
from xformers.factory.model_factory import xFormer, xFormerConfig, xFormerEncoderConfig, xFormerDecoderConfig, xFormerEncoderBlock, xFormerDecoderBlock
from tokenizers import Tokenizer
from scripts.factory import FactoryModel
from scripts.transformer import Transformer
from scripts.utils import *
from scripts.dataset import *

# Select the compute device: CUDA when torch reports it as available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression so the notebook displays the chosen device.
device


A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'
Triton is not available, some optimizations will not be enabled.
Triton is not available, FusedMLP will not be enabled.
Either FairScale or torch distributed is not available, MixtureOfExperts will not be exposed. Please install them if you would like to use MoE


device(type='cuda')

In [2]:
# Dimensions used to build the xFormers configuration below.
# EMB is the model width, SEQ the attention sequence length, VOCAB the vocabulary size.
# BATCH is defined here but not referenced anywhere in this cell.
EMB = 128
SEQ = 100
BATCH = 16
VOCAB = 64

my_config = [
    # A list of the encoder or decoder blocks which constitute the Transformer.
    # Note that a sequence of different encoder blocks can be used, same for decoders
    {
        "reversible": False,  # Optionally make these layers reversible, to save memory
        "block_type": "encoder",
        "num_layers": 6,  # Optional, this means that this config will repeat N times
        "dim_model": EMB,
        "residual_norm_style": "pre",  # Optional, pre/post
        "position_encoding_config": {
            "name": "vocab",  # whatever position encoding makes sense
            "seq_len": 1024,  # NOTE(review): encoder uses 1024 here while the decoder uses SEQ — confirm intended
            "vocab_size": VOCAB,
        },
        "multi_head_config": {
            "num_heads": 4,
            "residual_dropout": 0,
            "use_rotary_embeddings": True,
            "attention": {
                "name": "linformer",  # whatever attention mechanism
                "dropout": 0,
                "causal": False,
                "seq_len": SEQ,
            },
        },
        "feedforward_config": {
            "name": "MLP",
            "dropout": 0,
            "activation": "relu",
            "hidden_layer_multiplier": 4,
        },
    },
    {
        "reversible": False,  # Optionally make these layers reversible, to save memory
        "block_type": "decoder",
        "num_layers": 6,  # Optional, this means that this config will repeat N times
        "dim_model": EMB,
        "residual_norm_style": "pre",  # Optional, pre/post
        "position_encoding_config": {
            "name": "vocab",  # whatever position encoding makes sense
            "seq_len": SEQ,
            "vocab_size": VOCAB,
        },
        "multi_head_config_masked": {
            "num_heads": 4,
            "residual_dropout": 0,
            "use_rotary_embeddings": True,
            "attention": {
                "name": "linformer",  # whatever attention mechanism
                "dropout": 0,
                "causal": True,
                "seq_len": SEQ,
            },
        },
        "multi_head_config_cross": {
            "num_heads": 4,
            "residual_dropout": 0,
            "attention": {
                "name": "linformer",  # whatever attention mechanism
                "dropout": 0,
                "causal": True,  # NOTE(review): causal masking on cross-attention — confirm intended
                "seq_len": SEQ,
            },
        },
        "feedforward_config": {
            "name": "MLP",
            "dropout": 0,
            "activation": "relu",
            "hidden_layer_multiplier": 4,
        },
    },
]

# This part of xFormers is entirely type checked and needs a config object,
# could be changed in the future
config = xFormerConfig(my_config)
model = xFormer.from_config(config)
# Linear head mapping features of width EMB to VOCAB scores.
out = torch.nn.Linear(EMB, VOCAB)

# Cross-entropy criterion (default arguments).
loss_fn = nn.CrossEntropyLoss()

In [10]:
# Name fragments that mark a parameter as belonging to the "no decay" group.
no_decay_layers = ["norm", "bias"]
decay, no_decay = [], []

# Sort every parameter *name* of `model` into one of the two lists: a name
# containing any of the fragments above goes to `no_decay`, the rest to `decay`.
for name, param in model.named_parameters():
    (no_decay if any(nd in name for nd in no_decay_layers) else decay).append(name)

In [11]:
# Display how many parameter names landed in each group (decay, no_decay).
len(decay), len(no_decay)

(136, 156)

In [2]:
# # EMB = 128
# # SEQ = 100
# # BATCH = 16
# # VOCAB = 64
# x = torch.randint(0, VOCAB, (1, SEQ)).to(torch.int64)
# y = torch.randint(0, VOCAB, (1, SEQ + 1)).to(torch.int64)
# y_src, y_tgt = y[:, :-1], y[:, 1:]
# # o: (batch, seq, EMB)
# o = model(src=x, tgt=torch.tensor([[1]]).to(torch.int64))
# # o: (batch, seq, VOCAB)
# o = out(o)
# loss = loss_fn(o.permute(0,2,1), y_tgt)
# loss

In [3]:
# hyperparams = yaml_to_kwargs("../settings/classic_hyperparams.yaml")
# model = Translator(**hyperparams["model_hyperparams"], **hyperparams["model_training"])

# Load the serialized training and dev datasets from disk.
# NOTE(review): torch.load unpickles its input — only use it on trusted files.
tr = torch.load("../data/datasets/train-10000.pt")
dv = torch.load("../data/datasets/dev-10000.pt")
# Display the number of items in each split.
len(tr), len(dv)

(2771180, 593464)

In [20]:
def count_tokens(ds):
    """Count source and target tokens over a dataset of (src, tgt) pairs.

    Args:
        ds: Sized iterable yielding ``(src, tgt)`` pairs. Each element is
            compared against integer ids and reduced with a tensor sum, so
            the items are presumably tensors of token ids — TODO confirm.

    Returns:
        tuple: ``(src_tokens, tgt_tokens)`` where ``src_tokens`` counts source
        entries different from 0 and ``tgt_tokens`` counts target entries
        different from 0, 2 and 3 (presumably padding / special-token ids —
        verify against the tokenizer).
    """
    n_src, n_tgt = 0, 0
    progress = tqdm(ds, total=len(ds), desc="Counting tokens", ncols=100)
    for src, tgt in progress:
        # Source side: everything except id 0 is counted.
        n_src += (src != 0).sum().item()
        # Target side: exclude ids 0, 2 and 3.
        excluded = (tgt == 0) | (tgt == 2) | (tgt == 3)
        n_tgt += (~excluded).sum().item()
    return n_src, n_tgt
# Count tokens over the full training split `tr` (a full pass over the dataset).
src_tokens, tgt_tokens = count_tokens(tr)

Counting tokens: 100%|█████████████████████████████████| 2771180/2771180 [02:35<00:00, 17831.47it/s]


In [24]:
# Report the totals with a space as the thousands separator
# (format with "," grouping, then swap the commas for spaces).
print(f"Model trained on: {src_tokens:,} German tokens".replace(",", " "))
print(f"Model trained on: {tgt_tokens:,} English tokens".replace(",", " "))

Model trained on: 88 082 172 German tokens
Model trained on: 83 294 345 English tokens


In [2]:
# Load the German and English tokenizers from their JSON definitions.
de_tok = Tokenizer.from_file("../tokenizers/de_tokenizer_10000.json")
en_tok = Tokenizer.from_file("../tokenizers/en_tokenizer_10000.json")
# Read the German / English training text files (read_file is not defined in
# this notebook; presumably it returns one entry per line — TODO confirm).
de = read_file("../data/train/train.de")
en = read_file("../data/train/train.en")

In [6]:
def calculate_token_word_ratio(de, en, de_tok, en_tok):
    """Compute the average number of tokens per whitespace-separated word.

    The sentence lists are walked in parallel; for every pair the tokenizer
    output length and the word count (``str.split()``) are accumulated, and the
    ratio is taken over the corpus totals.

    Args:
        de: Iterable of source-language sentences (strings).
        en: Iterable of target-language sentences (strings), aligned with ``de``.
        de_tok: Tokenizer for ``de``; ``de_tok.encode(text).tokens`` must be a
            sequence of tokens.
        en_tok: Tokenizer for ``en`` with the same interface.

    Returns:
        tuple: ``(de_ratio, en_ratio)``, tokens divided by words for each side.
        A side with no words at all (e.g. empty input) yields ``0.0``.

    Note:
        Token counts are taken as-is from the tokenizer, so any special or
        padding tokens it emits are included — TODO confirm this is desired.
    """
    src_tokens = tgt_tokens = 0
    src_words = tgt_words = 0
    for src, tgt in zip(de, en):
        src_tokens += len(de_tok.encode(src).tokens)
        tgt_tokens += len(en_tok.encode(tgt).tokens)
        src_words += len(src.split())
        tgt_words += len(tgt.split())

    # Guard against division by zero on empty input.
    de_ratio = src_tokens / src_words if src_words else 0.0
    en_ratio = tgt_tokens / tgt_words if tgt_words else 0.0
    return de_ratio, en_ratio

# Unpacks a (German, English) pair, so the function must return exactly two values.
de_ratio, en_ratio = calculate_token_word_ratio(de, en, de_tok, en_tok)


In [20]:
# Filtering thresholds, all expressed in tokenizer tokens.
min_len = 10            # pairs with a side of <= min_len tokens are skipped
max_len = 100           # pairs with a side of >= max_len tokens are skipped
unk_percentage = 0.15   # maximum tolerated share of "[UNK]" tokens per side
length_tolerance = 2    # maximum tolerated longer/shorter length ratio

# Running totals over the pairs that pass every filter.
de_hash_count = en_hash_count = 0   # tokens containing "##"
de_count = en_count = 0             # all tokens

for de_line, en_line in tqdm(zip(de, en), total=len(de), desc="Tokenizing", ncols=100):
    de_tokens = de_tok.encode(de_line).tokens
    en_tokens = en_tok.encode(en_line).tokens
    n_de, n_en = len(de_tokens), len(en_tokens)

    # Keep only pairs whose lengths are strictly inside (min_len, max_len).
    if not (min_len < n_de < max_len and min_len < n_en < max_len):
        continue

    # Drop pairs where either side has too many unknown tokens.
    de_unk_share = de_tokens.count("[UNK]") / n_de
    en_unk_share = en_tokens.count("[UNK]") / n_en
    if de_unk_share > unk_percentage or en_unk_share > unk_percentage:
        continue

    # Drop pairs whose lengths differ by more than the tolerated factor.
    longer, shorter = max(n_de, n_en), min(n_de, n_en)
    if longer / shorter > length_tolerance:
        continue

    # Strip the first and last English token before counting
    # (presumably start/end markers — TODO confirm).
    en_tokens = en_tokens[1:-1]

    de_hash_count += sum("##" in token for token in de_tokens)
    en_hash_count += sum("##" in token for token in en_tokens)
    de_count += n_de
    en_count += len(en_tokens)

Tokenizing: 100%|███████████████████████████████████████| 3128188/3128188 [06:56<00:00, 7514.61it/s]


In [26]:
# words / tokens: share of tokens that do NOT contain "##", per language.
print(f"German word:token ratio {(de_count - de_hash_count) / de_count:.2f}")
print(f"English word:token ratio {(en_count - en_hash_count) / en_count:.2f}")

German word:token ratio 0.71
English word:token ratio 0.90


In [2]:
# Load both tokenizers and the two trained checkpoints used for inference.
de_tok = Tokenizer.from_file("../tokenizers/de_tokenizer_10000.json")
en_tok = Tokenizer.from_file("../tokenizers/en_tokenizer_10000.json")
models = "../inference_models/"
# Checkpoint restored through FactoryModel.load_from_checkpoint.
factory_checkpoint = "epoch=04-train_loss=0.6199-val_loss=0.5762_fct_44m.ckpt"
factory_model = FactoryModel.load_from_checkpoint(models + factory_checkpoint)
factory_model.eval();  # trailing ';' suppresses the repr in the cell output

# Second checkpoint, restored via load_model together with its YAML settings.
# NOTE: this rebinds the notebook-level names `model` and `params`.
checkpoint = "epoch=04-train_loss=0.6483-val_loss=0.5983_trs_18m.ckpt"
params = "../settings/transformer_18M.yaml"
model = load_model(models + checkpoint, params)
model.eval();

  rank_zero_warn(


In [14]:
@torch.no_grad()
def factory_translate(sentence, model, de_tok, en_tok, maxlen=100, device="cpu"):
    """Greedily translate ``sentence`` with a model called as ``model(src=..., tgt=...)``.

    The source is encoded without special tokens and padded/truncated to exactly
    ``maxlen`` ids. The target is a fixed ``(1, maxlen)`` buffer of zeros that
    starts with ``[SOS]`` and is filled one position per step with the
    highest-scoring token, stopping at ``[EOS]`` or when the buffer is full.

    Args:
        sentence (str): Source sentence.
        model: Callable returning scores indexable as ``logits[0, position]``;
            must also provide ``eval()``.
        de_tok: Source tokenizer. NOTE: its padding and truncation settings are
            changed in place and stay changed after the call.
        en_tok: Target tokenizer providing ``token_to_id`` and ``decode``.
        maxlen (int, optional): Length of source and target buffers. Defaults to 100.
        device (str, optional): Device for the created tensors; the model itself
            is not moved. Defaults to "cpu".

    Returns:
        str: Decoding of the generated prefix (from ``[SOS]`` up to and including
        the last written token).
    """
    model.eval()
    # Fix the source length to maxlen: pad short inputs and — consistently with
    # `translate` — truncate long ones instead of feeding an over-long sequence.
    de_tok.enable_padding(length=maxlen)
    de_tok.enable_truncation(max_length=maxlen)
    src_ids = de_tok.encode(sentence, add_special_tokens=False).ids
    x = torch.tensor(src_ids, device=device).unsqueeze(0)

    # Look the special ids up once rather than on every decoding step.
    sos_id = en_tok.token_to_id("[SOS]")
    eos_id = en_tok.token_to_id("[EOS]")

    # Unwritten target positions stay 0 (assumed to be the padding id — TODO confirm).
    y = torch.zeros((1, maxlen), dtype=torch.long, device=device)
    y[0, 0] = sos_id
    generated = 1  # number of target positions written so far
    for i in range(1, maxlen):
        logits = model(src=x, tgt=y)
        # The prediction for position i is read from the output at position i - 1.
        token = logits[0, i - 1].topk(1)[1].item()
        y[0, i] = token
        generated = i + 1
        if token == eos_id:
            break
    # Decode only what was generated, so the result does not depend on the
    # tokenizer silently dropping the zero filler.
    return en_tok.decode(y[0, :generated].tolist())

@torch.no_grad()
def translate(sentence, model, de_tok, en_tok, maxlen=100, return_probs=False, device="cpu"):
    """
    Greedy, token-by-token translation of one sentence.

    The source tokenizer is configured (in place) to pad and truncate to
    ``maxlen``; the sentence is encoded without special tokens. Decoding starts
    from ``[SOS]`` and repeatedly appends the most probable next token until
    ``[EOS]`` is produced or the target holds ``maxlen`` ids.

    Args:
        sentence (str): Sentence in the source language.
        model (torch.nn.Module): Model called as ``model(x, y, tgt_mask=...)``
            that also provides ``_generate_square_subsequent_mask``.
        de_tok (Tokenizer): Source-language tokenizer (its padding/truncation
            settings are modified by this call).
        en_tok (Tokenizer): Target-language tokenizer.
        maxlen (int, optional): Maximum number of target ids, ``[SOS]`` included.
            Defaults to 100.
        return_probs (bool, optional): Pass ``True`` to also get the per-step
            probabilities. Defaults to False.
        device (str, optional): Device for the tensors created here. Defaults to "cpu".

    Returns:
        str: The decoded translation, when ``return_probs`` is not ``True``.
        tuple[str, list]: ``(translation, probs)`` when ``return_probs`` is
        ``True``; ``probs`` has one probability per generated token.
    """
    model.eval()
    de_tok.enable_padding(length=maxlen)
    de_tok.enable_truncation(max_length=maxlen)

    # The source is passed as a 1-D id tensor (no batch axis); the target is (1, t).
    source_ids = de_tok.encode(sentence, add_special_tokens=False).ids
    x = torch.tensor(source_ids, device=device)
    y = torch.tensor([[en_tok.token_to_id("[SOS]")]], dtype=torch.long, device=device)

    probs = []
    # The target starts with one id and gains one per step, so at most
    # maxlen - 1 steps are possible.
    for _ in range(maxlen - 1):
        causal_mask = model._generate_square_subsequent_mask(y.size(1)).to(device)
        distribution = F.softmax(model(x, y, tgt_mask=causal_mask), dim=-1)
        best_prob, best_id = distribution.topk(1)
        # Only the entry for the last position matters for the next token.
        probs.append(best_prob[-1].item())
        next_id = best_id[-1].item()
        y = torch.cat((y, torch.tensor([[next_id]], device=device)), dim=1)
        if next_id == en_tok.token_to_id("[EOS]"):
            break

    translated = en_tok.decode(y.tolist()[0])
    return (translated, probs) if return_probs is True else translated

# Quick smoke test of `translate` with default arguments (returns a plain string).
# NOTE(review): the recorded output of this cell is a (text, probabilities) tuple,
# which only happens with return_probs=True — the saved output looks stale.
translate("Ich bin ein Berliner.", model, de_tok, en_tok)

('i am a berliner.',
 [0.9458821415901184,
  0.8245789408683777,
  0.9143891334533691,
  0.9696773290634155,
  0.972100019454956,
  0.9485659599304199,
  0.9833275675773621])

In [3]:
# Individual German sample sentences for manual experiments in the cells below.
# Several of them contain merged or truncated words (e.g. "seidenn", "verloreund",
# "bewußdaß"); they are reproduced as-is.
s1 = "natürlich kann das kein argument seidenn wenn etwas passiersind große flächen verseucht ."
s2 = "2 und zur gleichen zeit hast du auch deine gabe verloreund dein a verstand hat sich verfinstert ."
s3 = "ich bitte die kommission um ihre stellungnahme zu den änderungsanträgen ."
s4 = "um welchen artikel der geschäftsordnung geht es ?"
s5 = "ich möchte sie bitten sich zu setzen ."
s6 = "zu einer kühnen konstruktion aus stahl und glas kontrastieren warme holzpanelen ."
s7 = "wir zielen mit unserer auffassung zur ständigen verbesserung durch anwendung von innovationen und technologien in unser system das beste für unsere kunden auso daß wir durch deckung aller technischen und technologischen bedürfnisse eine starke corporate identity beweisen ."
s8 = "viele fragen sind wirklich nervig aber keine ist besonders dumm ."
s9 = "Natürlich ist uns bewußdaß im Rahmen der Typprüfung 70/156, dass bisher keine Maßstäbe für die Erfassung der CO2-Emission für leichte Nutzfahrzeuge vorhanden sind."
s10 = "auch das liegt leider auf einer linie mit den abstimmungen der letzten wocheund wir bedauern das außerordentlich ."
s11 = "Bitte sprechen Sie während der Fahrt nicht mit dem Fahrer."
s12 = "Ich habe die Ehre, Sie zu einem Glas Wein einzuladen."
s13 = "Ich bin nicht sicher, ob ich das richtig verstanden habe."
s14 = "Die Sonne scheint und Kinder spielen draußen."
s15 = "im september wird die kommission ihre vorschläge für indikatoren vorlegeanhand derer wir ermitteln könnewie gut wir bei der erfüllung der in lissabon gesetzten ziele vorangekommen sind ."
s16 = "die idedie hinter der entwicklung des aulochrome am 10.09.2001 und dessen fertigstellung im jahre 2002 steckisdass ein saxophon nur einen ton gleichzeitig spielen kann ."
# Ten German evaluation sentences; `translations` holds the English reference
# for each one at the same index.
sentences = [
    "Obwohl es geregnet hat, sind wir trotzdem zum Strand gegangen.",
    "Die Veranstaltung war ein großer Erfolg, dank der hervorragenden Organisation.",
    "Ich habe meine Deutschkenntnisse verbessert, indem ich regelmäßig Bücher gelesen habe.",
    "Es ist wichtig, eine gesunde Work-Life-Balance zu haben, um Stress zu reduzieren.",
    "Mein Bruder hat sich für ein Stipendium beworben, damit er sein Studium finanzieren kann.",
    "Je mehr ich lerne, desto selbstbewusster werde ich in der Sprache.",
    "Der Film war so fesselnd, dass ich bis spät in die Nacht aufgeblieben bin, um ihn zu Ende zu sehen.",
    "Das Buch, das ich gerade lese, handelt von einer abenteuerlichen Reise um die Welt.",
    "Nachdem wir das Konzert besucht hatten, trafen wir uns mit Freunden zum Abendessen.",
    "Ich würde gerne eine Fremdsprache fließend sprechen können, um meine beruflichen Möglichkeiten zu erweitern."
]

# English reference translations, index-aligned with `sentences`.
translations = [
    "Although it was raining, we still went to the beach.",
    "The event was a great success, thanks to the excellent organization.",
    "I improved my German skills by regularly reading books.",
    "It is important to have a healthy work-life balance in order to reduce stress.",
    "My brother applied for a scholarship in order to finance his studies.",
    "The more I learn, the more confident I become in the language.",
    "The movie was so captivating that I stayed up late into the night to finish watching it.",
    "The book I'm currently reading is about an adventurous journey around the world.",
    "After attending the concert, we met up with friends for dinner.",
    "I would love to be able to speak a foreign language fluently in order to broaden my career opportunities."
]

In [37]:
# Single-sentence check of the factory model (input written without the comma after "hatten").
factory_translate("Nachdem wir das Konzert besucht hatten trafen wir uns mit Freunden zum Abendessen.", factory_model, de_tok, en_tok, maxlen=100)

'after visiting the concert we met friends with dinner.'

In [30]:
# For every evaluation sentence print three lines: the German source,
# the factory model's output, and the English reference followed by a blank line.
for s, t in zip(sentences, translations):
    print(s)
    print(factory_translate(s, factory_model, de_tok, en_tok, maxlen=100))
    print(t, "\n")

Obwohl es geregnet hat, sind wir trotzdem zum Strand gegangen.
although it has been cleaned, we have gone to the beach.
Although it was raining, we still went to the beach. 

Die Veranstaltung war ein großer Erfolg, dank der hervorragenden Organisation.
the event was a great success, thanks to the excellent organization.
The event was a great success, thanks to the excellent organization. 

Ich habe meine Deutschkenntnisse verbessert, indem ich regelmäßig Bücher gelesen habe.
i have improved my german knowledge, and i have read books regularly.
I improved my German skills by regularly reading books. 

Es ist wichtig, eine gesunde Work-Life-Balance zu haben, um Stress zu reduzieren.
it is important to have a healthy work life balance, in order to reduce stress.
It is important to have a healthy work-life balance in order to reduce stress. 

Mein Bruder hat sich für ein Stipendium beworben, damit er sein Studium finanzieren kann.
my brother has been standing up for a scholarship, which w

In [109]:
# tokens = [token for token in de_tok.encode(s1, add_special_tokens=False).tokens if token != "[PAD]"]
# merged = [tokens[0]]
# for token in tokens:
#     if "##" in token:
#         merged[-1] += token[2:]
#     else:
#         merged.append(token)
# merged

In [25]:
# Encode three sample sentences to id lists (no special tokens) and pack them
# into a single PackedSequence; enforce_sorted=False lets the inputs come in any
# length order. `pack_sequence` is not imported explicitly in this notebook —
# presumably it arrives through one of the star imports (TODO confirm).
tokens = [de_tok.encode(s, add_special_tokens=False).ids for s in [s1, s2, s3]]
packed = pack_sequence([torch.tensor(t) for t in tokens], enforce_sorted=False)
packed

PackedSequence(data=tensor([  14, 1283,  210,  105,  432, 1242,  274,  157,   96, 2957, 1006,  434,
         568, 8079,  250, 9952,  630,  439,  913, 1216, 4849,  228,  533,  121,
        3579, 1254,  144, 2801, 1911, 6977,   58, 7527,   10,  964,  218, 2107,
        1386,  177, 9705, 6954,  670,   26, 8277, 1226,   10,  167,  312,  195,
        4730, 3244,  714,   10]), batch_sizes=tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1]), sorted_indices=tensor([1, 0, 2]), unsorted_indices=tensor([1, 0, 2]))

In [29]:
# Unpack back to a batch-first padded tensor (fill value 0) plus the per-sequence lengths.
pad_packed_sequence(packed, batch_first=True, padding_value=0)

(tensor([[1283,  432,  157, 1006, 8079,  630, 1216,  533, 1254, 1911, 7527,  218,
          1386, 9705,  670, 8277,   10,    0,    0,    0,    0,    0,    0,    0],
         [  14,  105,  274, 2957,  568, 9952,  913,  228, 3579, 2801,   58,  964,
          2107,  177, 6954,   26, 1226,  167,  312,  195, 4730, 3244,  714,   10],
         [ 210, 1242,   96,  434,  250,  439, 4849,  121,  144, 6977,   10,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]]),
 tensor([17, 24, 11]))

In [10]:
# Translate sample s1 and capitalize the resulting string.
# NOTE(review): the recorded output of this cell is
# "IndexError: tuple index out of range"; the cause is not visible in this notebook.
translate(s1, model, de_tok, en_tok).capitalize()

IndexError: tuple index out of range

In [9]:
# Run folder / checkpoint names; only the commented-out loader below uses them.
folder = "../runs/vs-10000__act-gelu__n_layers-6__dp-0.1/betas-[0.9, 0.999]__wd-0.1__bs-100__max_ep-5__acc-1/"
checkpoint = "epoch=04-train_loss=0.6483-val_loss=0.5983.ckpt"

# Build a fresh FactoryModel from the YAML hyper-parameters.
params = yaml_to_kwargs("../settings/hyperparams.yaml")
# model = Transformer.load_from_checkpoint(folder + checkpoint, **params["model_hyperparams"], **params["model_training"])
# model = Transformer(**params["model_hyperparams"], **params["model_training"])
model = FactoryModel(**params["model_hyperparams"], **params["model_training"])

# `params` is rebound here: from now on it holds the number of trainable parameters.
params = sum(map(torch.numel, filter(lambda p: p.requires_grad, model.parameters())))
print(f"Model params: {params:,}".replace(",", " "))

Model params: 103 740 416
