## Greedy generation with predicted vectors

### Load in the models

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import torch


# Directory that holds the checkpoints, and the names of the two models
models_path = "models/"
model_name_1 = "opus-mt-NORTH_EU-NORTH_EU"
model_name_2 = "opus-mt-SCANDINAVIA-SCANDINAVIA"

# Switch autograd off globally for everything that follows
torch.set_grad_enabled(False)

# Model 1: tokenizer and translation model (configured to output hidden states)
tokenizer_1 = MarianTokenizer.from_pretrained(f"{models_path}{model_name_1}")
model_1 = MarianMTModel.from_pretrained(
    f"{models_path}{model_name_1}",
    output_hidden_states=True,
)

# Model 2: tokenizer and translation model (configured to output hidden states)
tokenizer_2 = MarianTokenizer.from_pretrained(f"{models_path}{model_name_2}")
model_2 = MarianMTModel.from_pretrained(
    f"{models_path}{model_name_2}",
    output_hidden_states=True,
)

### Use pretrained MLP regressor

In [None]:
import numpy as np
from sklearn.neural_network import MLPRegressor
import jdata as jd
import joblib

# Load the stored MLP parameters and configuration from the JSON file
mlp_conf = jd.load("models/MLP_regressor_8192_sv_500k_opus-mt-NORTH_EU-NORTH_EU_to_opus-mt-SCANDINAVIA-SCANDINAVIA.json")

# Initialize the MLP regressor without fitting it.
# Note the trailing comma: `hidden_layer_sizes` is a sequence with one entry
# per hidden layer, so a single layer is written as a one-element tuple.
mlp_regressor = MLPRegressor(random_state=1, hidden_layer_sizes=(8192,))

# Restore the stored parameters by assigning the fitted attributes directly
mlp_regressor.intercepts_ = mlp_conf["intercepts_"]
mlp_regressor.coefs_ = mlp_conf["coefs_"]
mlp_regressor.n_layers_ = mlp_conf["n_layers_"]
mlp_regressor.out_activation_ = mlp_conf["out_activation_"]

### Generate

In [None]:
def get_encoded_str(tokenizer, model, input_str):
    """Encode ``input_str`` with the encoder of ``model``.

    Only the encoder is executed. The decoder is not needed to obtain the
    encoder output, so no dummy decoder input is built and no decoder
    forward pass is run.

    Args:
        tokenizer: Tokenizer matching ``model``; called as
            ``tokenizer(input_str, return_tensors="pt")`` and expected to
            expose the token ids as ``.input_ids``.
        model: Encoder-decoder model that provides ``get_encoder()``.
        input_str: Text to encode.

    Returns:
        The ``last_hidden_state`` of the encoder output for ``input_str``.
    """
    # Token ids of the input text
    input_ids = tokenizer(input_str, return_tensors="pt").input_ids

    # Run the encoder on its own and return its final hidden states
    encoder = model.get_encoder()
    return encoder(input_ids=input_ids, return_dict=True).last_hidden_state

def greedy_generate(model, encoded_sequence, tokenizer):
    """Greedily decode a token sequence from already-encoded input.

    The decoder of ``model`` is run repeatedly on ``encoded_sequence``; at
    every step the token with the highest logit is appended. Decoding stops
    when the end-of-sequence token is produced or after
    ``model.config.max_length`` steps. The whole decoder prefix is re-run at
    every step (no key/value cache is used).

    Args:
        model: Encoder-decoder model. Its ``config`` supplies
            ``decoder_start_token_id``, ``eos_token_id`` and ``max_length``.
        encoded_sequence: Encoder hidden states handed to the model as
            ``encoder_outputs``. Indexing below assumes a batch of one
            sequence.
        tokenizer: Tokenizer of ``model``; only consulted for
            ``eos_token_id`` when the model config does not define one.

    Returns:
        1-D tensor of generated token ids, starting with the decoder start
        token. If the step limit is hit, a notice is printed and the
        unfinished sequence is returned.
    """
    config = model.config

    # Take the stop token from the model being decoded rather than assuming 0
    eos_token_id = config.eos_token_id
    if eos_token_id is None:
        eos_token_id = tokenizer.eos_token_id

    # The decoder is primed with the model's configured start token
    decoder_input_ids = torch.tensor(
        [[config.decoder_start_token_id]],
        dtype=torch.long,
        device=encoded_sequence.device,
    )

    # The step limit comes from the model passed in, not from a global model
    for _ in range(config.max_length):
        lm_logits = model(
            encoder_outputs=(encoded_sequence,),
            decoder_input_ids=decoder_input_ids,
            return_dict=True,
        ).logits

        # Greedy choice: most likely token at the last position
        next_token = torch.argmax(lm_logits[:, -1:], dim=-1)
        decoder_input_ids = torch.cat([decoder_input_ids, next_token], dim=-1)

        if next_token[0, 0].item() == eos_token_id:
            return decoder_input_ids[0]

    print("Generation did not finish")
    return decoder_input_ids[0]
    
def generate_with_different_encoding(
    model_1, tokenizer_1,
    model_2, tokenizer_2,
    predictor,
    input_str):
    """
    Translate ``input_str`` by feeding model 2's decoder with encoder states
    that were predicted from model 1's encoder states.

    The string is encoded by both models. The inner positions of model 1's
    encoding (everything except the first and the last position) are mapped
    by ``predictor`` into embeddings for model 2.

    The first and last positions (the target language token and the ending
    token) are not predicted; they are copied from model 2's own encoding.

    Returns the token ids produced by ``greedy_generate`` for model 2.
    """
    # Encodings of the same input from both models
    source_states = get_encoded_str(tokenizer_1, model_1, input_str)
    target_states = get_encoded_str(tokenizer_2, model_2, input_str)
    hidden_size = target_states.shape[-1]

    # Map the inner positions of the model 1 encoding with the predictor
    inner_source = source_states.detach().numpy()[0][1:-1]
    mapped = predictor.predict(inner_source)

    # Back to a tensor with a leading batch dimension of one
    mapped = torch.Tensor(mapped.reshape(1, -1, hidden_size))

    # Model 2's own states for the target language token and the ending token
    lang_token_state = target_states[:1, :1, :]
    ending_state = target_states[:1, -1:, :]

    # language token + predicted states + ending token, along the sequence axis
    stitched = torch.cat([lang_token_state, mapped, ending_state], dim=1)

    # Decode with model 2
    return greedy_generate(model_2, stitched, tokenizer_2)

In [None]:
input_str = ">>da<< Vad gör du?"

# Reference output: each model translating the sentence on its own
translated_model_1 = model_1.generate(
    **tokenizer_1(input_str, return_tensors="pt", padding=True),
    max_new_tokens=512,
)[0]
translated_model_2 = model_2.generate(
    **tokenizer_2(input_str, return_tensors="pt", padding=True),
    max_new_tokens=512,
)[0]

# Model 1 encoding mapped by the regressor, then decoded by model 2
generated = generate_with_different_encoding(
    model_1, tokenizer_1,
    model_2, tokenizer_2,
    mlp_regressor,
    input_str,
)

# Show the three results side by side
print(f"Translated with model 1:           {tokenizer_1.decode(translated_model_1, skip_special_tokens=True)}")
print(f"Translated with model 2:           {tokenizer_2.decode(translated_model_2, skip_special_tokens=True)}")
print(f"Generated with predicted embeding: {tokenizer_2.decode(generated, skip_special_tokens=True)}")

### Measure the performance of generation

In [None]:
from sacrebleu.metrics import BLEU

from src import DataLoader

def _translate_corpus(model, tokenizer, sentences):
    """Translate each sentence with ``model.generate`` and decode it to text."""
    translations = []
    for sentence in sentences:
        batch = tokenizer(sentence, return_tensors="pt", padding=True)
        output_ids = model.generate(**batch, max_new_tokens=512)[0]
        translations.append(tokenizer.decode(output_ids, skip_special_tokens=True))
    return translations


def measure_translation_quality(lang_token, lang_1_path, lang_2_path, n_examples, rows=1012):
    """Print BLEU scores for model 1, model 2 and the predicted-embedding path.

    Reads a parallel corpus, prepends ``lang_token`` to every source
    sentence, translates the sources in three ways (model 1, model 2, and
    model 1 encodings mapped by the regressor and decoded by model 2) and
    prints a corpus-level BLEU score for each, followed by example
    translations.

    Uses the module-level ``model_1``, ``tokenizer_1``, ``model_2``,
    ``tokenizer_2``, ``mlp_regressor``, ``model_name_1`` and ``model_name_2``.

    Args:
        lang_token: String prepended as-is to every source sentence
            (no separator is inserted).
        lang_1_path: Path of the source-language file.
        lang_2_path: Path of the reference (target-language) file.
        n_examples: Number of example translations to print; values larger
            than the corpus are clamped, values below zero print nothing.
        rows: Value passed as ``rows`` to ``DataLoader.read_parallel_corpus``.
    """
    lang_1, lang_2 = DataLoader.read_parallel_corpus(
        lang_1_path,
        lang_2_path,
        rows=rows)

    # Add target language token to input string
    lang_1 = [lang_token + sentence for sentence in lang_1]

    translated_model_1 = _translate_corpus(model_1, tokenizer_1, lang_1)
    translated_model_2 = _translate_corpus(model_2, tokenizer_2, lang_1)

    translated_predicted_embeddings = [
        tokenizer_2.decode(
            generate_with_different_encoding(
                model_1, tokenizer_1, model_2, tokenizer_2, mlp_regressor, sentence),
            skip_special_tokens=True)
        for sentence in lang_1]

    bleu = BLEU()

    print(f"Model 1 {model_name_1}: {bleu.corpus_score(translated_model_1, [lang_2])}")
    print(f"Model 2 {model_name_2}: {bleu.corpus_score(translated_model_2, [lang_2])}")
    print(f"Model 1 embedings transformed to model 2: {bleu.corpus_score(translated_predicted_embeddings, [lang_2])}")

    # Never index past the sentences that were actually read
    n_shown = min(max(n_examples, 0), len(lang_1), len(lang_2))
    for i in range(n_shown):
        print()
        print(f"\t Input:     {lang_1[i]}")
        print(f"\t Real:      {lang_2[i]}")
        print(f"\t Model 1:   {translated_model_1[i]}")
        print(f"\t Model 2:   {translated_model_2[i]}")
        print(f"\t Predicted: {translated_predicted_embeddings[i]}")

In [None]:
print("Translating from Swedish to Danish on the OpenSubtitles dataset.")

# Swedish source, Danish reference; no example translations are printed
measure_translation_quality(
    lang_token=">>da<< ",
    lang_1_path="data/da-sv.txt/OpenSubtitles.da-sv.sv",
    lang_2_path="data/da-sv.txt/OpenSubtitles.da-sv.da",
    n_examples=0,
)

In [None]:
print("Translating from Swedish to Danish on the FLORES-200 dataset.")

# Swedish source, Danish reference; no example translations are printed.
# NOTE(review): the language token has no trailing space here, so it is
# prepended directly to each sentence — confirm this is intended.
measure_translation_quality(
    lang_token=">>da<<",
    lang_1_path="data/flores200_dataset/devtest/swe_Latn.devtest",
    lang_2_path="data/flores200_dataset/devtest/dan_Latn.devtest",
    n_examples=0,
)

In [None]:
print("Translating from Danish to Swedish on the OpenSubtitles dataset.")

# Danish source, Swedish reference; no example translations are printed
measure_translation_quality(
    lang_token=">>sv<< ",
    lang_1_path="data/da-sv.txt/OpenSubtitles.da-sv.da",
    lang_2_path="data/da-sv.txt/OpenSubtitles.da-sv.sv",
    n_examples=0,
)

In [None]:
print("Translating from Danish to Swedish on the FLORES-200 dataset.")

# Danish source, Swedish reference; no example translations are printed.
# NOTE(review): the language token has no trailing space here, so it is
# prepended directly to each sentence — confirm this is intended.
measure_translation_quality(
    lang_token=">>sv<<",
    lang_1_path="data/flores200_dataset/devtest/dan_Latn.devtest",
    lang_2_path="data/flores200_dataset/devtest/swe_Latn.devtest",
    n_examples=0,
)