In [1]:
import numpy as np
from tqdm import tqdm

from src import GenerateVectors

# Forwarded as the second positional argument of
# GenerateVectors.generate_or_load by the wrapper defined in the next cell.
# NOTE(review): presumably the number of sentences read per devtest file - confirm.
n_sents = 1012

# Forwarded to GenerateVectors.generate_or_load.
# NOTE(review): presumably the directory where generated embeddings are cached - confirm.
outdir = "generated/"

# Paths of the two models whose token embeddings are compared in this notebook.
# Later cells feed the first returned embedding array to the regressor and
# score its predictions against the second returned array.
model_name_1 = "models/opus-mt-NORTH_EU-NORTH_EU"
model_name_2 = "models/opus-mt-SCANDINAVIA-SCANDINAVIA"

### Load or generate embeddings for FLORES-200 devtest dataset

In [2]:
def generate_or_load(test_dataset, save_prefix):
    """Call ``GenerateVectors.generate_or_load`` with the notebook-wide settings.

    The module-level ``n_sents``, ``model_name_1``, ``model_name_2`` and
    ``outdir`` are read at call time and placed between the two arguments
    supplied by the caller.

    Args:
        test_dataset: Forwarded unchanged as the first positional argument.
            Callers in this notebook pass a one-element list of file paths.
        save_prefix: Forwarded unchanged as the last positional argument.

    Returns:
        Whatever ``GenerateVectors.generate_or_load`` returns. Callers in this
        notebook unpack it into two values.
    """
    shared_settings = (n_sents, model_name_1, model_name_2, outdir)
    return GenerateVectors.generate_or_load(test_dataset, *shared_settings, save_prefix)

# One call per FLORES-200 devtest file. Each call yields a pair of values:
# the `_1` name receives the first and the `_2` name the second.

# Swedish
sv_token_embedings_1, sv_token_embedings_2 = generate_or_load(
    ["data/flores200_dataset/devtest/swe_Latn.devtest"],
    "flores_sv",
)

# Danish
da_token_embedings_1, da_token_embedings_2 = generate_or_load(
    ["data/flores200_dataset/devtest/dan_Latn.devtest"],
    "flores_da",
)

# Norwegian Bokmål
nb_token_embedings_1, nb_token_embedings_2 = generate_or_load(
    ["data/flores200_dataset/devtest/nob_Latn.devtest"],
    "flores_nb",
)

# Norwegian Nynorsk
nn_token_embedings_1, nn_token_embedings_2 = generate_or_load(
    ["data/flores200_dataset/devtest/nno_Latn.devtest"],
    "flores_nn",
)

# Faroese
fo_token_embedings_1, fo_token_embedings_2 = generate_or_load(
    ["data/flores200_dataset/devtest/fao_Latn.devtest"],
    "flores_fo",
)

# Icelandic
is_token_embedings_1, is_token_embedings_2 = generate_or_load(
    ["data/flores200_dataset/devtest/isl_Latn.devtest"],
    "flores_is",
)

### Load in the regressor

In [3]:
import numpy as np
from sklearn.neural_network import MLPRegressor
import jdata as jd

# Load the stored MLP parameters and configuration from the JSON file.
mlp_conf = jd.load("models/MLP_regressor_8192_sv_500k_opus-mt-NORTH_EU-NORTH_EU_to_opus-mt-SCANDINAVIA-SCANDINAVIA.json")

# Create an unfitted regressor to hold the stored parameters.
# hidden_layer_sizes is given as a real one-element tuple: without the
# trailing comma, `(8192)` is just the integer 8192 in parentheses.
mlp_regressor = MLPRegressor(random_state=1, hidden_layer_sizes=(8192,))

# fit() is never called in this notebook. The fitted attributes (trailing
# underscore) are assigned straight from the loaded configuration so that
# later cells can call mlp_regressor.predict(...) directly.
# NOTE(review): assumes jd.load returns the weights in a form predict()
# accepts (e.g. a list of arrays per layer) - confirm against the JSON file.
mlp_regressor.intercepts_ = mlp_conf["intercepts_"]
mlp_regressor.coefs_ = mlp_conf["coefs_"]
mlp_regressor.n_layers_ = mlp_conf["n_layers_"]
mlp_regressor.out_activation_ = mlp_conf["out_activation_"]

### Measure cosine similarity on FLORES-200 dataset

In [4]:
from src import CosineSimilarity

def _mapped_cosine_similarity(source_embeddings, target_embeddings):
    """Score the regressor's predictions for one set of embeddings.

    Runs ``mlp_regressor.predict`` on ``source_embeddings`` and hands the
    result, together with ``target_embeddings``, to
    ``CosineSimilarity.cosine_similarity_with_stdev``.

    Returns:
        Whatever ``cosine_similarity_with_stdev`` returns. Callers below
        unpack it into a (similarity, standard deviation) pair.
    """
    mapped_embeddings = mlp_regressor.predict(source_embeddings)
    return CosineSimilarity.cosine_similarity_with_stdev(mapped_embeddings, target_embeddings)


# Per-language scores.
sv_cos_sim, sv_stdev = _mapped_cosine_similarity(sv_token_embedings_1, sv_token_embedings_2)
da_cos_sim, da_stdev = _mapped_cosine_similarity(da_token_embedings_1, da_token_embedings_2)
nb_cos_sim, nb_stdev = _mapped_cosine_similarity(nb_token_embedings_1, nb_token_embedings_2)
nn_cos_sim, nn_stdev = _mapped_cosine_similarity(nn_token_embedings_1, nn_token_embedings_2)
fo_cos_sim, fo_stdev = _mapped_cosine_similarity(fo_token_embedings_1, fo_token_embedings_2)
is_cos_sim, is_stdev = _mapped_cosine_similarity(is_token_embedings_1, is_token_embedings_2)

# Stack all six languages along axis 0. The "Average" row printed below is
# computed on these pooled rows, not as the mean of the six per-language values.
# NOTE(review): languages contributing more rows presumably weigh more - confirm
# against the CosineSimilarity implementation.
all_token_embeddings_1 = np.concatenate(
    (
        sv_token_embedings_1,
        da_token_embedings_1,
        nb_token_embedings_1,
        nn_token_embedings_1,
        fo_token_embedings_1,
        is_token_embedings_1,
    ),
    axis=0,
)
all_token_embeddings_2 = np.concatenate(
    (
        sv_token_embedings_2,
        da_token_embedings_2,
        nb_token_embedings_2,
        nn_token_embedings_2,
        fo_token_embedings_2,
        is_token_embedings_2,
    ),
    axis=0,
)

all_cos_sim, all_stdev = _mapped_cosine_similarity(all_token_embeddings_1, all_token_embeddings_2)

# Report each value rounded to three decimals.
print("Cosine similarity when testing MLP on Flores 200 devtest dataset.")

print(f"Swedish: {round(sv_cos_sim, 3)} ± {round(sv_stdev, 3)}")
print(f"Danish: {round(da_cos_sim, 3)} ± {round(da_stdev, 3)}")
print(f"Norwegian Bokmål: {round(nb_cos_sim, 3)} ± {round(nb_stdev, 3)}")
print(f"Norwegian Nynorsk: {round(nn_cos_sim, 3)} ± {round(nn_stdev, 3)}")
print(f"Faroese: {round(fo_cos_sim, 3)} ± {round(fo_stdev, 3)}")
print(f"Icelandic: {round(is_cos_sim, 3)} ± {round(is_stdev, 3)}")
print(f"Average: {round(all_cos_sim, 3)} ± {round(all_stdev, 3)}")

Cosine similarity when testing MLP on Flores 200 devtest dataset.
Swedish: 0.896 ± 0.075
Danish: 0.843 ± 0.109
Norwegian Bokmål: 0.846 ± 0.108
Norwegian Nynorsk: 0.829 ± 0.117
Faroese: 0.726 ± 0.128
Icelandic: 0.717 ± 0.132
Average: 0.782 ± 0.137


### Measure Euclidean distance on FLORES-200 dataset

In [5]:
from src import EuclideanDistance 

def _mapped_relative_euclidean(source_embeddings, target_embeddings):
    """Score the regressor's predictions for one set of embeddings.

    Runs ``mlp_regressor.predict`` on ``source_embeddings`` and hands the
    result, together with ``target_embeddings``, to
    ``EuclideanDistance.relative_euclidean_distance_stdev``.

    Returns:
        Whatever ``relative_euclidean_distance_stdev`` returns. Callers below
        unpack it into a (distance, standard deviation) pair.
    """
    mapped_embeddings = mlp_regressor.predict(source_embeddings)
    return EuclideanDistance.relative_euclidean_distance_stdev(mapped_embeddings, target_embeddings)


# Per-language scores. The `*_euc_sim` names are kept for compatibility with
# the rest of the notebook even though the values are printed as distances.
sv_euc_sim, sv_euc_stdev = _mapped_relative_euclidean(sv_token_embedings_1, sv_token_embedings_2)
da_euc_sim, da_euc_stdev = _mapped_relative_euclidean(da_token_embedings_1, da_token_embedings_2)
nb_euc_sim, nb_euc_stdev = _mapped_relative_euclidean(nb_token_embedings_1, nb_token_embedings_2)
nn_euc_sim, nn_euc_stdev = _mapped_relative_euclidean(nn_token_embedings_1, nn_token_embedings_2)
fo_euc_sim, fo_euc_stdev = _mapped_relative_euclidean(fo_token_embedings_1, fo_token_embedings_2)
is_euc_sim, is_euc_stdev = _mapped_relative_euclidean(is_token_embedings_1, is_token_embedings_2)

# Stack all six languages along axis 0. The "Average" row printed below is
# computed on these pooled rows, not as the mean of the six per-language values.
all_token_embeddings_1 = np.concatenate(
    (
        sv_token_embedings_1,
        da_token_embedings_1,
        nb_token_embedings_1,
        nn_token_embedings_1,
        fo_token_embedings_1,
        is_token_embedings_1,
    ),
    axis=0,
)
all_token_embeddings_2 = np.concatenate(
    (
        sv_token_embedings_2,
        da_token_embedings_2,
        nb_token_embedings_2,
        nn_token_embedings_2,
        fo_token_embedings_2,
        is_token_embedings_2,
    ),
    axis=0,
)

all_euc_sim, all_euc_stdev = _mapped_relative_euclidean(all_token_embeddings_1, all_token_embeddings_2)

# Report each value rounded to three decimals.
print("Euclidean distance when testing MLP on Flores 200 devtest dataset.")

print(f"Swedish: {round(sv_euc_sim, 3)} ± {round(sv_euc_stdev, 3)}")
print(f"Danish: {round(da_euc_sim, 3)} ± {round(da_euc_stdev, 3)}")
print(f"Norwegian Bokmål: {round(nb_euc_sim, 3)} ± {round(nb_euc_stdev, 3)}")
print(f"Norwegian Nynorsk: {round(nn_euc_sim, 3)} ± {round(nn_euc_stdev, 3)}")
print(f"Faroese: {round(fo_euc_sim, 3)} ± {round(fo_euc_stdev, 3)}")
print(f"Icelandic: {round(is_euc_sim, 3)} ± {round(is_euc_stdev, 3)}")
print(f"Average: {round(all_euc_sim, 3)} ± {round(all_euc_stdev, 3)}")

Eucledian distance when testing MLP on Flores 200 devtest dataset.
Swedish: 0.493 ± 0.217
Danish: 0.641 ± 0.318
Norwegian Bokmål: 0.645 ± 0.335
Norwegian Nynorsk: 0.686 ± 0.343
Faroese: 0.963 ± 0.36
Icelandic: 0.996 ± 0.36
Average: 0.813 ± 0.387
