### Load or generate token embeddings

In [None]:
import numpy as np
from tqdm import tqdm

from src import GenerateVectors
from src import CosineSimilarity

# Experiment configuration handed to GenerateVectors.generate_or_load below.
n_sents = 10_000
test_dataset = ["data/da-sv.txt/OpenSubtitles.da-sv.sv"]
outdir = "generated/"
save_prefix = "sv_10k"

# Paths of the two models whose token embeddings are compared.
model_name_1 = "models/opus-mt-NORTH_EU-NORTH_EU"
model_name_2 = "models/opus-mt-SCANDINAVIA-SCANDINAVIA"

# One embedding collection per model; later cells index both with the same
# row ids, so they are presumably row-aligned (TODO confirm in GenerateVectors).
token_embedings_1, token_embedings_2 = GenerateVectors.generate_or_load(
    test_dataset,
    n_sents,
    model_name_1,
    model_name_2,
    outdir,
    save_prefix,
)

# Sanity check: show both shapes, one per line.
print(token_embedings_1.shape, token_embedings_2.shape, sep="\n")

In [None]:
import random
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

def train_and_eval(regressor, data_x, data_y, n_runs, n_samples, n_test=10_000):
    """Fit a regressor on random subsets and return its mean cosine score.

    For each of ``n_runs`` runs, ``n_samples + n_test`` distinct row indices
    are drawn at random; the first ``n_samples`` are used to fit a fresh
    model mapping ``data_x`` rows to ``data_y`` rows, and the remaining
    ``n_test`` rows are used to score the predictions with
    ``CosineSimilarity.cosine_similarity``.

    Args:
        regressor: Which model to train: ``"MLP"`` (``MLPRegressor``),
            ``"LR"`` (``LinearRegression``) or ``"OR"``
            (``OrthogonalRegression``).
        data_x: Input vectors; must support ``len()`` and indexing with a
            list of row indices (e.g. a NumPy array).
        data_y: Target vectors, indexed with the same row ids as ``data_x``.
        n_runs: Number of independent train/evaluate repetitions (>= 1).
        n_samples: Number of training rows per run.
        n_test: Number of held-out rows per run. Defaults to 10_000.

    Returns:
        The sum of the per-run similarity scores divided by ``n_runs``.

    Raises:
        ValueError: If ``regressor`` is not a known name, ``n_runs`` is less
            than 1, or ``data_x`` has fewer than ``n_samples + n_test`` rows.
    """
    known_regressors = ("MLP", "LR", "OR")
    # Fail early: otherwise an unknown name would leave `model` unbound (or
    # silently reuse a stale one) inside the loop.
    if regressor not in known_regressors:
        raise ValueError(
            f"Unknown regressor {regressor!r}; expected one of {known_regressors}"
        )
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    n_available = len(data_x)
    if n_samples + n_test > n_available:
        raise ValueError(
            f"Need {n_samples + n_test} rows (n_samples + n_test) "
            f"but only {n_available} are available"
        )

    similarity_cos = 0

    for _ in range(n_runs):
        # Sampling without replacement keeps train and test rows disjoint.
        batch = random.sample(range(n_available), n_samples + n_test)
        train_ids = batch[:n_samples]
        test_ids = batch[n_samples:]

        if regressor == "MLP":
            # (8192,) is a real one-element tuple: a single hidden layer.
            model = MLPRegressor(random_state=1, hidden_layer_sizes=(8192,))
        elif regressor == "LR":
            model = LinearRegression()
        else:
            # NOTE(review): OrthogonalRegression is not imported in the
            # visible notebook cells -- confirm it is defined elsewhere.
            model = OrthogonalRegression(use_orthogonal_projector=False)

        model = model.fit(data_x[train_ids], data_y[train_ids])

        similarity_cos += CosineSimilarity.cosine_similarity(
            model.predict(data_x[test_ids]), data_y[test_ids]
        )

    return similarity_cos / n_runs

### Perform experiments to find the best regressor

In [None]:
def eval_regressor_on_different_ammount_of_data(regressor, n_samples):
    """Score a regressor while repeatedly doubling the training-set size.

    Starting from ``n_samples`` training rows, calls ``train_and_eval`` on the
    notebook-level ``token_embedings_1`` / ``token_embedings_2`` (10 runs per
    size), prints ``"<size>=<score rounded to 3 decimals>"`` for each size,
    then doubles the size. The loop stops once the training size plus the
    10_000 held-out rows no longer fits strictly inside the available data.
    Finally the list of sizes and the list of scores are printed.

    Args:
        regressor: Regressor name forwarded to ``train_and_eval``.
        n_samples: Initial number of training rows; must be at least 1.

    Raises:
        ValueError: If ``n_samples`` is less than 1 (doubling a non-positive
            size would never terminate the loop).
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples}")

    # Must match the number of held-out rows train_and_eval samples per run.
    test_n = 10_000
    n_runs = 10
    results_n = []
    results_score_cos = []

    while n_samples + test_n < len(token_embedings_1):
        score_cos = train_and_eval(
            regressor, token_embedings_1, token_embedings_2, n_runs, n_samples
        )

        results_n.append(n_samples)
        results_score_cos.append(score_cos)

        print(f"{n_samples}={round(score_cos, 3)}")
        n_samples *= 2

    print(results_n)
    print(results_score_cos)

# Run the data-size sweep for the MLP regressor, starting at 100 training rows.
eval_regressor_on_different_ammount_of_data(regressor="MLP", n_samples=100)

### Train the best regressor

In [None]:
# Draw the training rows at random, without replacement.
train_n_samples = 10_000
train_ids = random.sample(range(len(token_embedings_1)), train_n_samples)

# `(8192)` in the original is a plain int, not a tuple; it is passed as such.
regressor = MLPRegressor(hidden_layer_sizes=8192, random_state=1)
# fit() returns the estimator itself, so `regressor` is the fitted model.
regressor = regressor.fit(
    token_embedings_1[train_ids],
    token_embedings_2[train_ids],
)

### Save the MLP regressor

In [None]:
import jdata as jd

# Destination file, named after the save prefix and the two model names.
regressor_name = f"models/MLP_regressor_{GenerateVectors.get_output_file_name(save_prefix, model_name_1, model_name_2)}.json"

# Collect the fitted attributes that the loading cell restores.
mlp_json = {
    attr: getattr(regressor, attr)
    for attr in ("intercepts_", "coefs_", "n_layers_", "out_activation_")
}

jd.save(mlp_json, regressor_name)

### Load the trained MLP regressor

In [None]:
# Read back the attribute dictionary written by the saving cell.
mlp_conf = jd.load(regressor_name)

# Unfitted estimator with the same constructor arguments as the trained one;
# `(8192)` in the original is a plain int, so it is passed as such.
loaded_regressor = MLPRegressor(hidden_layer_sizes=8192, random_state=1)

# Restore the saved fitted attributes one by one instead of re-training.
for fitted_attr in ("intercepts_", "coefs_", "n_layers_", "out_activation_"):
    setattr(loaded_regressor, fitted_attr, mlp_conf[fitted_attr])