In [3]:
import numpy as np
import pandas as pd

import sys
sys.path.append("../../..")
from utils import utils, postprocessing_utils as post

In [12]:
# Run configuration consumed by the table-building cell below.
# results_dir, experiment, short_model_name, task and metric are all passed to post.Metrics.
results_dir = "../../../results/"
# Also selects the data_exploration/<experiment>/ input tables and the
# analysis/stat_tests/<experiment>/ output folder.
experiment = "acl"
# Also part of the basic-stats / tokenizer-stats input file names and the output file name.
short_model_name = "xlm-roberta"
# Also part of the input/output file names; "sentiment" additionally enables the
# sentiment-balance merge.
task = "sentiment"
# Metric name handed to post.Metrics (not used anywhere else in the visible cells).
metric = "Macro_F1"

In [13]:
def _merge_language_stats(frame, stats, key, language_col, rename=None):
    """Left-join per-language columns of ``stats`` onto ``frame``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table holding a ``language_col`` column to look values up for.
    stats : pd.DataFrame
        Lookup table containing ``key`` plus the value column(s) to attach.
    key : str
        Name of the language column in ``stats``.
    language_col : str
        Column of ``frame`` matched against ``stats[key]``.
    rename : dict, optional
        Mapping from value-column names in ``stats`` to their names in the result.

    Returns
    -------
    pd.DataFrame
        ``frame`` with the value columns of ``stats`` appended (row order of
        ``frame`` preserved, unmatched languages get NaN).

    The key is renamed to ``language_col`` before merging so the join adds only
    the value columns. Merging with ``left_on``/``right_on`` instead leaves a
    helper key column behind on every call; after a few calls the automatic
    suffixes produce duplicate labels, which pandas >= 2.0 rejects with
    ``MergeError``.
    """
    lookup = stats.rename(columns={key: language_col, **(rename or {})})
    return frame.merge(lookup, how="left", on=language_col)


M = post.Metrics(results_dir, experiment, short_model_name, task, metric)
df = M.transfer_loss()
# Scale the score columns by 100 (presumably fractions -> percentages; confirm
# against post.Metrics.transfer_loss).
df[["Within-Score", "Cross-Score", "Transfer-Loss"]] *= 100
tables_path = "../../../data_exploration/{}/tables/".format(experiment)

# Basic stats: average token counts for the train and the test language
basic_stats = pd.read_excel(
    tables_path + "basic_stats_{}_{}.xlsx".format(task, short_model_name),
    usecols=["language", "train_avg_tokens", "test_avg_tokens"],
)
df = _merge_language_stats(df, basic_stats[["language", "train_avg_tokens"]],
                           key="language", language_col="Train-Language")
df = _merge_language_stats(df, basic_stats[["language", "test_avg_tokens"]],
                           key="language", language_col="Test-Language")

# Tokenizer stats: split-word percentage for the train and the test language
tokenizer_stats = pd.read_excel(
    tables_path + "tokenizer_stats_{}_{}.xlsx".format(task, short_model_name)
)
df = _merge_language_stats(df, tokenizer_stats[["language", "avg_split_words_train(%)"]],
                           key="language", language_col="Train-Language")
df = _merge_language_stats(df, tokenizer_stats[["language", "avg_split_words_test(%)"]],
                           key="language", language_col="Test-Language")

# Sentiment balance: the same "Ratio" column is attached once per side
if task == "sentiment":
    sentiment_balance = pd.read_excel(tables_path + "sentiment_balance.xlsx")[["Language", "Ratio"]]
    df = _merge_language_stats(df, sentiment_balance,
                               key="Language", language_col="Train-Language",
                               rename={"Ratio": "positive_frequency_trainlang"})
    df = _merge_language_stats(df, sentiment_balance,
                               key="Language", language_col="Test-Language",
                               rename={"Ratio": "positive_frequency_testlang"})

# Pretrain: the same "Pretrained" column is attached once per side
pretrain_size = pd.read_excel(tables_path + "pretrain_size.xlsx", usecols=["Language", "Pretrained"])
df = _merge_language_stats(df, pretrain_size,
                           key="Language", language_col="Train-Language",
                           rename={"Pretrained": "Pretrained_trainlang"})
df = _merge_language_stats(df, pretrain_size,
                           key="Language", language_col="Test-Language",
                           rename={"Pretrained": "Pretrained_testlang"})

# Intra/inter group: "Intra" when train and test language share a group
df["Transfer-Type"] = np.where(df["Train-Group"] == df["Test-Group"], "Intra", "Inter")

# No helper key columns are created by the merges above, so nothing has to be
# dropped before exporting.
df.to_excel(
    "../../../analysis/stat_tests/{}/tables/full_table_{}_{}.xlsx".format(experiment, task, short_model_name),
    index=False,
)