# Evaluate Translations

In [55]:
# Benchmarks whose translations are evaluated. get_metrics() reads one
# metrics/<dataset>/<model>.json file for every (dataset, model) pair.
_DATASETS = [
    "xstory_cloze",
    "xcopa",
    "xnli",
    "paws-x",
    "mgsm",
]

# Translation systems to evaluate; commented-out entries are skipped.
# NOTE: the number and order of the active entries must stay in step with
# model_names_all / model_sizes_all, which get_dataframes_model() assigns to
# the table rows purely by position.
_MODELS = [
    "nllb-200-distilled-600M",
    "nllb-200-distilled-1.3B",
    "nllb-200-1.3B",
    "nllb-200-3.3B",
    "xglm-564M",
    "xglm-1.7B",
    "xglm-2.9B",
    #"xglm-4.5B",
    "xglm-7.5B",
    "bloom-560m",
    #"bloom-1b1",
    "bloom-1b7",
    "bloom-3b",
    "bloom-7b1",
    "llama-7B",
    "llama-13B",
    "llama-30B",
    # "llama-65B",
    #"RedPajama-INCITE-Base-3B-v1",
    #"RedPajama-INCITE-7B-Base",
    #"open_llama_3b",
    #"open_llama_7b",
    #"open_llama_13b",
]

In [56]:
"""_MODELS = [
    "nllb-200-3.3B",
    "xglm-564M",
    "xglm-1.7B",
    "xglm-2.9B",
    "xglm-7.5B",
]"""

# Sizes of the evaluated models, grouped by family and listed in the same
# order as the family's active entries in _MODELS. They are used as the x tick
# positions of the "Model size (B)" axis in the plots.
model_sizes = dict(
    nllb=[0.6, 1.3, 1.3, 3.3],
    xglm=[0.6, 1.7, 2.9, 7.5],
    bloom=[0.6, 1.7, 3.0, 7.1],
    llama=[7.0, 13.0, 30.0],
)

# Family label of every model row: each family name repeated once per size.
model_names_all = [
    family for family, family_sizes in model_sizes.items() for _ in family_sizes
]

# Flat list with the sizes of all models, one family after the other.
model_sizes_all = [
    size for family_sizes in model_sizes.values() for size in family_sizes
]

In [57]:
# Named language groupings. Each preset maps a group label to the language
# codes whose scores add_avg() averages together under that label.
# Previously `languages` was assigned three times in a row and only the last
# assignment (`{}`) had any effect; the alternatives are now kept as presets.
LANGUAGE_GROUP_PRESETS = {
    # Grouping that was labelled "mikel" in this cell.
    "mikel": {
        "high": ["es", "ru", "zh", "it", "id"],
        "low": ["sw", "hi", "ur", "my", "ta", "ht"],
    },
    # Grouping that was labelled "xglm" in this cell.
    "xglm": {
        "high": ["ru", "zh", "de", "es", "fr", "ja"],
        # "medium": ["it", "pt", "el", "ko", "fi", "id", "tr", "ar", "vi", "th", "bg", "ca"],
        "low": ["hi", "et", "bn", "ta", "ur", "sw", "te", "eu", "my", "ht", "qu"],
        # "ex_low": ["eu", "my", "ht", "qu"]
    },
    # No grouping: only the overall "avg" value is computed.
    "none": {},
}

# Preset currently in effect. "none" reproduces the former effective value
# (`languages = {}`); switch the key to enable one of the groupings above.
ACTIVE_LANGUAGE_PRESET = "none"

languages = dict(LANGUAGE_GROUP_PRESETS[ACTIVE_LANGUAGE_PRESET])

In [58]:
# Get results from metrics folder

from collections import defaultdict
import json

def get_metrics(metrics_dir="metrics", datasets=None, models=None):
    """Load the translation metrics and add a per-language average over fields.

    Reads ``{metrics_dir}/{dataset}/{model}.json`` for every dataset/model
    pair. Judging by how the loaded data is traversed, each file maps
    ``language -> field -> {metric: value}``. For every language an extra
    ``"avg"`` entry is stored next to the fields: for each metric, the sum of
    its values over the fields divided by the number of fields of that
    language, rounded to 2 decimals. A field that lacks a metric therefore
    still counts in the divisor.

    Args:
        metrics_dir: Directory that contains one sub-directory per dataset.
        datasets: Dataset names to load; defaults to the module-level
            ``_DATASETS``.
        models: Model names to load; defaults to the module-level ``_MODELS``.

    Returns:
        dict: ``{dataset: {model: {language: {field: {...}, "avg": {...}}}}}``.

    Raises:
        FileNotFoundError: If one of the expected metrics files is missing.
    """
    if datasets is None:
        datasets = _DATASETS
    if models is None:
        models = _MODELS

    # NOTE(review): the bloom-560m row of xnli is filled from bloom-1b1.json.
    # Presumably the bloom-560m file is unavailable for that dataset --
    # confirm that reporting these numbers under "bloom-560m" is intended.
    file_overrides = {("xnli", "bloom-560m"): "bloom-1b1"}

    metrics_dict = defaultdict(dict)
    for dataset_name in datasets:
        for model_name in models:
            file_stem = file_overrides.get((dataset_name, model_name), model_name)
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(f"{metrics_dir}/{dataset_name}/{file_stem}.json", encoding="utf-8") as f:
                model_metrics = json.load(f)

            for fields in model_metrics.values():
                totals = defaultdict(float)
                for field_metrics in fields.values():
                    for metric, value in field_metrics.items():
                        totals[metric] += value
                # Taken before "avg" is inserted, so the average does not count itself.
                n_fields = len(fields)
                fields["avg"] = {
                    metric: round(total / n_fields, 2) for metric, total in totals.items()
                }

            metrics_dict[dataset_name][model_name] = model_metrics

    return dict(metrics_dict)
                
# Load every metrics file once; add_avg() in the next cell works from this dict.
metrics_dict = get_metrics()

In [59]:
from copy import deepcopy

def add_avg(metrics_dict, metrics=("sacrebleu", "chrf++", "comet"), language_groups=None):
    """Regroup the metrics by metric name and attach averages over languages.

    For every metric, the per-language ``"avg"`` entry of ``metrics_dict`` is
    reduced to the single value of that metric (``0`` when the metric is
    absent). Each model then gets:

    * ``"avg"``: mean over all of its languages, rounded to 1 decimal;
    * one entry per group in ``language_groups``: mean over the languages of
      that group that the model has, rounded to 1 decimal. A group with no
      matching language is omitted.

    The mean is taken once, over the language scores only. A model without
    any language gets no ``"avg"`` entry instead of raising
    ``ZeroDivisionError``. ``metrics_dict`` itself is not modified.

    Args:
        metrics_dict: ``{dataset: {model: {language: {"avg": {metric: value}}}}}``.
        metrics: Metric names to extract.
        language_groups: ``{group label: [language codes]}``; defaults to the
            module-level ``languages``.

    Returns:
        dict: ``{metric: {dataset: {model: {language: value, "avg": mean,
        group label: mean}}}}``.

    Raises:
        KeyError: If a language entry has no ``"avg"`` key.
    """
    if language_groups is None:
        language_groups = languages

    metrics_dict_split = {}
    for metric in metrics:
        per_dataset = {}
        for dataset_name, dataset_models in metrics_dict.items():
            per_model = {}
            for model_name, model_languages in dataset_models.items():
                scores = {
                    language: language_dict["avg"].get(metric, 0)
                    for language, language_dict in model_languages.items()
                }
                summary = dict(scores)
                if scores:
                    summary["avg"] = round(sum(scores.values()) / len(scores), 1)
                for resource, langs in language_groups.items():
                    group_scores = [
                        score for language, score in scores.items() if language in langs
                    ]
                    if group_scores:
                        summary[resource] = round(sum(group_scores) / len(group_scores), 1)
                per_model[model_name] = summary
            per_dataset[dataset_name] = per_model
        metrics_dict_split[metric] = per_dataset
    return metrics_dict_split

# Regroup the loaded metrics by metric name and attach the averages.
metrics_dict_split = add_avg(metrics_dict)


In [60]:
import matplotlib.pyplot as plt

def plot_size_df_datasets(
    df,
    model_name,
    title,
    langs=False,
    baseline_model="nllb",
    baseline_size=3.3,
    ylabel="Average COMET",
    ylim=(55, 90),
):
    """Plot scores against model size for one model family.

    One figure is drawn for ``"avg"`` and for every key of the module-level
    ``languages`` that is a column of ``df``. Each figure shows the selected
    family's scores over its sizes (log x-axis) and, when available, a
    horizontal reference line at the baseline model's score. When ``title``
    is the empty string the figure is also saved to ``plots/{column}.pdf``.

    ``df`` is not modified, so the same frame can be plotted repeatedly.

    Args:
        df: Frame with a ``model`` column (family name), a ``size`` column and
            one numeric column per score.
        model_name: Family to plot; also a key of the module-level
            ``model_sizes``, whose values are used as x ticks.
        title: Prefix of the figure titles.
        langs: If true, additionally draw one figure per remaining column
            (everything except ``dataset``, ``model``, ``avg`` and ``size``).
        baseline_model: Family of the reference system for the horizontal line.
        baseline_size: Size of the reference system for the horizontal line.
        ylabel: Label of the y-axis of the average figures.
        ylim: ``(low, high)`` limits of the y-axis of the average figures, or
            ``None`` to keep matplotlib's automatic limits.
    """
    import os

    # Indexed copy: set_index(..., inplace=True) would strip the "size" column
    # from the caller's frame and make a second call fail with KeyError.
    by_size = df.set_index("size")
    # select only the model
    df_model = by_size[by_size["model"] == model_name]

    for average in ["avg"] + list(languages.keys()):
        if average not in by_size.columns:
            continue
        # A Series is plotted against its index, i.e. the model size.
        df_model[average].plot(
            title=f"{title} {average}",
            ylabel=ylabel,
            xlabel="Model size (B)",
            legend=True,
            marker="o",
            label="Self-translate",
            color="C2",
        )
        # Horizontal reference line for the baseline system (NLLB-200-3.3B by
        # default); skipped when that row is not part of the frame.
        is_baseline = (
            (by_size["model"] == baseline_model) & (by_size.index == baseline_size)
        ).to_numpy()
        baseline_scores = by_size.loc[is_baseline, average]
        if not baseline_scores.empty:
            plt.axhline(y=baseline_scores.iloc[0], color="C1", linestyle="--", label="MT (NLLB)")
        plt.legend()
        plt.xscale("log")
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xticks(model_sizes[model_name], model_sizes[model_name], rotation="vertical")
        if title == "":
            os.makedirs("plots", exist_ok=True)
            plt.savefig(f"plots/{average}.pdf", bbox_inches='tight')
        plt.show()

    # iterate all langs
    if langs:
        # "model" holds strings and cannot be plotted; it must be skipped too.
        skipped_columns = {"dataset", "model", "avg", "size"}
        for lang in by_size.columns:
            if lang in skipped_columns:
                continue
            # Plot the selected family only, consistent with the x ticks below.
            df_model[lang].plot(
                title=f"{title}_{lang}",
                ylabel="BLEU",
                xlabel="Model size (B)",
                legend=True,
                marker="o",
            )
            plt.xscale("log")
            plt.xticks(model_sizes[model_name], model_sizes[model_name], rotation="vertical")
            plt.show()

In [64]:
# Build, display and print (as LaTeX) the per-dataset and averaged score tables; the plotting calls are currently commented out

import matplotlib.pyplot as plt
import pandas as pd

def _with_model_columns(table):
    """Return a copy of ``table`` with leading ``model`` and ``size`` columns.

    The values come from the module-level ``model_names_all`` and
    ``model_sizes_all`` and are assigned to the rows by position.
    """
    table = table.copy()
    table["model"] = model_names_all
    table["size"] = model_sizes_all
    leading = ["model", "size"]
    return table.reindex(columns=leading + [col for col in table.columns if col not in leading])


def _show_table(table):
    """Render ``table`` in the notebook and print its LaTeX source."""
    display(table)
    print(table.to_latex(index=False))


def get_dataframes_model(metrics_dict_split, model_name, metrics=("sacrebleu",)):
    """Display and print the score tables for every dataset plus the summaries.

    For each metric, one table per dataset is shown (rows: models, columns:
    languages and averages). Afterwards one summary table is shown for
    ``"avg"`` and for every key of the module-level ``languages``: it has one
    column per dataset holding that average and a final column with the mean
    over the datasets, rounded to 1 decimal.

    Args:
        metrics_dict_split: ``{metric: {dataset: {model: {column: value}}}}``
            as returned by ``add_avg``.
        model_name: Currently unused; only referenced by the disabled
            plotting calls below.
        metrics: Metrics to report; other options are ``"chrf++"`` and
            ``"comet"``.

    Returns:
        None. The tables are emitted through ``display`` and ``print``.
    """
    averages = ["avg"] + list(languages.keys())
    for metric in metrics:
        # One summary frame per average, indexed by model name so that the
        # per-dataset columns align on the model.
        df_avg = {average: pd.DataFrame(index=_MODELS) for average in averages}
        for dataset_name, dataset_scores in metrics_dict_split[metric].items():
            df = pd.DataFrame(dataset_scores).T
            for average in averages:
                # A language group is missing when the dataset has none of
                # its languages.
                if average in df.columns:
                    df_avg[average][dataset_name] = df[average]
            _show_table(_with_model_columns(df))
            # x labels vertical
            # plot_size_df_datasets(df, model_name, f"{dataset_name} {metric}")
        for average in averages:
            summary = df_avg[average]
            # Average the dataset columns only. Taking the mean of the whole
            # frame would include the string "model" column, which triggers a
            # FutureWarning on older pandas and a TypeError on current pandas.
            dataset_columns = list(summary.columns)
            summary[average] = summary[dataset_columns].mean(axis=1).round(1)
            df_avg[average] = _with_model_columns(summary)
            _show_table(df_avg[average])
            # plot_size_df_datasets(df_avg[average], model_name, title="")

# Show and print the per-dataset tables followed by the cross-dataset summary.
get_dataframes_model(metrics_dict_split, "xglm")

Unnamed: 0,model,size,ru,zh,es,ar,hi,id,te,sw,eu,my,avg
nllb-200-distilled-600M,nllb,0.6,40.98,30.04,47.98,49.46,45.07,38.44,29.45,41.51,35.24,22.0,38.0
nllb-200-distilled-1.3B,nllb,1.3,44.12,30.57,50.52,53.09,48.62,40.98,32.19,43.86,33.77,28.18,40.6
nllb-200-1.3B,nllb,1.3,43.22,32.07,50.42,52.91,48.08,41.13,31.39,44.17,35.63,29.94,40.9
nllb-200-3.3B,nllb,3.3,44.59,34.8,51.33,54.8,49.16,42.27,33.09,45.0,33.55,29.69,41.8
xglm-564M,xglm,0.6,15.67,1.54,14.36,6.16,7.52,16.92,1.28,3.82,2.81,0.67,7.1
xglm-1.7B,xglm,1.7,25.62,16.08,28.64,21.4,16.22,26.07,10.46,21.17,11.38,7.94,18.5
xglm-2.9B,xglm,2.9,29.08,21.68,36.22,26.32,24.91,28.86,11.37,27.19,20.04,12.4,23.8
xglm-7.5B,xglm,7.5,34.4,25.2,40.85,34.45,30.32,33.59,17.05,33.48,23.33,16.84,29.0
bloom-560m,bloom,0.6,0.37,9.67,20.55,14.7,9.94,19.55,1.93,0.43,1.96,0.11,7.9
bloom-1b7,bloom,1.7,9.03,22.26,35.84,26.14,18.45,27.74,9.01,12.67,11.56,0.06,17.3


\begin{tabular}{lrrrrrrrrrrrr}
\toprule
model &  size &    ru &    zh &    es &    ar &    hi &    id &    te &    sw &    eu &    my &  avg \\
\midrule
 nllb &   0.6 & 40.98 & 30.04 & 47.98 & 49.46 & 45.07 & 38.44 & 29.45 & 41.51 & 35.24 & 22.00 & 38.0 \\
 nllb &   1.3 & 44.12 & 30.57 & 50.52 & 53.09 & 48.62 & 40.98 & 32.19 & 43.86 & 33.77 & 28.18 & 40.6 \\
 nllb &   1.3 & 43.22 & 32.07 & 50.42 & 52.91 & 48.08 & 41.13 & 31.39 & 44.17 & 35.63 & 29.94 & 40.9 \\
 nllb &   3.3 & 44.59 & 34.80 & 51.33 & 54.80 & 49.16 & 42.27 & 33.09 & 45.00 & 33.55 & 29.69 & 41.8 \\
 xglm &   0.6 & 15.67 &  1.54 & 14.36 &  6.16 &  7.52 & 16.92 &  1.28 &  3.82 &  2.81 &  0.67 &  7.1 \\
 xglm &   1.7 & 25.62 & 16.08 & 28.64 & 21.40 & 16.22 & 26.07 & 10.46 & 21.17 & 11.38 &  7.94 & 18.5 \\
 xglm &   2.9 & 29.08 & 21.68 & 36.22 & 26.32 & 24.91 & 28.86 & 11.37 & 27.19 & 20.04 & 12.40 & 23.8 \\
 xglm &   7.5 & 34.40 & 25.20 & 40.85 & 34.45 & 30.32 & 33.59 & 17.05 & 33.48 & 23.33 & 16.84 & 29.0 \\
bloom &   0.6 &

  print(df.to_latex(index=False))


Unnamed: 0,model,size,et,ht,it,id,qu,sw,zh,ta,th,tr,vi,avg
nllb-200-distilled-600M,nllb,0.6,39.07,33.85,45.88,33.15,9.26,32.29,35.16,32.33,21.23,37.66,32.81,32.1
nllb-200-distilled-1.3B,nllb,1.3,45.42,40.4,51.01,37.41,12.02,35.57,38.2,37.47,24.75,42.61,37.47,36.6
nllb-200-1.3B,nllb,1.3,43.75,38.26,50.93,37.22,10.48,35.39,38.52,37.36,23.36,40.93,35.67,35.6
nllb-200-3.3B,nllb,3.3,45.57,40.42,52.45,38.12,11.38,36.91,42.42,38.34,26.36,43.06,38.9,37.6
xglm-564M,xglm,0.6,12.08,9.37,10.06,12.99,0.35,2.96,0.92,2.29,7.67,4.62,8.73,6.5
xglm-1.7B,xglm,1.7,25.29,20.36,28.12,23.88,1.16,15.62,22.94,12.69,12.8,15.54,20.31,18.1
xglm-2.9B,xglm,2.9,34.93,25.21,32.88,27.51,1.91,21.7,29.21,17.77,22.52,22.32,29.36,24.1
xglm-7.5B,xglm,7.5,39.55,28.41,40.18,31.9,4.11,27.25,32.5,25.27,24.79,26.41,32.14,28.4
bloom-560m,bloom,0.6,0.09,0.22,2.4,16.07,0.17,0.11,13.7,4.35,0.08,0.1,15.63,4.8
bloom-1b7,bloom,1.7,0.24,0.59,13.94,25.17,0.37,6.59,28.91,12.37,0.08,0.2,27.26,10.5


\begin{tabular}{lrrrrrrrrrrrrr}
\toprule
model &  size &    et &    ht &    it &    id &    qu &    sw &    zh &    ta &    th &    tr &    vi &  avg \\
\midrule
 nllb &   0.6 & 39.07 & 33.85 & 45.88 & 33.15 &  9.26 & 32.29 & 35.16 & 32.33 & 21.23 & 37.66 & 32.81 & 32.1 \\
 nllb &   1.3 & 45.42 & 40.40 & 51.01 & 37.41 & 12.02 & 35.57 & 38.20 & 37.47 & 24.75 & 42.61 & 37.47 & 36.6 \\
 nllb &   1.3 & 43.75 & 38.26 & 50.93 & 37.22 & 10.48 & 35.39 & 38.52 & 37.36 & 23.36 & 40.93 & 35.67 & 35.6 \\
 nllb &   3.3 & 45.57 & 40.42 & 52.45 & 38.12 & 11.38 & 36.91 & 42.42 & 38.34 & 26.36 & 43.06 & 38.90 & 37.6 \\
 xglm &   0.6 & 12.08 &  9.37 & 10.06 & 12.99 &  0.35 &  2.96 &  0.92 &  2.29 &  7.67 &  4.62 &  8.73 &  6.5 \\
 xglm &   1.7 & 25.29 & 20.36 & 28.12 & 23.88 &  1.16 & 15.62 & 22.94 & 12.69 & 12.80 & 15.54 & 20.31 & 18.1 \\
 xglm &   2.9 & 34.93 & 25.21 & 32.88 & 27.51 &  1.91 & 21.70 & 29.21 & 17.77 & 22.52 & 22.32 & 29.36 & 24.1 \\
 xglm &   7.5 & 39.55 & 28.41 & 40.18 & 31.90 &  4.11 

  print(df.to_latex(index=False))


Unnamed: 0,model,size,ar,bg,de,el,es,fr,hi,ru,sw,th,tr,ur,vi,zh,avg
nllb-200-distilled-600M,nllb,0.6,37.99,41.39,44.65,46.13,50.92,45.09,38.09,31.41,34.09,28.16,36.28,30.61,39.1,27.71,38.0
nllb-200-distilled-1.3B,nllb,1.3,41.09,43.8,46.97,48.54,53.02,47.17,40.78,33.49,36.3,30.0,39.24,32.84,41.81,29.48,40.3
nllb-200-1.3B,nllb,1.3,40.56,43.62,46.69,48.37,53.05,46.81,40.4,33.36,36.45,29.9,39.0,32.28,41.41,29.52,40.1
nllb-200-3.3B,nllb,3.3,42.19,45.08,47.66,50.05,53.8,47.73,41.73,33.98,37.89,31.35,40.61,33.86,43.2,31.31,41.5
xglm-564M,xglm,0.6,5.54,17.83,19.91,14.67,17.56,20.52,5.91,12.07,4.97,7.25,4.38,4.5,8.85,1.67,10.4
xglm-1.7B,xglm,1.7,16.34,27.2,30.3,30.86,31.54,29.73,12.77,18.83,16.63,15.23,11.78,9.81,21.11,12.36,20.3
xglm-2.9B,xglm,2.9,19.63,30.91,34.54,35.14,34.76,32.98,17.96,22.45,20.83,17.68,15.09,13.58,24.71,16.84,24.1
xglm-7.5B,xglm,7.5,26.52,35.23,38.8,39.16,41.56,38.93,22.09,25.91,26.29,22.56,19.71,17.61,29.08,19.8,28.8
bloom-560m,bloom,0.6,17.71,1.35,12.21,1.08,33.99,33.08,12.62,2.1,4.35,0.92,0.9,7.53,22.3,14.71,11.8
bloom-1b7,bloom,1.7,21.61,3.34,16.19,2.71,37.73,36.64,15.36,8.77,10.58,1.07,1.21,10.26,26.12,16.82,14.9


\begin{tabular}{lrrrrrrrrrrrrrrrr}
\toprule
model &  size &    ar &    bg &    de &    el &    es &    fr &    hi &    ru &    sw &    th &    tr &    ur &    vi &    zh &  avg \\
\midrule
 nllb &   0.6 & 37.99 & 41.39 & 44.65 & 46.13 & 50.92 & 45.09 & 38.09 & 31.41 & 34.09 & 28.16 & 36.28 & 30.61 & 39.10 & 27.71 & 38.0 \\
 nllb &   1.3 & 41.09 & 43.80 & 46.97 & 48.54 & 53.02 & 47.17 & 40.78 & 33.49 & 36.30 & 30.00 & 39.24 & 32.84 & 41.81 & 29.48 & 40.3 \\
 nllb &   1.3 & 40.56 & 43.62 & 46.69 & 48.37 & 53.05 & 46.81 & 40.40 & 33.36 & 36.45 & 29.90 & 39.00 & 32.28 & 41.41 & 29.52 & 40.1 \\
 nllb &   3.3 & 42.19 & 45.08 & 47.66 & 50.05 & 53.80 & 47.73 & 41.73 & 33.98 & 37.89 & 31.35 & 40.61 & 33.86 & 43.20 & 31.31 & 41.5 \\
 xglm &   0.6 &  5.54 & 17.83 & 19.91 & 14.67 & 17.56 & 20.52 &  5.91 & 12.07 &  4.97 &  7.25 &  4.38 &  4.50 &  8.85 &  1.67 & 10.4 \\
 xglm &   1.7 & 16.34 & 27.20 & 30.30 & 30.86 & 31.54 & 29.73 & 12.77 & 18.83 & 16.63 & 15.23 & 11.78 &  9.81 & 21.11 & 12.36 & 20.

  print(df.to_latex(index=False))


Unnamed: 0,model,size,de,es,fr,ja,ko,zh,avg
nllb-200-distilled-600M,nllb,0.6,59.41,64.8,61.18,33.09,38.52,36.94,49.0
nllb-200-distilled-1.3B,nllb,1.3,60.52,65.56,62.66,37.53,41.48,40.08,51.3
nllb-200-1.3B,nllb,1.3,60.66,65.72,62.52,36.8,40.77,38.89,50.9
nllb-200-3.3B,nllb,3.3,61.19,66.02,62.91,38.12,41.97,41.21,51.9
xglm-564M,xglm,0.6,30.41,31.7,34.0,2.89,5.64,3.42,18.0
xglm-1.7B,xglm,1.7,44.35,47.33,43.03,9.13,14.64,11.34,28.3
xglm-2.9B,xglm,2.9,48.69,51.59,48.39,14.21,19.19,16.79,33.1
xglm-7.5B,xglm,7.5,51.22,54.58,53.12,18.27,24.89,20.09,37.0
bloom-560m,bloom,0.6,15.95,33.98,34.67,2.79,1.06,8.69,16.2
bloom-1b7,bloom,1.7,32.25,50.68,49.56,7.38,5.61,17.85,27.2


\begin{tabular}{lrrrrrrrr}
\toprule
model &  size &    de &    es &    fr &    ja &    ko &    zh &  avg \\
\midrule
 nllb &   0.6 & 59.41 & 64.80 & 61.18 & 33.09 & 38.52 & 36.94 & 49.0 \\
 nllb &   1.3 & 60.52 & 65.56 & 62.66 & 37.53 & 41.48 & 40.08 & 51.3 \\
 nllb &   1.3 & 60.66 & 65.72 & 62.52 & 36.80 & 40.77 & 38.89 & 50.9 \\
 nllb &   3.3 & 61.19 & 66.02 & 62.91 & 38.12 & 41.97 & 41.21 & 51.9 \\
 xglm &   0.6 & 30.41 & 31.70 & 34.00 &  2.89 &  5.64 &  3.42 & 18.0 \\
 xglm &   1.7 & 44.35 & 47.33 & 43.03 &  9.13 & 14.64 & 11.34 & 28.3 \\
 xglm &   2.9 & 48.69 & 51.59 & 48.39 & 14.21 & 19.19 & 16.79 & 33.1 \\
 xglm &   7.5 & 51.22 & 54.58 & 53.12 & 18.27 & 24.89 & 20.09 & 37.0 \\
bloom &   0.6 & 15.95 & 33.98 & 34.67 &  2.79 &  1.06 &  8.69 & 16.2 \\
bloom &   1.7 & 32.25 & 50.68 & 49.56 &  7.38 &  5.61 & 17.85 & 27.2 \\
bloom &   3.0 & 39.59 & 54.56 & 53.02 & 11.09 &  6.83 & 21.66 & 31.1 \\
bloom &   7.1 & 45.61 & 58.41 & 56.59 & 15.89 & 12.61 & 27.48 & 36.1 \\
llama &   7.0 & 56.

  print(df.to_latex(index=False))


Unnamed: 0,model,size,es,fr,de,ru,zh,ja,th,sw,bn,te,avg
nllb-200-distilled-600M,nllb,0.6,48.34,34.85,44.57,31.39,28.14,17.99,17.37,34.62,28.58,34.68,32.1
nllb-200-distilled-1.3B,nllb,1.3,57.94,44.44,54.21,45.11,33.23,29.69,19.62,46.91,40.8,41.54,41.3
nllb-200-1.3B,nllb,1.3,56.78,44.0,52.64,42.11,33.91,33.51,19.83,47.51,39.82,38.45,40.9
nllb-200-3.3B,nllb,3.3,57.91,44.26,53.41,44.85,38.44,35.59,24.3,51.37,42.89,44.02,43.7
xglm-564M,xglm,0.6,12.94,11.3,15.94,7.53,1.77,0.82,1.22,1.27,0.77,0.6,5.4
xglm-1.7B,xglm,1.7,36.77,24.31,33.33,23.89,8.26,6.14,9.32,16.76,5.43,6.5,17.1
xglm-2.9B,xglm,2.9,44.5,32.7,40.77,33.2,13.25,14.41,10.71,24.7,11.8,9.28,23.5
xglm-7.5B,xglm,7.5,45.04,33.37,41.55,34.7,20.75,20.09,18.44,31.32,19.11,18.63,28.3
bloom-560m,bloom,0.6,19.4,13.29,4.75,0.38,7.83,1.14,0.06,0.67,4.33,1.97,5.4
bloom-1b7,bloom,1.7,28.14,25.34,17.91,9.39,15.72,5.4,0.14,7.56,9.1,7.23,12.6


\begin{tabular}{lrrrrrrrrrrrr}
\toprule
model &  size &    es &    fr &    de &    ru &    zh &    ja &    th &    sw &    bn &    te &  avg \\
\midrule
 nllb &   0.6 & 48.34 & 34.85 & 44.57 & 31.39 & 28.14 & 17.99 & 17.37 & 34.62 & 28.58 & 34.68 & 32.1 \\
 nllb &   1.3 & 57.94 & 44.44 & 54.21 & 45.11 & 33.23 & 29.69 & 19.62 & 46.91 & 40.80 & 41.54 & 41.3 \\
 nllb &   1.3 & 56.78 & 44.00 & 52.64 & 42.11 & 33.91 & 33.51 & 19.83 & 47.51 & 39.82 & 38.45 & 40.9 \\
 nllb &   3.3 & 57.91 & 44.26 & 53.41 & 44.85 & 38.44 & 35.59 & 24.30 & 51.37 & 42.89 & 44.02 & 43.7 \\
 xglm &   0.6 & 12.94 & 11.30 & 15.94 &  7.53 &  1.77 &  0.82 &  1.22 &  1.27 &  0.77 &  0.60 &  5.4 \\
 xglm &   1.7 & 36.77 & 24.31 & 33.33 & 23.89 &  8.26 &  6.14 &  9.32 & 16.76 &  5.43 &  6.50 & 17.1 \\
 xglm &   2.9 & 44.50 & 32.70 & 40.77 & 33.20 & 13.25 & 14.41 & 10.71 & 24.70 & 11.80 &  9.28 & 23.5 \\
 xglm &   7.5 & 45.04 & 33.37 & 41.55 & 34.70 & 20.75 & 20.09 & 18.44 & 31.32 & 19.11 & 18.63 & 28.3 \\
bloom &   0.6 &

  print(df.to_latex(index=False))
  df_avg[average][average] = df_avg[average].mean(axis=1).round(1)


Unnamed: 0,model,size,xstory_cloze,xcopa,xnli,paws-x,mgsm,avg
nllb-200-distilled-600M,nllb,0.6,38.0,32.1,38.0,49.0,32.1,37.8
nllb-200-distilled-1.3B,nllb,1.3,40.6,36.6,40.3,51.3,41.3,42.0
nllb-200-1.3B,nllb,1.3,40.9,35.6,40.1,50.9,40.9,41.7
nllb-200-3.3B,nllb,3.3,41.8,37.6,41.5,51.9,43.7,43.3
xglm-564M,xglm,0.6,7.1,6.5,10.4,18.0,5.4,9.5
xglm-1.7B,xglm,1.7,18.5,18.1,20.3,28.3,17.1,20.5
xglm-2.9B,xglm,2.9,23.8,24.1,24.1,33.1,23.5,25.7
xglm-7.5B,xglm,7.5,29.0,28.4,28.8,37.0,28.3,30.3
bloom-560m,bloom,0.6,7.9,4.8,11.8,16.2,5.4,9.2
bloom-1b7,bloom,1.7,17.3,10.5,14.9,27.2,12.6,16.5


\begin{tabular}{lrrrrrrr}
\toprule
model &  size &  xstory\_cloze &  xcopa &  xnli &  paws-x &  mgsm &  avg \\
\midrule
 nllb &   0.6 &          38.0 &   32.1 &  38.0 &    49.0 &  32.1 & 37.8 \\
 nllb &   1.3 &          40.6 &   36.6 &  40.3 &    51.3 &  41.3 & 42.0 \\
 nllb &   1.3 &          40.9 &   35.6 &  40.1 &    50.9 &  40.9 & 41.7 \\
 nllb &   3.3 &          41.8 &   37.6 &  41.5 &    51.9 &  43.7 & 43.3 \\
 xglm &   0.6 &           7.1 &    6.5 &  10.4 &    18.0 &   5.4 &  9.5 \\
 xglm &   1.7 &          18.5 &   18.1 &  20.3 &    28.3 &  17.1 & 20.5 \\
 xglm &   2.9 &          23.8 &   24.1 &  24.1 &    33.1 &  23.5 & 25.7 \\
 xglm &   7.5 &          29.0 &   28.4 &  28.8 &    37.0 &  28.3 & 30.3 \\
bloom &   0.6 &           7.9 &    4.8 &  11.8 &    16.2 &   5.4 &  9.2 \\
bloom &   1.7 &          17.3 &   10.5 &  14.9 &    27.2 &  12.6 & 16.5 \\
bloom &   3.0 &          20.2 &   13.0 &  17.1 &    31.1 &  20.3 & 20.3 \\
bloom &   7.1 &          25.2 &   16.5 &  21.4 &    36.

  print(df_avg[average].to_latex(index=False))
