In [1]:
import pandas as pd
import numpy as np
import os
import json
from utils.read_conllu import Data

In [2]:
def load_json(path):
    """Read a UTF-8 JSON file and build a ``Data`` object from its contents.

    Parameters
    ----------
    path : path of the JSON file to open.

    Returns
    -------
    Whatever ``Data.from_json`` returns for the parsed JSON content.
    """
    with open(path, encoding="utf-8") as handle:
        parsed = json.load(handle)
    return Data.from_json(parsed)

In [3]:
def get_results_df(lang_data, mlp_path, sgd_path):
    """Combine language metadata with the MLP and SGD score tables.

    Parameters
    ----------
    lang_data : table providing the "language", "family" and "script" columns.
    mlp_path : tab-separated MLP results (anything ``pandas.read_csv`` accepts).
    sgd_path : tab-separated SGD results (anything ``pandas.read_csv`` accepts).

    Returns
    -------
    pandas.DataFrame
        One row per language found in all three inputs (the merges are inner
        joins on "Language"), sorted by language name, with the metadata
        columns first, then the SGD scores, then the MLP scores.
    """
    def read_scores(source):
        # The first column of a results file is its index. Underscores in the
        # language names are turned into spaces so they can be matched
        # against the names in the metadata table.
        scores = pd.read_csv(source, sep="\t", index_col=0)
        scores["Language"] = scores["Language"].str.replace("_", " ", regex=False)
        return scores

    sgd_columns = [
        "Language",
        "SGD Accuracy",
        "SGD Balanced Accuracy",
        "SGD Sensitivity",
        "Majority Baseline Accuracy",
        "Random Baseline Accuracy",
        "#Classes",
    ]
    mlp_columns = [
        "Language",
        "MLP Accuracy",
        "MLP Balanced Accuracy",
        "MLP Sensitivity",
        "Random Baseline Accuracy",
    ]
    final_columns = [
        "Language",
        "Family",
        "Script",
        "#Classes",
        "Majority Baseline Accuracy",
        "Random Baseline Accuracy SGD",
        "SGD Accuracy",
        "SGD Balanced Accuracy",
        "SGD Sensitivity",
        "Random Baseline Accuracy MLP",
        "MLP Accuracy",
        "MLP Balanced Accuracy",
        "MLP Sensitivity",
    ]

    mlp_scores = read_scores(mlp_path)
    sgd_scores = read_scores(sgd_path)
    # Number of classes = number of comma-separated entries in "Classes".
    sgd_scores["#Classes"] = sgd_scores["Classes"].str.split(",").str.len()

    # Basic per-language information taken from the metadata table.
    info = pd.DataFrame({
        "Language": lang_data["language"],
        "Family": lang_data["family"],
        "Script": lang_data["script"],
    })

    # "Random Baseline Accuracy" exists in both score tables; the second merge
    # disambiguates the two copies with the " SGD" / " MLP" suffixes.
    merged = (
        info
        .merge(sgd_scores[sgd_columns], on="Language")
        .merge(mlp_scores[mlp_columns], on="Language", suffixes=[" SGD", " MLP"])
    )
    return merged[final_columns].sort_values(by="Language")

In [4]:
def _escape_latex_header(name):
    """Backslash-escape ``#``, ``%``, ``&`` and ``_`` in a column label (idempotent)."""
    if not isinstance(name, str):
        return name
    for char in "#%&_":
        # Drop an existing escape first so already-escaped labels are not doubled.
        name = name.replace("\\" + char, char).replace(char, "\\" + char)
    return name


def format_latex(lang_data, results_df, marked_langs):
    """Return a copy of ``results_df`` prepared for an unescaped LaTeX export.

    Parameters
    ----------
    lang_data : table with a "language" column and a boolean
        "bert training data" column.
    results_df : results table with a "Language" column; it is not modified.
    marked_langs : iterable of language names to flag with a trailing ``*``.
        Names may be spelled with underscores or with spaces
        ("Old_French" and "Old French" both match "Old French").

    Returns
    -------
    pandas.DataFrame
        Copy of ``results_df`` in which
        * languages listed in ``marked_langs`` get a ``*`` appended,
        * languages NOT flagged in "bert training data" are wrapped in
          ``\\textbf{...}``,
        * LaTeX special characters in the column labels are escaped, so a
          header such as "#Classes" stays valid when the table is written
          with ``to_latex(escape=False)``.
    """
    latex_df = results_df.copy(deep=True)
    original_lang_str = results_df["Language"]
    in_bert_langs = lang_data[lang_data["bert training data"]]["language"]

    # Accept both spellings of each marked name: "Language" in the results
    # uses spaces, while names that come from directory or file names may
    # still carry underscores and would otherwise never match.
    marked_raw = list(marked_langs)
    marked = set(marked_raw)
    marked.update(
        name.replace("_", " ") for name in marked_raw if isinstance(name, str)
    )

    # Step 1: star the marked languages.
    latex_df["Language"] = np.where(
        original_lang_str.isin(list(marked)),
        original_lang_str.astype(str) + "*",
        original_lang_str,
    )
    # Step 2: bold every language that is not part of the BERT training data.
    latex_df["Language"] = np.where(
        original_lang_str.isin(in_bert_langs),
        latex_df["Language"],
        r"\textbf{" + latex_df["Language"].astype(str) + "}",
    )
    # Step 3: the export runs with escaping disabled (the \textbf markup must
    # survive), so the headers have to be escaped here instead.
    return latex_df.rename(columns=_escape_latex_header)
    

In [5]:
# Collect every entry of the data directory whose preprocessed training
# split (the result of `train()`) has a length below 500.
data_dir = "preprocessed"
langs = os.listdir(data_dir)
less_than_500 = [
    lang
    for lang in langs
    if len(load_json(os.path.join(data_dir, lang, "preprocessed.json")).train()) < 500
]

In [6]:
# Read in language data (tab-separated). The rest of the notebook reads the
# columns "language", "family", "script" and "bert training data" from it.
lang_data = pd.read_csv("languages.tsv", sep="\t")

## POS Tagging Experiments

In [7]:
# Output locations for the UPOS results: the merged table as TSV and the
# LaTeX-formatted table.
out_tsv = "results/upos.tsv"
out_latex = "results/upos.tex"

In [8]:
# Merge the MLP and SGD result files for UPOS into a single table.
results_df = get_results_df(lang_data, "results/upos-MLP.tsv", "results/upos-SGD.tsv")

# LaTeX-oriented copy with the language names decorated by format_latex.
latex_df = format_latex(lang_data, results_df, less_than_500)

# Write the LaTeX table first (values rounded to 3 decimals, no index column,
# no escaping), then the unrounded table as TSV.
latex_df.round(3).to_latex(out_latex, index=False, escape=False)
results_df.to_csv(out_tsv, sep="\t")

  latex_df.round(decimals=3).to_latex(out_latex, index=False, escape=False)


## Tense

In [9]:
# Output locations for the tense results: the merged table as TSV and the
# LaTeX-formatted table.
out_tsv = "results/tense.tsv"
out_latex = "results/tense.tex"

In [10]:
# Merge the MLP and SGD result files for tense into a single table.
results_df = get_results_df(lang_data, "results/tense-MLP.tsv", "results/tense-SGD.tsv")

# LaTeX-oriented copy with the language names decorated by format_latex.
latex_df = format_latex(lang_data, results_df, less_than_500)

# Write the LaTeX table first (values rounded to 3 decimals, no index column,
# no escaping), then the unrounded table as TSV.
latex_df.round(3).to_latex(out_latex, index=False, escape=False)
results_df.to_csv(out_tsv, sep="\t")

  latex_df.round(decimals=3).to_latex(out_latex, index=False, escape=False)


## Case

In [11]:
# Output locations for the case results: the merged table as TSV and the
# LaTeX-formatted table.
out_tsv = "results/case.tsv"
out_latex = "results/case.tex"

In [12]:
# Merge the MLP and SGD result files for case into a single table.
results_df = get_results_df(lang_data, "results/case-MLP.tsv", "results/case-SGD.tsv")

# LaTeX-oriented copy with the language names decorated by format_latex.
latex_df = format_latex(lang_data, results_df, less_than_500)

# Write the LaTeX table first (values rounded to 3 decimals, no index column,
# no escaping), then the unrounded table as TSV.
latex_df.round(3).to_latex(out_latex, index=False, escape=False)
results_df.to_csv(out_tsv, sep="\t")

  latex_df.round(decimals=3).to_latex(out_latex, index=False, escape=False)


## Gender

In [13]:
# Output locations for the gender results: the merged table as TSV and the
# LaTeX-formatted table.
out_tsv = "results/gender.tsv"
out_latex = "results/gender.tex"

In [14]:
# Merge the MLP and SGD result files for gender into a single table.
results_df = get_results_df(lang_data, "results/gender-MLP.tsv", "results/gender-SGD.tsv")

# LaTeX-oriented copy with the language names decorated by format_latex.
latex_df = format_latex(lang_data, results_df, less_than_500)

# Write the LaTeX table first (values rounded to 3 decimals, no index column,
# no escaping), then the unrounded table as TSV.
latex_df.round(3).to_latex(out_latex, index=False, escape=False)
results_df.to_csv(out_tsv, sep="\t")

  latex_df.round(decimals=3).to_latex(out_latex, index=False, escape=False)


## Number

In [15]:
# Output locations for the number results: the merged table as TSV and the
# LaTeX-formatted table.
out_tsv = "results/number.tsv"
out_latex = "results/number.tex"

In [16]:
# Merge the MLP and SGD result files for number into a single table.
results_df = get_results_df(lang_data, "results/number-MLP.tsv", "results/number-SGD.tsv")

# LaTeX-oriented copy with the language names decorated by format_latex.
latex_df = format_latex(lang_data, results_df, less_than_500)

# Write the LaTeX table first (values rounded to 3 decimals, no index column,
# no escaping), then the unrounded table as TSV.
latex_df.round(3).to_latex(out_latex, index=False, escape=False)
results_df.to_csv(out_tsv, sep="\t")

  latex_df.round(decimals=3).to_latex(out_latex, index=False, escape=False)
