In [84]:
import sys
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

sys.path.append("..")
from data_preparation.data_preparation_pos import read_conll
import utils.utils as utils
import utils.pos_utils as pos_utils

### PoS

In [89]:
def pos_baseline(file_path, lang_name, lang_code, dataset, baselines):
    """Append the majority-tag accuracy baseline for one PoS test file.

    Only files whose ``dataset`` is "test" are scored; for any other split the
    accumulator is handed back unchanged.

    Args:
        file_path: Path handed to ``read_conll``.
        lang_name: Language name stored next to the score.
        lang_code: Not used in the body; kept so the signature is unchanged.
        dataset: Split name; only "test" triggers a computation.
        baselines: List of ``(lang_name, accuracy)`` tuples, extended in place.

    Returns:
        The same ``baselines`` list that was passed in.

    Raises:
        ValueError: If the file yields no tags at all.
    """
    if dataset != "test":
        # Return even if it's unaltered, otherwise it gets overwritten by None
        return baselines

    conllu_data = read_conll(file_path)
    # Element 2 of the parsed data is iterated as a sequence of tag lists;
    # count every tag in a single pass instead of re-scanning per distinct tag.
    tag_counts = Counter(tag for taglist in conllu_data[2] for tag in taglist)
    if not tag_counts:
        raise ValueError(f"No PoS tags found in {file_path!r}")

    # Accuracy will be the relative frequency of the majority tag
    acc = max(tag_counts.values()) / sum(tag_counts.values())
    baselines.append((lang_name, acc))
    return baselines

In [90]:
# Collect the PoS baselines: the project helper is given the UD data directory,
# the per-file callback and an empty list as the starting accumulator.
# NOTE(review): presumably the helper calls pos_baseline once per file and keeps
# its return value as the new accumulator - confirm in utils.run_through_data.
baselines = utils.run_through_data("../data/ud/", pos_baseline, [])

HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [91]:
# Build the table straight from the (language, accuracy) records. Wrapping the
# tuples in np.array first would coerce every value, accuracies included, to a
# string and would also fail outright on an empty result list.
baselines = pd.DataFrame(baselines, columns=["Language", "Accuracy"])
# Kept as a guard so the column is numeric regardless of how the records arrived.
baselines["Accuracy"] = pd.to_numeric(baselines["Accuracy"])
baselines = utils.order_table(baselines)
baselines

Unnamed: 0,Language,Accuracy
0,Bulgarian,0.221699
1,English,0.190719
2,Russian,0.253061
3,Slovak,0.252149
4,Croatian,0.254246
5,Chinese,0.252627
6,Vietnamese,0.321037
7,Thai,0.271123
8,Finnish,0.26689
9,Basque,0.248503


In [92]:
pos_baselines_path = "../results/baselines_pos.xlsx"

# Write one worksheet per metric: every column after the first becomes a sheet
# named after the metric, holding the language and that metric's values under
# the generic header "Baseline".
with pd.ExcelWriter(pos_baselines_path) as writer:
    for metric in baselines.columns[1:]:
        sheet = baselines[["Language", metric]]
        sheet = sheet.rename(columns={metric: "Baseline"})
        sheet.to_excel(writer, index=False, sheet_name=metric)

### Sentiment

In [102]:
def sentiment_baseline(file_path, lang_name, lang_code, dataset, baselines):
    """Append majority-class sentiment scores for one test file.

    Files from any split other than "test" are ignored and the accumulator is
    returned as-is. For a test file, every example is predicted as the first
    value reported by ``Series.mode()`` on the sentiment column, and accuracy
    plus macro-averaged precision, recall and F1 are recorded.

    Args:
        file_path: Header-less CSV read with pandas; its columns are labelled
            "sentiment" and "review".
        lang_name: Language name stored as the first tuple element.
        lang_code: Not used in the body.
        dataset: Split name; only "test" is scored.
        baselines: List extended in place with
            ``(lang_name, acc, precision, recall, f1)``.

    Returns:
        The same ``baselines`` list that was passed in.
    """
    if dataset != "test":
        return baselines

    reviews = pd.read_csv(file_path, header=None)
    reviews.columns = ["sentiment", "review"]

    gold = reviews["sentiment"].values
    # Prediction will be the majority class, repeated once per example
    majority_label = reviews["sentiment"].mode()[0]
    predicted = [majority_label] * len(gold)

    scores = (
        accuracy_score(gold, predicted),
        # zero_division=0: classes that are never predicted score 0 precision
        precision_score(gold, predicted, average="macro", zero_division=0),
        recall_score(gold, predicted, average="macro"),
        f1_score(gold, predicted, average="macro"),
    )
    baselines.append((lang_name, *scores))
    return baselines

In [103]:
# Collect the sentiment baselines: the project helper is given the sentiment
# data directory, the per-file callback and an empty list as the starting
# accumulator.
# NOTE(review): presumably the helper calls sentiment_baseline once per file and
# keeps its return value as the new accumulator - confirm in utils.run_through_data.
baselines = utils.run_through_data("../data/sentiment/", sentiment_baseline, [])

HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))




In [104]:
sentiment_metric_columns = ["Accuracy", "Macro_Precision", "Macro_Recall", "Macro_F1"]

# Build the table straight from the result tuples so the scores are never
# coerced to strings (which np.array does for mixed str/float records).
baselines = pd.DataFrame(baselines, columns=["Language", *sentiment_metric_columns])
# Cast by replacing the columns: assigning through .iloc[:, 1:] writes into the
# existing columns in place on current pandas and can leave them as object dtype.
baselines = baselines.astype({name: float for name in sentiment_metric_columns})
baselines = utils.order_table(baselines)
baselines

Unnamed: 0,Language,Accuracy,Macro_Precision,Macro_Recall,Macro_F1
0,Bulgarian,0.79737,0.398685,0.5,0.443632
1,English,0.500824,0.250412,0.5,0.333699
2,Russian,0.67128,0.33564,0.5,0.401656
3,Slovak,0.87688,0.43844,0.5,0.467201
4,Croatian,0.78032,0.39016,0.5,0.438303
5,Chinese,0.604571,0.302286,0.5,0.37678
6,Vietnamese,0.513869,0.256934,0.5,0.339441
7,Thai,0.59215,0.296075,0.5,0.371919
8,Finnish,0.745592,0.372796,0.5,0.427128
9,Basque,0.845815,0.422907,0.5,0.458234


In [105]:
sentiment_baselines_path = "../results/baselines_sentiment.xlsx"

# Write one worksheet per metric: every column after the first becomes a sheet
# named after the metric, holding the language and that metric's values under
# the generic header "Baseline".
with pd.ExcelWriter(sentiment_baselines_path) as writer:
    for metric in baselines.columns[1:]:
        sheet = baselines[["Language", metric]]
        sheet = sheet.rename(columns={metric: "Baseline"})
        sheet.to_excel(writer, index=False, sheet_name=metric)