In [517]:
from dataclasses import dataclass
from typing import List
import pandas as pd


@dataclass
class WandbSource:
    """Describes a set of runs whose results are fetched from Weights & Biases."""

    # Project-type suffix; get_runs passes it to get_project_name as `ml_type`.
    project: str
    # Display names of the runs to look up in the project.
    runs: List[str]
    # Split label attached to the source; not read by any code visible in this notebook.
    split: str


@dataclass
class DFSource:
    """Describes a set of result rows taken from an in-memory DataFrame."""

    # Label for the source; not read by any code visible in this notebook.
    name: str
    # Results table; get_df_runs selects `data[task]` and then rows by label.
    data: pd.DataFrame
    # Row labels of `data` to include.
    runs: List[str]

In [518]:
# Pick up edits to imported project modules without restarting the kernel.
# NOTE(review): re-running this cell prints the "already loaded" notice shown below.
%load_ext autoreload
%autoreload 2
# Task names used as keys for predictions, labels and table columns in this notebook.
tasks = ["server", "category", "authors_cum_gender", "day_of_week"]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [519]:
from pathlib import Path
from human_extract import get_humans_predictions, get_excel_features, get_preds_f1, get_true
# Human annotator predictions read from ../HUMAN_EVAL.
# NOTE(review): the exact meaning of the helper arguments lives in human_extract — confirm there.
predictions_humans = get_humans_predictions(Path("../HUMAN_EVAL"), "True.csv", get_excel_features( "../HUMAN_EVAL/True.csv"))
# One get_true(task) result per task; presumably the reference labels — TODO confirm.
true = {task: get_true(task) for task in tasks}
human_f1 = get_preds_f1(predictions_humans, true)
# Append a "Human_Average" row holding the mean of every column over the existing rows.
human_f1.loc["Human_Average"] = human_f1.mean()


{'server': [0, 1, 2, 3, 4, 5], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20]}
{'server': [0, 1, 2, 3, 4, 5], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20]}
{'server': [0, 1, 2, 3, 4, 5], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20]}
{'server': [0, 1, 2, 3, 4, 5], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20]}
{'server': [0, 1, 2, 3, 4, 5], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20]}
{'server': [0, 1, 2, 3, 4, 5], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6], 'category': [0, 1, 2, 3, 6, 7, 9,

In [520]:
# final
import pickle
import torch
def _load_pickled_predictions(path):
    """Load one pickled predictions file and close the file handle afterwards."""
    # NOTE(review): pickle can execute arbitrary code while loading; only read
    # files that were produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Raw model outputs stored per task for the human-evaluation split.
predictions_final = {
    task: _load_pickled_predictions(f"../predictions/test_human/{task}_predictions.pkl")
    for task in tasks
}

# Concatenate the stored chunks and keep the arg-max class index per example.
# presumably each chunk is a (batch, n_classes) score tensor — TODO confirm
predictions_final = {task: torch.cat(pred).argmax(dim=1).tolist() for task, pred in predictions_final.items()}

final_f1 = get_preds_f1({"final": predictions_final}, true)


{'server': [0, 1, 2, 3, 4, 5], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6]}
{'server': [0, 1, 2, 3, 4, 5], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6]}


In [521]:
from datasets import load_dataset
import numpy as np
from functools import cache
from human_extract import get_f1

@cache
def get_trivial_baseline_predictions(task, split):
    """Return majority-class predictions for `task` on `split`.

    The most frequent label is taken from the "train" split after dropping
    label 0 and shifting the remaining labels down by one. The result is that
    label repeated once per example of `split` whose label is not 0.
    Results are memoised per (task, split) by `functools.cache`.
    """
    train_labels = np.array(load_dataset("hynky/czech_news_dataset", split="train")[task])
    # Label 0 is excluded and the rest re-based to start at 0.
    train_labels = train_labels[train_labels != 0] - 1
    majority_label = np.bincount(train_labels).argmax()

    eval_labels = np.array(load_dataset("hynky/czech_news_dataset", split=split)[task])
    # Only the number of kept (non-zero) labels matters for the output length.
    kept = eval_labels[eval_labels != 0]
    return [majority_label] * len(kept)


def get_trivial_baseline_f1(splits, tasks):
    """Build a table of macro/micro F1 scores of the majority-class baseline.

    Args:
        splits: dataset split names to evaluate.
        tasks: task names; they form the first level of the column index.

    Returns:
        A DataFrame with one row per split, labelled ``"{split}_trivial"``, and
        two-level columns ``(task, "{split}/f1_macro" | "{split}/f1_micro")``.
        Each row is labelled with its own split; with several splits the
        columns are the union over splits and cells of other splits are NaN.
        An empty `splits` yields an empty DataFrame.
    """
    frames = []
    for split in splits:
        preds = {task: get_trivial_baseline_predictions(task, split) for task in tasks}
        labels = {task: get_true(task, split=split) for task in tasks}
        f1_macro = get_f1(preds, labels, split=split)
        f1_micro = get_f1(preds, labels, split=split, average="micro")
        # Interleave as macro, micro, macro, micro, ... to line up with the columns.
        # assumes the values of both dicts follow the order of `tasks` — TODO confirm
        scores = [score for pair in zip(f1_macro.values(), f1_micro.values()) for score in pair]
        # Columns are built inside the loop so each row carries its own split name.
        columns = pd.MultiIndex.from_product([tasks, [f"{split}/f1_macro", f"{split}/f1_micro"]])
        frames.append(pd.DataFrame([scores], columns=columns, index=[f"{split}_trivial"]))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)










In [522]:
# Majority-class baseline scores, one single-row table per evaluation split.
trivial_f1_small = get_trivial_baseline_f1(["test_small"], tasks)
trivial_f1 = get_trivial_baseline_f1(["test"], tasks)
trivial_f1_human = get_trivial_baseline_f1(["test_human"], tasks)

Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934

{'server': [0, 1, 2, 3, 4, 5], 'category': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6]}
{'server': [0, 1, 2, 3, 4, 5], 'category': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6]}


Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934

{'server': [0, 1, 2, 3, 4, 5], 'category': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6]}
{'server': [0, 1, 2, 3, 4, 5], 'category': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6]}


Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (/home/kydliceh/.cache/huggingface/datasets/hynky___parquet/hynky--czech_news_dataset-7dfdf4ade67b74c3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934

{'server': [0, 1, 2, 3, 4, 5], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6]}
{'server': [0, 1, 2, 3, 4, 5], 'category': [0, 1, 2, 3, 6, 7, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20], 'authors_cum_gender': [0, 1, 2], 'day_of_week': [0, 1, 2, 3, 4, 5, 6]}


In [523]:
trivial_f1

Unnamed: 0_level_0,server,server,category,category,authors_cum_gender,authors_cum_gender,day_of_week,day_of_week
Unnamed: 0_level_1,test/f1_macro,test/f1_micro,test/f1_macro,test/f1_micro,test/f1_macro,test/f1_micro,test/f1_macro,test/f1_micro
test_trivial,0.105289,0.461706,0.021979,0.378801,0.258317,0.632588,0.041051,0.167784


In [524]:
final_f1

Unnamed: 0_level_0,server,server,category,category,authors_cum_gender,authors_cum_gender,day_of_week,day_of_week
Unnamed: 0_level_1,test_human/f1_macro,test_human/f1_micro,test_human/f1_macro,test_human/f1_micro,test_human/f1_macro,test_human/f1_micro,test_human/f1_macro,test_human/f1_micro
final,0.712198,0.8,0.520437,0.795918,0.527895,0.79,0.283651,0.29


In [525]:
from gpt_extract import get_predictions, get_expected
from sklearn.metrics import f1_score
import pandas as pd

# Model answers and reference labels are both read from the same results file.
gpt_predictions = get_predictions("../GPT3/gpt3-results.jsonl")
gpt_expected = get_expected("../GPT3/gpt3-results.jsonl")

# Macro- and micro-averaged F1 per task, keyed by (task, metric name) so the
# resulting frame gets two-level columns.
metrics = {}
for task in tasks:
    expected, predicted = gpt_expected[task], gpt_predictions[task]
    for key, average in (("test_small/f1_macro", "macro"), ("test_small/f1_micro", "micro")):
        metrics[(task, key)] = f1_score(expected, predicted, average=average)

gpt_f1 = pd.DataFrame(metrics, index=["gpt3"])

In [526]:
gpt_f1

Unnamed: 0_level_0,authors_cum_gender,authors_cum_gender,server,server,day_of_week,day_of_week,category,category
Unnamed: 0_level_1,test_small/f1_macro,test_small/f1_micro,test_small/f1_macro,test_small/f1_micro,test_small/f1_macro,test_small/f1_micro,test_small/f1_macro,test_small/f1_micro
gpt3,0.429163,0.7028,0.672993,0.7312,0.194891,0.2083,0.447623,0.7521


In [527]:

import wandb
import numpy as np

def get_dict_result(run_name, metrics: dict, metric):
    """Look up `metric` in a run's metrics mapping.

    The plain metric name is tried first, then the same name with an
    ``"_epoch"`` suffix. If neither key exists a warning naming the suffixed
    key and `run_name` is printed and ``None`` is returned.
    """
    if metric in metrics:
        return metrics[metric]

    # Fall back to the "_epoch" variant of the key.
    epoch_key = metric + "_epoch"
    if epoch_key in metrics:
        return metrics[epoch_key]

    print(f"WARNING: {epoch_key} not found in {run_name}")
    return None

def get_df_result(row, metric):
    """Return the value stored under `metric` in a pandas row.

    When the label selects several entries (duplicate or multi-level labels)
    the last one is returned; when it selects a single scalar that scalar is
    returned as-is. Nothing is printed.
    """
    value = row[metric]
    # A unique label yields a scalar, which has no `.iloc`.
    if isinstance(value, (pd.Series, pd.DataFrame)):
        return value.iloc[-1]
    return value


    
def get_results(runs: dict, metric):
    """Collect the value of `metric` for every run, in the order of `runs`.

    Each value of `runs` may be a dict (handled by get_dict_result), a
    pandas Series (handled by get_df_result) or ``None`` (reported as NaN).
    Any other type raises ValueError. The collected list is printed and
    returned.
    """
    def _extract(run_name, metrics):
        if isinstance(metrics, dict):
            return get_dict_result(run_name, metrics, metric)
        if isinstance(metrics, pd.Series):
            return get_df_result(metrics, metric)
        if metrics is None:
            # A run that was not found contributes a missing value.
            return np.nan
        raise ValueError(f"Unknown type {type(metrics)}")

    results = [_extract(run_name, metrics) for run_name, metrics in runs.items()]
    print(results)
    return results

def get_project_name(task, ml_type):
    """Return the project path ``hynky/<Task>-<ml_type>`` with `task` capitalized via str.capitalize()."""
    return f"hynky/{task.capitalize()}-{ml_type}"


def get_wandb_runs(names, project_name):
    """Fetch the summary of each named run from a W&B project.

    Returns a dict with one entry per element of `names`, in that order.
    The value is the run summary as a dict, or ``None`` when no run with that
    display name was returned. A warning listing the missing names is printed
    when the number of runs found differs from the number requested.
    """
    api = wandb.Api()
    # Match any run whose display name is one of the requested names.
    name_filter = {"$or": [{"display_name": name} for name in names]}
    found = list(api.runs(project_name, filters=name_filter))
    summaries = {run.display_name: dict(run.summary) for run in found}

    if len(summaries) != len(names):
        print(f"WARNING: {len(summaries)} runs found, but {len(names)} expected for {project_name}")
        missing = set(names) - set(summaries.keys())
        print(f"MISSING: {missing}")

    # Re-key in the order of `names`; runs that were not found map to None.
    return {name: summaries.get(name) for name in names}

def get_df_runs(names, df, task):
    """Return ``{name: row-as-dict}`` for the rows `names` of the `task` column group of `df`."""
    task_frame = df[task]
    rows = {}
    for name in names:
        rows[name] = task_frame.loc[name].to_dict()
    return rows


def get_runs(source, task):
    """Return ``{run name: metrics}`` for `task` from a WandbSource or DFSource.

    Raises:
        ValueError: if `source` is neither a WandbSource nor a DFSource.
            Previously such a source silently produced ``None``.
    """
    if isinstance(source, WandbSource):
        return get_wandb_runs(source.runs, get_project_name(task, source.project))
    if isinstance(source, DFSource):
        return get_df_runs(source.runs, source.data, task)
    raise ValueError(f"Unknown source type {type(source)}")
    

        




In [580]:
import pandas as pd


# Split-qualified metric key -> column label used in the result tables.
# create_table indexes this dict directly, so a metric missing here raises KeyError.
metrics_dict = {
    "test/f1_micro": "F1-micro",
    "test/f1_macro": "F1-macro",
    "test_small/f1_micro": "F1-micro",
    "test_small/f1_macro": "F1-macro",
    "test_human/f1_micro": "F1-micro",
    "test_human/f1_macro": "F1-macro",
}

# Run / row name -> display name of the model; names not listed are shown unchanged.
models_dict = {
    "RobeCzech-Base": "R-Base",
    "RobeCzech-Short": "R-Short",
    "Fernet-Base": "F-Base",
    "Fernet-Short": "F-Short",
    "test_small_trivial": "Baseline",
    "test_trivial": "Baseline",
    "test_human_trivial": "Baseline",
    "Human_Average": "Human",
    "gpt3": "GPT-3",
    "final": "Final",

}

# Task key -> display name of the task; keys not listed are shown unchanged.
task_dict = {
    "server": "Server",
    "category": "Category",
    "authors_cum_gender": "Gender",
    "day_of_week": "Day of week",
}

def create_table(projs, tasks, metrics):
    """Assemble a results table with one row per model and (task, metric) columns.

    For every task the runs of all sources in `projs` are merged by run name
    (a later source overrides an earlier one with the same name), and each
    metric in `metrics` contributes one column labelled via `task_dict` and
    `metrics_dict`. The row labels come from the runs of the last task,
    translated through `models_dict`, and are set as the "Model" index.
    """
    table_columns = {}
    model_names = []
    for task in tasks:
        task_label = task_dict.get(task, task)

        merged_runs = {}
        for source_runs in [get_runs(source, task) for source in projs]:
            merged_runs.update(source_runs)

        model_names = [models_dict.get(run_name, run_name) for run_name in merged_runs]
        for metric in metrics:
            column_key = (task_label, metrics_dict[metric])
            table_columns[column_key] = get_results(merged_runs, metric)

    # Tuple keys turn into a two-level column index.
    table = pd.DataFrame(table_columns)
    table["Model"] = model_names
    return table.set_index("Model")
    

In [586]:

# Sources that make up each results table, keyed by table name.
# Each list is handed to create_table, which merges the runs of its sources by
# run name, so a name listed in two sources (e.g. "RobeCzech-Base" under
# "basic") ends up as a single row.
tables = {
    # No sources yet; create_table yields a table without rows for this entry.
    "tuning":[
    ],
    "basic": [
        DFSource("Trivial-Baseline", trivial_f1, ["test_trivial"]),
        WandbSource("ML", ["LR-50", "LR-200"], split="test"),
        WandbSource("Deep-Learning", ["RobeCzech-Base", "Fernet-Base", "RobeCzech-Short"], split="test"),
        WandbSource("Deep-Learning",["RobeCzech-Base", "Truncate", "LM-tune", "Grad-12", "Grad-24"], split="test"),
        WandbSource("Deep-Learning",["Final"], split="test")
    ],
    "short": [
        DFSource("Trivial-Baseline", trivial_f1_small , ["test_small_trivial"]),
        WandbSource("Deep-Learning", ["RobeCzech-Base", "RobeCzech-Short", "Fernet-Short"], split="test_small"),
        DFSource("GPT-3", gpt_f1, ["gpt3"]),
    ],
    "human": [
        DFSource("Trivial-Baseline", trivial_f1_human , ["test_human_trivial"]),
        DFSource("Human", human_f1, ["Human_Average"]),
        DFSource("Final", final_f1, ["final"]),
    ]
}

In [587]:
# Build the "human" comparison table from the test_human metrics only.
metrics = ["test_human/f1_macro", "test_human/f1_micro"]
# NOTE: this re-binds the notebook-wide `tasks` list defined in an earlier cell.
tasks = ["server", "category", "authors_cum_gender", "day_of_week"]
tb = create_table(tables["human"], tasks, metrics)

[0.09523809523809525, 0.2960973134072549, 0.7121976609645749]
[0.4000000000000001, 0.3333333333333333, 0.8000000000000002]
[0.028846153846153844, 0.4255302732421377, 0.5204365381271194]
[0.3, 0.6142734085623224, 0.7959183673469388]
[0.24203821656050958, 0.5193677455741007, 0.5278945892907205]
[0.57, 0.6033333333333334, 0.79]
[0.037267080745341616, 0.11789189687403472, 0.2836507511071971]
[0.15, 0.12333333333333334, 0.29]


In [588]:
import numpy as np
# convert float to percentage with 2 decimal places
def as_latex(tb):
    # fillna not working for some reason
    # highlight best results in column by bolding
    tb = tb.apply(lambda x: x.apply(lambda y: "\\textbf{" + "{:.2f}".format(y*100)  + "}" if y == x.max() else "{:.2f}".format(y*100)))
    tb = tb.replace(np.nan, "-", regex=True)
    tb = tb.replace("nan", "-", regex=True)
    return tb.to_latex(escape=False)

    







In [589]:
# Each table reports the metrics of one split: "human" uses test_human,
# "short" uses test_small and every other table uses test.
split_by_table = {"human": "test_human", "short": "test_small"}

tex_tables = []
for table in tables:
    split = split_by_table.get(table, "test")
    metrics = [f"{split}/{metric}" for metric in ["f1_macro", "f1_micro"]]

    tb = create_table(tables[table], tasks, metrics)
    tex_tables.append(as_latex(tb))

# Print the LaTeX source of every table, in the order of `tables`.
for table in tex_tables:
    print(table)

[]
[]
[]
[]
[]
[]
[]
[]


  return tb.to_latex(escape=False)


[0.10528931487397941, 0.36919441638172473, 0.3727358682336947, 0.6974121332168579, 0.6938974857330322, 0.5948003530502319, 0.687109112739563, 0.7005615234375, 0.6780757904052734, 0.6907496452331543, 0.7103885412216187]
[0.46170610211706103, 0.5277577505407354, 0.533787769548404, 0.7819361686706543, 0.7768155336380005, 0.6785491704940796, 0.7731205224990845, 0.7840335369110107, 0.7636330723762512, 0.7769466638565063, 0.792488694190979]
[0.021978563942684527, 0.3330021459068112, 0.3277034028310185, 0.5435124635696411, 0.5396564602851868, 0.3655204474925995, 0.5389659404754639, 0.5518217086791992, 0.5192826986312866, 0.5321670770645142, 0.5605653524398804]
[0.3788007577229454, 0.7232489799883427, 0.7268797357684088, 0.7966898083686829, 0.7955119609832764, 0.7745652794837952, 0.793726921081543, 0.8013527393341064, 0.7868661284446716, 0.789707601070404, 0.8046920299530029]
MISSING: {'Fernet-Base'}
[0.25831715019910156, 0.43624629590607594, 0.4405715709235813, 0.5118262767791748, nan, 0.4496

  return tb.to_latex(escape=False)


[0.08981102669004482, 0.7842721343040466, 0.7511593103408813, 0.08981102705001831, 0.6729932163405395]
[0.3688, 0.8288000226020813, 0.8036999702453613, 0.36880001425743103, 0.7312]
[0.0200793948018875, 0.5617374777793884, 0.3788470327854157, 0.3930864930152893, 0.44762313786172636]
[0.3351, 0.8065000176429749, 0.7768999934196472, 0.7788000106811523, 0.7520999999999999]
MISSING: {'Fernet-Short'}
[0.2663624354509427, 0.5238218903541565, 0.4745028018951416, nan, 0.4291626476848629]
[0.6654, 0.7560999989509583, 0.7429999709129333, nan, 0.7028]
[0.041430525698651556, 0.2796197533607483, 0.1740679144859314, 0.1768307983875275, 0.19489117748111326]
[0.1696, 0.2833000123500824, 0.19820000231266025, 0.19200000166893005, 0.2083]
[0.09523809523809525, 0.2960973134072549, 0.7121976609645749]
[0.4000000000000001, 0.3333333333333333, 0.8000000000000002]
[0.028846153846153844, 0.4255302732421377, 0.5204365381271194]
[0.3, 0.6142734085623224, 0.7959183673469388]
[0.24203821656050958, 0.519367745574100

  return tb.to_latex(escape=False)
  return tb.to_latex(escape=False)
