In [1]:
from functools import partialmethod
from pathlib import Path
from typing import Tuple

from tqdm import tqdm

from src.data.utils import read_dataset
from src.data.preprocess import documents_to_sentence_annotation

# Monkey-patch tqdm's constructor so that every progress bar created after this
# point is built with disable=True; this keeps the notebook output free of bars.
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # disable progress bar

# Parent of the resolved current working directory; the datasets are later read
# from root_path / "data" / "raw".
# NOTE(review): assumes the kernel is started one level below the project root — confirm.
root_path = Path().resolve().parent

2023-05-24 12:50:23.695108: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-24 12:50:23.695144: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Corpus identifiers grouped by language. Both the language order and the
# order of the corpora inside each language are significant: the statistics
# table is printed by iterating this mapping as-is.
corpora = {
    "english": ["tempeval_3", "tcr", "ancient_time_english", "wikiwars", "ph_english"],
    "portuguese": ["timebankpt", "ph_portuguese"],
    "spanish": ["spanish_timebank", "traint3", "ancient_time_spanish", "ph_spanish"],
    "italian": ["narrative_container", "ancient_time_italian", "ph_italian"],
    "french": ["fr_timebank", "ancient_time_french", "ph_french"],
    "german": ["krauts", "wikiwars_de", "ancient_time_german", "ph_german"],
}

In [3]:
def get_sentences_timexs_count(documents) -> Tuple[int, int]:
    """Count the annotated sentences and the timexs found in ``documents``.

    The documents are passed through ``documents_to_sentence_annotation`` and
    the result is consumed as ``(sentence, timexs)`` pairs. Only sentences
    whose timex collection is non-empty are counted; sentences without any
    timex contribute to neither total.

    Args:
        documents: Collection of documents accepted by
            ``documents_to_sentence_annotation``.

    Returns:
        A ``(n_sentences, n_timexs)`` tuple: the number of sentences with at
        least one timex, and the total number of timexs in those sentences.
    """
    timexs_per_sentence = [
        len(timexs)
        for _, timexs in documents_to_sentence_annotation(documents)
        if timexs  # skip sentences that carry no timex at all
    ]
    return len(timexs_per_sentence), sum(timexs_per_sentence)

In [4]:
# Print a " & "-separated statistics table: a 30-character wide label column
# followed by nine 8-character wide value columns (docs / sentences / timexs
# for the train, validation and test splits).
header_rows = [
    [" ", "Train", " ", " ", "Validation", " ", " ", "Test", " ", " "],
    [" ", "#Docs", "#Sents", "#Timexs", "#Docs", "#Sents", "#Timexs", "#Docs", "#Sents", "#Timexs"],
]
for header in header_rows:
    label, *titles = header
    print(" & ".join([f"{label:<30}"] + [f"{title:<8}" for title in titles]))

for language, corpus_names in corpora.items():
    for corpus in corpus_names:
        train_docs, val_docs, test_docs = read_dataset(corpus, root_path / "data" / "raw")

        # Gather (#docs, #sents, #timexs) for each split, in column order.
        counts = []
        for split_docs in (train_docs, val_docs, test_docs):
            n_docs = len(split_docs)
            n_sents, n_tmxs = get_sentences_timexs_count(split_docs)
            counts += [n_docs, n_sents, n_tmxs]

        print(" & ".join([f"{corpus:<30}"] + [f"{count:<8}" for count in counts]))

                               & Train    &          &          & Validation &          &          & Test     &          &         
                               & #Docs    & #Sents   & #Timexs  & #Docs    & #Sents   & #Timexs  & #Docs    & #Sents   & #Timexs 
tempeval_3                     & 204      & 1183     & 1472     & 51       & 273      & 338      & 20       & 106      & 138     
tcr                            & 16       & 105      & 126      & 4        & 21       & 29       & 5        & 50       & 62      
ancient_time_english           & 3        & 94       & 142      & 1        & 92       & 125      & 1        & 31       & 39      
wikiwars                       & 16       & 1594     & 2166     & 2        & 90       & 117      & 4        & 265      & 357     
ph_english                     & 17743    & 128812   & 165385   & 1971     & 14488    & 18469    & 4928     & 35945    & 46307   
timebankpt                     & 130      & 733      & 911      & 32       & 141      & 