# HIPE 2022 dataset statistics

# Imports

In [None]:
import os
import random
from typing import List, Dict
from hipe_commons.helpers.tsv import parse_tsv, ENTITY_TYPES, HipeDocument, HipeEntity
from hipe_commons.stats import describe_dataset, compute_entities_stats

# Functions

In [None]:
def collect_entities(documents: List[HipeDocument]) -> Dict[str, List[HipeEntity]]:
    """Gather all entities from the documents of a dataset, grouped by entity type.

    Only the types listed in ``ENTITY_TYPES`` are considered. A type appears as
    a key in the result only if at least one document has an entry for it in
    its ``entities`` mapping.

    :param documents: Input documents in HIPE format
    :type documents: List[HipeDocument]
    :return: A dictionary mapping each entity type to the list of `HipeEntity`
        objects of that type, concatenated across all input documents
    :rtype: Dict[str, List[HipeEntity]]
    """
    all_entities = {}

    # Iterate over the function argument (not a notebook-level variable), so
    # the function works for whichever dataset is passed in.
    for doc in documents:
        for e_type in ENTITY_TYPES:
            if e_type in doc.entities:
                # Create the per-type list on first sight, then append this
                # document's entities of that type.
                all_entities.setdefault(e_type, []).extend(doc.entities[e_type])
    return all_entities

# HIPE 2022 Datasets

In [None]:
# Version tag of the HIPE 2022 data release; it is interpolated into the data
# directory path and into the TSV file names built from it.
RELEASE_VERSION = "v2.1"
# Root directory of the release data (relative path).
HIPE2022_data_path = f"../data/{RELEASE_VERSION}/"

## ajmc

See the [README file](./documentation/README-ajmc.md) for detailed information about this dataset.

### File paths

In [None]:
# EN
ajmc_train_en_path = os.path.join(HIPE2022_data_path, f"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-train-en.tsv")
ajmc_dev_en_path = os.path.join(HIPE2022_data_path, f"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-en.tsv")
ajmc_test_en_path = os.path.join(HIPE2022_data_path, f"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-test-en.tsv")

# DE
ajmc_train_de_path = os.path.join(HIPE2022_data_path, f"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-train-de.tsv")
ajmc_dev_de_path = os.path.join(HIPE2022_data_path, f"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-de.tsv")
ajmc_test_de_path = os.path.join(HIPE2022_data_path, f"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-test-de.tsv")

# FR
ajmc_train_fr_path = os.path.join(HIPE2022_data_path, f"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-train-fr.tsv")
ajmc_dev_fr_path = os.path.join(HIPE2022_data_path, f"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-fr.tsv")
ajmc_test_fr_path = os.path.join(HIPE2022_data_path, f"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-test-fr.tsv")

### ajmc EN

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_dev_en_docs = parse_tsv(file_path=ajmc_dev_en_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_dev_en_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_train_en_docs = parse_tsv(file_path=ajmc_train_en_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_train_en_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_test_en_docs = parse_tsv(file_path=ajmc_test_en_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_test_en_docs))

### ajmc DE

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_dev_de_docs = parse_tsv(file_path=ajmc_dev_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_dev_de_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_train_de_docs = parse_tsv(file_path=ajmc_train_de_path)


In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_train_de_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_test_de_docs = parse_tsv(file_path=ajmc_test_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_test_de_docs))

### ajmc FR

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_dev_fr_docs = parse_tsv(file_path=ajmc_dev_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_dev_fr_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_train_fr_docs = parse_tsv(file_path=ajmc_train_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_train_fr_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
ajmc_test_fr_docs = parse_tsv(file_path=ajmc_test_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=ajmc_test_fr_docs))

## hipe2020

See the [README file](./documentation/README-hipe2020.md) for detailed information about this dataset.

### File paths

In [None]:
# Paths to the hipe2020 TSV files, grouped by language.
# EN (only dev and test paths are defined here; there is no EN train path)
hipe2020_dev_en_path = os.path.join(HIPE2022_data_path, f"hipe2020/en/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-en.tsv")
hipe2020_test_en_path = os.path.join(HIPE2022_data_path, f"hipe2020/en/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-en.tsv")

# DE
hipe2020_train_de_path = os.path.join(HIPE2022_data_path, f"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-de.tsv")
hipe2020_dev_de_path = os.path.join(HIPE2022_data_path, f"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-de.tsv")
hipe2020_test_de_path = os.path.join(HIPE2022_data_path, f"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-de.tsv")


# FR
hipe2020_dev_fr_path = os.path.join(HIPE2022_data_path, f"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-fr.tsv")
hipe2020_train_fr_path = os.path.join(HIPE2022_data_path, f"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-fr.tsv")
hipe2020_test_fr_path = os.path.join(HIPE2022_data_path, f"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-fr.tsv")

### hipe2020 EN

#### Dev


In [None]:
# parse the TSV into a list of `HipeDocument` objects
hipe2020_dev_en_docs = parse_tsv(file_path=hipe2020_dev_en_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=hipe2020_dev_en_docs))

#### Test


In [None]:
# parse the TSV into a list of `HipeDocument` objects
hipe2020_test_en_docs = parse_tsv(file_path=hipe2020_test_en_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=hipe2020_test_en_docs))

### hipe2020 DE

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
hipe2020_dev_de_docs = parse_tsv(file_path=hipe2020_dev_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=hipe2020_dev_de_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
hipe2020_train_de_docs = parse_tsv(file_path=hipe2020_train_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=hipe2020_train_de_docs))

#### Test


In [None]:
# parse the TSV into a list of `HipeDocument` objects
hipe2020_test_de_docs = parse_tsv(file_path=hipe2020_test_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=hipe2020_test_de_docs))

### hipe2020 FR

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
hipe2020_dev_fr_docs = parse_tsv(file_path=hipe2020_dev_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=hipe2020_dev_fr_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
hipe2020_train_fr_docs = parse_tsv(file_path=hipe2020_train_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=hipe2020_train_fr_docs))

#### Test


In [None]:
# parse the TSV into a list of `HipeDocument` objects
hipe2020_test_fr_docs = parse_tsv(file_path=hipe2020_test_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=hipe2020_test_fr_docs))

## letemps

See the [README file](./documentation/README-letemps.md) for detailed information about this dataset.

### File paths

In [None]:
letemps_dev_fr_path = os.path.join(HIPE2022_data_path, f"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-dev-fr.tsv")
letemps_train_fr_path = os.path.join(HIPE2022_data_path, f"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-train-fr.tsv")
letemps_test_fr_path = os.path.join(HIPE2022_data_path, f"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-test-fr.tsv")

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
letemps_dev_fr_docs = parse_tsv(file_path=letemps_dev_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=letemps_dev_fr_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
letemps_train_fr_docs = parse_tsv(file_path=letemps_train_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=letemps_train_fr_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
letemps_test_fr_docs = parse_tsv(file_path=letemps_test_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=letemps_test_fr_docs))

## topRes19th

See the [README file](./documentation/README-topres19th.md) for detailed information about this dataset.

### File paths

In [None]:
topRes19th_dev_en_path = os.path.join(HIPE2022_data_path, f"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-dev-en.tsv")
topRes19th_train_en_path = os.path.join(HIPE2022_data_path, f"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-train-en.tsv")
topRes19th_test_en_path = os.path.join(HIPE2022_data_path, f"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-test-en.tsv")

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
topRes19th_dev_en_docs = parse_tsv(file_path=topRes19th_dev_en_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=topRes19th_dev_en_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
topRes19th_train_en_docs = parse_tsv(file_path=topRes19th_train_en_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=topRes19th_train_en_docs))

#### Test

(lines below are to be uncommented once the test files for topRes19th are published)

In [None]:
# parse the TSV into a list of `HipeDocument` objects
#topRes19th_test_en_docs = parse_tsv(file_path=topRes19th_test_en_path)

In [None]:
# print some basic stats for the TSV dataset 
#print(describe_dataset(documents=topRes19th_test_en_docs))

## newseye

See the [README file](./documentation/README-newseye.md) for detailed information about this dataset.

### File paths

In [None]:
# FR
newseye_dev_fr_path = os.path.join(HIPE2022_data_path, f"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-dev-fr.tsv")
newseye_train_fr_path = os.path.join(HIPE2022_data_path, f"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-train-fr.tsv")
newseye_test_fr_path = os.path.join(HIPE2022_data_path, f"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-test-fr.tsv")

# DE
newseye_dev_de_path = os.path.join(HIPE2022_data_path, f"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-dev-de.tsv")
newseye_train_de_path = os.path.join(HIPE2022_data_path, f"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-train-de.tsv")
newseye_test_de_path = os.path.join(HIPE2022_data_path, f"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-test-de.tsv")

# FI
newseye_dev_fi_path = os.path.join(HIPE2022_data_path, f"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-dev-fi.tsv")
newseye_train_fi_path = os.path.join(HIPE2022_data_path, f"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-train-fi.tsv")
newseye_test_fi_path = os.path.join(HIPE2022_data_path, f"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-test-fi.tsv")

# SV
newseye_dev_sv_path = os.path.join(HIPE2022_data_path, f"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-dev-sv.tsv")
newseye_train_sv_path = os.path.join(HIPE2022_data_path, f"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-train-sv.tsv")
newseye_test_sv_path = os.path.join(HIPE2022_data_path, f"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-test-sv.tsv")

### newseye FR

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_dev_fr_docs = parse_tsv(file_path=newseye_dev_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_dev_fr_docs))

#### Train

In [None]:
newseye_train_fr_path

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_train_fr_docs = parse_tsv(file_path=newseye_train_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_train_fr_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_test_fr_docs = parse_tsv(file_path=newseye_test_fr_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_test_fr_docs))

### newseye DE

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_dev_de_docs = parse_tsv(file_path=newseye_dev_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_dev_de_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_train_de_docs = parse_tsv(file_path=newseye_train_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_train_de_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_test_de_docs = parse_tsv(file_path=newseye_test_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_test_de_docs))

### newseye FI

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_dev_fi_docs = parse_tsv(file_path=newseye_dev_fi_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_dev_fi_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_train_fi_docs = parse_tsv(file_path=newseye_train_fi_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_train_fi_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_test_fi_docs = parse_tsv(file_path=newseye_test_fi_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_test_fi_docs))

### newseye SV

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_dev_sv_docs = parse_tsv(file_path=newseye_dev_sv_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_dev_sv_docs))

#### Train

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_train_sv_docs = parse_tsv(file_path=newseye_train_sv_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_train_sv_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
newseye_test_sv_docs = parse_tsv(file_path=newseye_test_sv_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=newseye_test_sv_docs))

## sonar

See the [README file](./documentation/README-sonar.md) for detailed information about this dataset.

### File paths

In [None]:
# Paths to the sonar DE TSV files (only dev and test paths are defined here).
sonar_dev_de_path = os.path.join(HIPE2022_data_path, f"sonar/de/HIPE-2022-{RELEASE_VERSION}-sonar-dev-de.tsv")
sonar_test_de_path = os.path.join(HIPE2022_data_path, f"sonar/de/HIPE-2022-{RELEASE_VERSION}-sonar-test-de.tsv")

#### Dev

In [None]:
# parse the TSV into a list of `HipeDocument` objects
sonar_dev_de_docs = parse_tsv(file_path=sonar_dev_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=sonar_dev_de_docs))

#### Test

In [None]:
# parse the TSV into a list of `HipeDocument` objects
sonar_test_de_docs = parse_tsv(file_path=sonar_test_de_path)

In [None]:
# print some basic stats for the TSV dataset 
print(describe_dataset(documents=sonar_test_de_docs))