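# HIPE 2022 dataset statistics

This notebook parses the train and dev TSV files of each HIPE 2022 dataset and prints basic statistics for every split: number of documents, tokens and entities, with a per-type entity breakdown.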

# Imports

In [1]:
import os
import random
from typing import List, Dict
from hipe_commons.helpers.tsv import parse_tsv, ENTITY_TYPES, HipeDocument, HipeEntity
from hipe_commons.stats import describe_dataset, compute_entities_stats

# Functions

In [2]:
def collect_entities(documents: List[HipeDocument]) -> Dict[str, List[HipeEntity]]:
    """Gather all entities from the documents of a dataset, grouped by entity type.

    Only the types listed in `ENTITY_TYPES` are collected. A type is present in
    the result only if at least one document has it as a key of its `entities`
    mapping.

    :param documents: Input documents in HIPE format
    :type documents: List[HipeDocument]
    :return: A mapping from entity type to the entities of that type, gathered
        across all input documents
    :rtype: Dict[str, List[HipeEntity]]
    """
    all_entities: Dict[str, List[HipeEntity]] = {}

    # Walk the `documents` argument itself, never a notebook-level variable,
    # so the function works on any dataset and on a fresh kernel.
    for doc in documents:
        for e_type in ENTITY_TYPES:
            if e_type in doc.entities:
                # `setdefault` creates the per-type list the first time a type
                # is seen; `extend` copies the items into that list, so the
                # documents' own entity lists are never aliased or mutated.
                all_entities.setdefault(e_type, []).extend(doc.entities[e_type])
    return all_entities

# HIPE 2022 Datasets

In [3]:
# Data release to analyse; it is interpolated into the data directory below
# and into every TSV file name built in the "File paths" cells.
RELEASE_VERSION = "v2.0"
# Root directory of the release, relative to the notebook's working directory.
HIPE2022_data_path = f"./data/{RELEASE_VERSION}/"

## ajmc

See the [README file](./documentation/README-ajmc.md) for detailed information about this dataset.

### File paths

In [4]:
# ajmc TSV files: one train/dev pair per language, all under HIPE2022_data_path.

# EN
ajmc_train_en_path = os.path.join(
    HIPE2022_data_path,
    f"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-train-en.tsv",
)
ajmc_dev_en_path = os.path.join(
    HIPE2022_data_path,
    f"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-en.tsv",
)

# DE
ajmc_train_de_path = os.path.join(
    HIPE2022_data_path,
    f"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-train-de.tsv",
)
ajmc_dev_de_path = os.path.join(
    HIPE2022_data_path,
    f"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-de.tsv",
)

# FR
ajmc_train_fr_path = os.path.join(
    HIPE2022_data_path,
    f"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-train-fr.tsv",
)
ajmc_dev_fr_path = os.path.join(
    HIPE2022_data_path,
    f"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-fr.tsv",
)

### ajmc EN

#### Dev

In [5]:
# Parse the ajmc EN dev TSV file into a list of `HipeDocument` objects
ajmc_dev_en_docs = parse_tsv(file_path=ajmc_dev_en_path)

In [6]:
# Print the summary that `describe_dataset` builds for the ajmc EN dev documents
print(describe_dataset(documents=ajmc_dev_en_docs))


Path of the TSV file: ./data/v2.0/ajmc/en/HIPE-2022-v2.0-ajmc-dev-en.tsv 
Number of documents: 14 
Number of entities: {'coarse_lit': 413, 'fine_lit': 413} 
Number of tokens: 6507 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       5 |
+-------+---------+
| loc   |       3 |
+-------+---------+
| pers  |     130 |
+-------+---------+
| scope |     160 |
+-------+---------+
| work  |     115 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       5 |
+--------------+---------+
| loc          |       3 |
+--------------+---------+
| pers.author  |      69 |
+--------------+---------+
| pers.editor  |       9 |
+--------------+---------+
| pers.myth    |      50 |
+--------------+---------+
| pers.other   |       2 |
+--------------+---------+
| scope        |     160 |
+--------------+---------+
| work.fragm   |       1 |
+--------------+---------+
| work.primlit |     101 |
+--------------+---

#### Train

In [7]:
# Parse the ajmc EN train TSV file into a list of `HipeDocument` objects
ajmc_train_en_docs = parse_tsv(file_path=ajmc_train_en_path)

In [8]:
# Print the summary that `describe_dataset` builds for the ajmc EN train documents
print(describe_dataset(documents=ajmc_train_en_docs))


Path of the TSV file: ./data/v2.0/ajmc/en/HIPE-2022-v2.0-ajmc-train-en.tsv 
Number of documents: 60 
Number of entities: {'coarse_lit': 1815, 'fine_lit': 1815, 'nested': 4} 
Number of tokens: 30936 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| date   |      12 |
+--------+---------+
| loc    |      37 |
+--------+---------+
| object |       3 |
+--------+---------+
| pers   |     622 |
+--------+---------+
| scope  |     676 |
+--------+---------+
| work   |     465 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| date           |      12 |
+----------------+---------+
| loc            |      37 |
+----------------+---------+
| object.manuscr |       3 |
+----------------+---------+
| pers.author    |     239 |
+----------------+---------+
| pers.editor    |      28 |
+----------------+---------+
| pers.myth      |     333 |
+----------------+---------+
| pers.other     |      22 |
+----------------+-------

### ajmc DE

#### Dev

In [9]:
# Parse the ajmc DE dev TSV file into a list of `HipeDocument` objects
ajmc_dev_de_docs = parse_tsv(file_path=ajmc_dev_de_path)

In [10]:
# Print the summary that `describe_dataset` builds for the ajmc DE dev documents
print(describe_dataset(documents=ajmc_dev_de_docs))


Path of the TSV file: ./data/v2.0/ajmc/de/HIPE-2022-v2.0-ajmc-dev-de.tsv 
Number of documents: 14 
Number of entities: {'coarse_lit': 403, 'fine_lit': 403, 'nested': 2} 
Number of tokens: 4702 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| loc    |      10 |
+--------+---------+
| object |       4 |
+--------+---------+
| pers   |     166 |
+--------+---------+
| scope  |     157 |
+--------+---------+
| work   |      66 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |      10 |
+----------------+---------+
| object.manuscr |       4 |
+----------------+---------+
| pers.author    |      70 |
+----------------+---------+
| pers.editor    |       7 |
+----------------+---------+
| pers.myth      |      86 |
+----------------+---------+
| pers.other     |       3 |
+----------------+---------+
| scope          |     157 |
+----------------+---------+
| work.primlit   |      66 |
+-------------

#### Train

In [11]:
# Parse the ajmc DE train TSV file into a list of `HipeDocument` objects
ajmc_train_de_docs = parse_tsv(file_path=ajmc_train_de_path)


In [12]:
# Print the summary that `describe_dataset` builds for the ajmc DE train documents
print(describe_dataset(documents=ajmc_train_de_docs))


Path of the TSV file: ./data/v2.0/ajmc/de/HIPE-2022-v2.0-ajmc-train-de.tsv 
Number of documents: 76 
Number of entities: {'coarse_lit': 1738, 'fine_lit': 1738, 'nested': 11} 
Number of tokens: 22696 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| date   |       2 |
+--------+---------+
| loc    |      31 |
+--------+---------+
| object |       6 |
+--------+---------+
| pers   |     619 |
+--------+---------+
| scope  |     758 |
+--------+---------+
| work   |     322 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| date           |       2 |
+----------------+---------+
| loc            |      31 |
+----------------+---------+
| object.manuscr |       6 |
+----------------+---------+
| pers.author    |     352 |
+----------------+---------+
| pers.editor    |      16 |
+----------------+---------+
| pers.myth      |     235 |
+----------------+---------+
| pers.other     |      16 |
+----------------+------

### ajmc FR

#### Dev

In [13]:
# Parse the ajmc FR dev TSV file into a list of `HipeDocument` objects
ajmc_dev_fr_docs = parse_tsv(file_path=ajmc_dev_fr_path)

In [14]:
# Print the summary that `describe_dataset` builds for the ajmc FR dev documents
print(describe_dataset(documents=ajmc_dev_fr_docs))


Path of the TSV file: ./data/v2.0/ajmc/fr/HIPE-2022-v2.0-ajmc-dev-fr.tsv 
Number of documents: 17 
Number of entities: {'coarse_lit': 390, 'fine_lit': 390} 
Number of tokens: 5425 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| pers  |     123 |
+-------+---------+
| scope |     168 |
+-------+---------+
| work  |      99 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| pers.author  |      52 |
+--------------+---------+
| pers.editor  |      13 |
+--------------+---------+
| pers.myth    |      23 |
+--------------+---------+
| pers.other   |      35 |
+--------------+---------+
| scope        |     168 |
+--------------+---------+
| work.primlit |      99 |
+--------------+---------+



#### Train

In [15]:
# Parse the ajmc FR train TSV file into a list of `HipeDocument` objects
ajmc_train_fr_docs = parse_tsv(file_path=ajmc_train_fr_path)

In [16]:
# Print the summary that `describe_dataset` builds for the ajmc FR train documents
print(describe_dataset(documents=ajmc_train_fr_docs))


Path of the TSV file: ./data/v2.0/ajmc/fr/HIPE-2022-v2.0-ajmc-train-fr.tsv 
Number of documents: 72 
Number of entities: {'coarse_lit': 1621, 'fine_lit': 1621, 'nested': 8} 
Number of tokens: 24669 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| date   |       2 |
+--------+---------+
| loc    |      15 |
+--------+---------+
| object |      10 |
+--------+---------+
| pers   |     576 |
+--------+---------+
| scope  |     639 |
+--------+---------+
| work   |     379 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| date           |       2 |
+----------------+---------+
| loc            |      15 |
+----------------+---------+
| object.manuscr |      10 |
+----------------+---------+
| pers.author    |     293 |
+----------------+---------+
| pers.editor    |      57 |
+----------------+---------+
| pers.myth      |      83 |
+----------------+---------+
| pers.other     |     143 |
+----------------+-------

## hipe2020

See the [README file](./documentation/README-hipe2020.md) for detailed information about this dataset.

### File paths

In [17]:
# hipe2020 TSV files, all under HIPE2022_data_path.

# EN (only a dev path is defined in this notebook)
hipe2020_dev_en_path = os.path.join(
    HIPE2022_data_path,
    f"hipe2020/en/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-en.tsv",
)

# DE
hipe2020_train_de_path = os.path.join(
    HIPE2022_data_path,
    f"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-de.tsv",
)
hipe2020_dev_de_path = os.path.join(
    HIPE2022_data_path,
    f"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-de.tsv",
)

# FR
hipe2020_train_fr_path = os.path.join(
    HIPE2022_data_path,
    f"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-fr.tsv",
)
hipe2020_dev_fr_path = os.path.join(
    HIPE2022_data_path,
    f"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-fr.tsv",
)

### hipe2020 EN

#### Dev


In [18]:
# Parse the hipe2020 EN dev TSV file into a list of `HipeDocument` objects
hipe2020_dev_en_docs = parse_tsv(file_path=hipe2020_dev_en_path)

In [19]:
# Print the summary that `describe_dataset` builds for the hipe2020 EN dev documents
print(describe_dataset(documents=hipe2020_dev_en_docs))


Path of the TSV file: ./data/v2.0/hipe2020/en/HIPE-2022-v2.0-hipe2020-dev-en.tsv 
Number of documents: 80 
Number of entities: {'coarse_lit': 966, 'coarse_meto': 15} 
Number of tokens: 29063 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     384 |
+------+---------+
| org  |     118 |
+------+---------+
| pers |     402 |
+------+---------+
| prod |      33 |
+------+---------+
| time |      29 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       6 |
+-----+---------+
| org |       9 |
+-----+---------+



### hipe2020 DE

#### Dev

In [20]:
# Parse the hipe2020 DE dev TSV file into a list of `HipeDocument` objects
hipe2020_dev_de_docs = parse_tsv(file_path=hipe2020_dev_de_path)

In [21]:
# Print the summary that `describe_dataset` builds for the hipe2020 DE dev documents
print(describe_dataset(documents=hipe2020_dev_de_docs))


Path of the TSV file: ./data/v2.0/hipe2020/de/HIPE-2022-v2.0-hipe2020-dev-de.tsv 
Number of documents: 33 
Number of entities: {'coarse_lit': 1242, 'coarse_meto': 91, 'fine_lit': 1242, 'fine_meto': 91, 'fine_comp': 468, 'nested': 67} 
Number of tokens: 32671 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     588 |
+------+---------+
| org  |     164 |
+------+---------+
| pers |     372 |
+------+---------+
| prod |      49 |
+------+---------+
| time |      69 |
+------+---------+
coarse_meto
+------+---------+
|      |   count |
| loc  |      12 |
+------+---------+
| org  |      78 |
+------+---------+
| prod |       1 |
+------+---------+
fine_lit
+---------------------+---------+
|                     |   count |
| loc.adm.nat         |     135 |
+---------------------+---------+
| loc.adm.reg         |      53 |
+---------------------+---------+
| loc.adm.sup         |       8 |
+---------------------+---------+
| loc.adm.town        |     3

#### Train

In [22]:
# Parse the hipe2020 DE train TSV file into a list of `HipeDocument` objects
hipe2020_train_de_docs = parse_tsv(file_path=hipe2020_train_de_path)

In [23]:
# Print the summary that `describe_dataset` builds for the hipe2020 DE train documents
print(describe_dataset(documents=hipe2020_train_de_docs))


Path of the TSV file: ./data/v2.0/hipe2020/de/HIPE-2022-v2.0-hipe2020-train-de.tsv 
Number of documents: 103 
Number of entities: {'coarse_lit': 3494, 'coarse_meto': 325, 'fine_lit': 3494, 'fine_meto': 325, 'fine_comp': 1436, 'nested': 158} 
Number of tokens: 86445 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |    1740 |
+------+---------+
| org  |     358 |
+------+---------+
| pers |    1166 |
+------+---------+
| prod |     112 |
+------+---------+
| time |     118 |
+------+---------+
coarse_meto
+------+---------+
|      |   count |
| loc  |      17 |
+------+---------+
| org  |     306 |
+------+---------+
| pers |       2 |
+------+---------+
fine_lit
+---------------------+---------+
|                     |   count |
| loc.add.phys        |       2 |
+---------------------+---------+
| loc.adm.nat         |     563 |
+---------------------+---------+
| loc.adm.reg         |     199 |
+---------------------+---------+
| loc.adm.sup         

### hipe2020 FR

#### Dev

In [24]:
# Parse the hipe2020 FR dev TSV file into a list of `HipeDocument` objects
hipe2020_dev_fr_docs = parse_tsv(file_path=hipe2020_dev_fr_path)

In [25]:
# Print the summary that `describe_dataset` builds for the hipe2020 FR dev documents
print(describe_dataset(documents=hipe2020_dev_fr_docs))


Path of the TSV file: ./data/v2.0/hipe2020/fr/HIPE-2022-v2.0-hipe2020-dev-fr.tsv 
Number of documents: 43 
Number of entities: {'coarse_lit': 1729, 'coarse_meto': 108, 'fine_lit': 1729, 'fine_meto': 108, 'fine_comp': 724, 'nested': 91} 
Number of tokens: 37953 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     774 |
+------+---------+
| org  |     159 |
+------+---------+
| pers |     679 |
+------+---------+
| prod |      49 |
+------+---------+
| time |      68 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       3 |
+-----+---------+
| org |     105 |
+-----+---------+
fine_lit
+------------------------+---------+
|                        |   count |
| loc.add.phys           |       1 |
+------------------------+---------+
| loc.adm.nat            |     258 |
+------------------------+---------+
| loc.adm.reg            |      73 |
+------------------------+---------+
| loc.adm.sup            |      27 |
+---------

#### Train

In [26]:
# Parse the hipe2020 FR train TSV file into a list of `HipeDocument` objects
hipe2020_train_fr_docs = parse_tsv(file_path=hipe2020_train_fr_path)

In [27]:
# Print the summary that `describe_dataset` builds for the hipe2020 FR train documents
print(describe_dataset(documents=hipe2020_train_fr_docs))


Path of the TSV file: ./data/v2.0/hipe2020/fr/HIPE-2022-v2.0-hipe2020-train-fr.tsv 
Number of documents: 158 
Number of entities: {'coarse_lit': 6926, 'coarse_meto': 451, 'fine_lit': 6926, 'fine_meto': 451, 'fine_comp': 3051, 'nested': 473} 
Number of tokens: 166220 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| comp |       1 |
+------+---------+
| loc  |    3088 |
+------+---------+
| org  |     836 |
+------+---------+
| pers |    2525 |
+------+---------+
| prod |     200 |
+------+---------+
| time |     276 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       7 |
+-----+---------+
| org |     444 |
+-----+---------+
fine_lit
+------------------------+---------+
|                        |   count |
| comp.name              |       1 |
+------------------------+---------+
| loc.add.elec           |       1 |
+------------------------+---------+
| loc.add.phys           |       3 |
+------------------------+---------+
| l

## letemps

See the [README file](./documentation/README-letemps.md) for detailed information about this dataset.

### File paths

In [28]:
# letemps TSV files (FR only), under HIPE2022_data_path.
letemps_dev_fr_path = os.path.join(
    HIPE2022_data_path,
    f"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-dev-fr.tsv",
)
letemps_train_fr_path = os.path.join(
    HIPE2022_data_path,
    f"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-train-fr.tsv",
)

#### Dev

In [29]:
# Parse the letemps FR dev TSV file into a list of `HipeDocument` objects
letemps_dev_fr_docs = parse_tsv(file_path=letemps_dev_fr_path)

In [30]:
# Print the summary that `describe_dataset` builds for the letemps FR dev documents
print(describe_dataset(documents=letemps_dev_fr_docs))


Path of the TSV file: ./data/v2.0/letemps/fr/HIPE-2022-v2.0-letemps-dev-fr.tsv 
Number of documents: 51 
Number of entities: {'coarse_lit': 869, 'fine_lit': 869, 'nested': 12} 
Number of tokens: 38649 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     516 |
+------+---------+
| org  |      41 |
+------+---------+
| pers |     312 |
+------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |      16 |
+----------------+---------+
| loc.add        |       2 |
+----------------+---------+
| loc.add.phys   |      21 |
+----------------+---------+
| loc.adm        |      20 |
+----------------+---------+
| loc.adm.nat    |     104 |
+----------------+---------+
| loc.adm.reg    |      45 |
+----------------+---------+
| loc.adm.town   |     270 |
+----------------+---------+
| loc.admin.sup  |       1 |
+----------------+---------+
| loc.phys.geo   |      25 |
+----------------+---------+
| loc.phys.hydro | 

#### Train

In [31]:
# Parse the letemps FR train TSV file into a list of `HipeDocument` objects
letemps_train_fr_docs = parse_tsv(file_path=letemps_train_fr_path)

In [32]:
# Print the summary that `describe_dataset` builds for the letemps FR train documents
print(describe_dataset(documents=letemps_train_fr_docs))


Path of the TSV file: ./data/v2.0/letemps/fr/HIPE-2022-v2.0-letemps-train-fr.tsv 
Number of documents: 414 
Number of entities: {'coarse_lit': 9159, 'fine_lit': 9159, 'nested': 69} 
Number of tokens: 379487 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |    5260 |
+------+---------+
| org  |     472 |
+------+---------+
| pers |    3427 |
+------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |     262 |
+----------------+---------+
| loc.add        |       3 |
+----------------+---------+
| loc.add.phys   |     169 |
+----------------+---------+
| loc.adm        |     111 |
+----------------+---------+
| loc.adm.nat    |    1097 |
+----------------+---------+
| loc.adm.reg    |     477 |
+----------------+---------+
| loc.adm.town   |    2713 |
+----------------+---------+
| loc.admin.sup  |      33 |
+----------------+---------+
| loc.oro        |      13 |
+----------------+---------+
| loc.phys   

## topRes19th

See the [README file](./documentation/README-topres19th.md) for detailed information about this dataset.

### File paths

In [33]:
# topRes19th TSV files (EN only), under HIPE2022_data_path.
topRes19th_dev_en_path = os.path.join(
    HIPE2022_data_path,
    f"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-dev-en.tsv",
)
topRes19th_train_en_path = os.path.join(
    HIPE2022_data_path,
    f"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-train-en.tsv",
)

#### Dev

In [34]:
# Parse the topRes19th EN dev TSV file into a list of `HipeDocument` objects
topRes19th_dev_en_docs = parse_tsv(file_path=topRes19th_dev_en_path)

In [35]:
# Print the summary that `describe_dataset` builds for the topRes19th EN dev documents
print(describe_dataset(documents=topRes19th_dev_en_docs))


Path of the TSV file: ./data/v2.0/topres19th/en/HIPE-2022-v2.0-topres19th-dev-en.tsv 
Number of documents: 34 
Number of entities: {'coarse_lit': 237} 
Number of tokens: 11917 
Entity breakdown by type: coarse_lit
+----------+---------+
|          |   count |
| BUILDING |      19 |
+----------+---------+
| LOC      |     201 |
+----------+---------+
| STREET   |      17 |
+----------+---------+



#### Train

In [36]:
# Parse the topRes19th EN train TSV file into a list of `HipeDocument` objects
topRes19th_train_en_docs = parse_tsv(file_path=topRes19th_train_en_path)

In [37]:
# Print the summary that `describe_dataset` builds for the topRes19th EN train documents
print(describe_dataset(documents=topRes19th_train_en_docs))


Path of the TSV file: ./data/v2.0/topres19th/en/HIPE-2022-v2.0-topres19th-train-en.tsv 
Number of documents: 309 
Number of entities: {'coarse_lit': 3182} 
Number of tokens: 123983 
Entity breakdown by type: coarse_lit
+----------+---------+
|          |   count |
| BUILDING |     359 |
+----------+---------+
| LOC      |    2580 |
+----------+---------+
| STREET   |     243 |
+----------+---------+



## newseye

See the [README file](./documentation/README-newseye.md) for detailed information about this dataset.

### File paths

In [38]:
# newseye TSV files: one dev/train pair per language, all under HIPE2022_data_path.

# FR
newseye_dev_fr_path = os.path.join(
    HIPE2022_data_path,
    f"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-dev-fr.tsv",
)
newseye_train_fr_path = os.path.join(
    HIPE2022_data_path,
    f"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-train-fr.tsv",
)

# DE
newseye_dev_de_path = os.path.join(
    HIPE2022_data_path,
    f"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-dev-de.tsv",
)
newseye_train_de_path = os.path.join(
    HIPE2022_data_path,
    f"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-train-de.tsv",
)

# FI
newseye_dev_fi_path = os.path.join(
    HIPE2022_data_path,
    f"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-dev-fi.tsv",
)
newseye_train_fi_path = os.path.join(
    HIPE2022_data_path,
    f"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-train-fi.tsv",
)

# SV
newseye_dev_sv_path = os.path.join(
    HIPE2022_data_path,
    f"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-dev-sv.tsv",
)
newseye_train_sv_path = os.path.join(
    HIPE2022_data_path,
    f"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-train-sv.tsv",
)

### newseye FR

#### Dev

In [39]:
# Parse the newseye FR dev TSV file into a list of `HipeDocument` objects
newseye_dev_fr_docs = parse_tsv(file_path=newseye_dev_fr_path)

In [40]:
# Print the summary that `describe_dataset` builds for the newseye FR dev documents
print(describe_dataset(documents=newseye_dev_fr_docs))


Path of the TSV file: ./data/v2.0/newseye/fr/HIPE-2022-v2.0-newseye-dev-fr.tsv 
Number of documents: 35 
Number of entities: {'coarse_lit': 752, 'fine_lit': 3, 'nested': 32} 
Number of tokens: 21727 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      11 |
+-----------+---------+
| LOC       |     335 |
+-----------+---------+
| ORG       |     113 |
+-----------+---------+
| PER       |     293 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       3 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       1 |
+-----------+---------+
| LOC       |      18 |
+-----------+---------+
| ORG       |       7 |
+-----------+---------+
| PER       |       6 |
+-----------+---------+



#### Train

In [41]:
# Display the resolved path of the newseye FR train file (quick sanity check before parsing)
newseye_train_fr_path

'./data/v2.0/newseye/fr/HIPE-2022-v2.0-newseye-train-fr.tsv'

In [42]:
# Parse the newseye FR train TSV file into a list of `HipeDocument` objects
newseye_train_fr_docs = parse_tsv(file_path=newseye_train_fr_path)

In [43]:
# Print the summary that `describe_dataset` builds for the newseye FR train documents
print(describe_dataset(documents=newseye_train_fr_docs))


Path of the TSV file: ./data/v2.0/newseye/fr/HIPE-2022-v2.0-newseye-train-fr.tsv 
Number of documents: 35 
Number of entities: {'coarse_lit': 10423, 'fine_lit': 99, 'nested': 522} 
Number of tokens: 255165 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     200 |
+-----------+---------+
| LOC       |    4055 |
+-----------+---------+
| ORG       |    1285 |
+-----------+---------+
| PER       |    4883 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      99 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       6 |
+-----------+---------+
| LOC       |     263 |
+-----------+---------+
| ORG       |     212 |
+-----------+---------+
| PER       |      41 |
+-----------+---------+



### newseye DE

#### Dev

In [44]:
# Parse the newseye DE dev TSV file into a list of `HipeDocument` objects
newseye_dev_de_docs = parse_tsv(file_path=newseye_dev_de_path)

In [45]:
# Print the summary that `describe_dataset` builds for the newseye DE dev documents
print(describe_dataset(documents=newseye_dev_de_docs))


Path of the TSV file: ./data/v2.0/newseye/de/HIPE-2022-v2.0-newseye-dev-de.tsv 
Number of documents: 12 
Number of entities: {'coarse_lit': 539, 'fine_lit': 5, 'nested': 29} 
Number of tokens: 40061 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |       4 |
+-----------+---------+
| LOC       |     263 |
+-----------+---------+
| ORG       |     123 |
+-----------+---------+
| PER       |     149 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       5 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      10 |
+-----+---------+
| ORG |      11 |
+-----+---------+
| PER |       8 |
+-----+---------+



#### Train

In [46]:
# Parse the newseye DE train TSV file into a list of `HipeDocument` objects
newseye_train_de_docs = parse_tsv(file_path=newseye_train_de_path)

In [47]:
# Print the summary that `describe_dataset` builds for the newseye DE train documents
print(describe_dataset(documents=newseye_train_de_docs))


Path of the TSV file: ./data/v2.0/newseye/de/HIPE-2022-v2.0-newseye-train-de.tsv 
Number of documents: 7 
Number of entities: {'coarse_lit': 11381, 'fine_lit': 21, 'nested': 1133} 
Number of tokens: 374332 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      37 |
+-----------+---------+
| LOC       |    5135 |
+-----------+---------+
| ORG       |    3108 |
+-----------+---------+
| PER       |    3101 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      21 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       2 |
+-----------+---------+
| LOC       |     754 |
+-----------+---------+
| ORG       |     136 |
+-----------+---------+
| PER       |     241 |
+-----------+---------+



### newseye FI

#### Dev

In [48]:
# Parse the newseye FI dev TSV file into a list of `HipeDocument` objects
newseye_dev_fi_docs = parse_tsv(file_path=newseye_dev_fi_path)

In [49]:
# Print the summary that `describe_dataset` builds for the newseye FI dev documents
print(describe_dataset(documents=newseye_dev_fi_docs))


Path of the TSV file: ./data/v2.0/newseye/fi/HIPE-2022-v2.0-newseye-dev-fi.tsv 
Number of documents: 24 
Number of entities: {'coarse_lit': 223, 'fine_lit': 1, 'nested': 26} 
Number of tokens: 6350 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      12 |
+-----------+---------+
| LOC       |      97 |
+-----------+---------+
| ORG       |      37 |
+-----------+---------+
| PER       |      77 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       1 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      22 |
+-----+---------+
| ORG |       2 |
+-----+---------+
| PER |       2 |
+-----+---------+



#### Train

In [50]:
# Parse the newseye FI train TSV file into a list of `HipeDocument` objects
newseye_train_fi_docs = parse_tsv(file_path=newseye_train_fi_path)

In [51]:
# Print the summary that `describe_dataset` builds for the newseye FI train documents
print(describe_dataset(documents=newseye_train_fi_docs))


Path of the TSV file: ./data/v2.0/newseye/fi/HIPE-2022-v2.0-newseye-train-fi.tsv 
Number of documents: 24 
Number of entities: {'coarse_lit': 2146, 'fine_lit': 15, 'nested': 225} 
Number of tokens: 48222 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     126 |
+-----------+---------+
| LOC       |     979 |
+-----------+---------+
| ORG       |     259 |
+-----------+---------+
| PER       |     782 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      15 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       2 |
+-----------+---------+
| LOC       |     171 |
+-----------+---------+
| ORG       |      35 |
+-----------+---------+
| PER       |      17 |
+-----------+---------+



### newseye SV

#### Dev

In [52]:
# Parse the newseye SV dev TSV file into a list of `HipeDocument` objects
newseye_dev_sv_docs = parse_tsv(file_path=newseye_dev_sv_path)

In [53]:
# Print the summary that `describe_dataset` builds for the newseye SV dev documents
print(describe_dataset(documents=newseye_dev_sv_docs))


Path of the TSV file: ./data/v2.0/newseye/sv/HIPE-2022-v2.0-newseye-dev-sv.tsv 
Number of documents: 21 
Number of entities: {'coarse_lit': 266, 'fine_lit': 1, 'nested': 9} 
Number of tokens: 6906 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      17 |
+-----------+---------+
| LOC       |     148 |
+-----------+---------+
| ORG       |      17 |
+-----------+---------+
| PER       |      84 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       1 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |       7 |
+-----+---------+
| ORG |       1 |
+-----+---------+
| PER |       1 |
+-----+---------+



#### Train

In [54]:
# Parse the newseye SV train TSV file into a list of `HipeDocument` objects
newseye_train_sv_docs = parse_tsv(file_path=newseye_train_sv_path)

In [55]:
# Print the summary that `describe_dataset` builds for the newseye SV train documents
print(describe_dataset(documents=newseye_train_sv_docs))


Path of the TSV file: ./data/v2.0/newseye/sv/HIPE-2022-v2.0-newseye-train-sv.tsv 
Number of documents: 21 
Number of entities: {'coarse_lit': 2141, 'fine_lit': 16, 'nested': 126} 
Number of tokens: 56306 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     164 |
+-----------+---------+
| LOC       |     985 |
+-----------+---------+
| ORG       |     153 |
+-----------+---------+
| PER       |     839 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      16 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       1 |
+-----------+---------+
| LOC       |     100 |
+-----------+---------+
| ORG       |      15 |
+-----------+---------+
| PER       |      10 |
+-----------+---------+



## sonar

See the [README file](./documentation/README-sonar.md) for detailed information about this dataset.

### File paths

In [56]:
# sonar TSV file (only a DE dev path is defined in this notebook).
sonar_dev_de_path = os.path.join(
    HIPE2022_data_path,
    f"sonar/de/HIPE-2022-{RELEASE_VERSION}-sonar-dev-de.tsv",
)

#### Dev

In [57]:
# Parse the sonar DE dev TSV file into a list of `HipeDocument` objects
sonar_dev_de_docs = parse_tsv(file_path=sonar_dev_de_path)

In [58]:
# Print the summary that `describe_dataset` builds for the sonar DE dev documents
print(describe_dataset(documents=sonar_dev_de_docs))


Path of the TSV file: ./data/v2.0/sonar/de/HIPE-2022-v2.0-sonar-dev-de.tsv 
Number of documents: 10 
Number of entities: {'coarse_lit': 654} 
Number of tokens: 17564 
Entity breakdown by type: coarse_lit
+-----+---------+
|     |   count |
| LOC |     300 |
+-----+---------+
| ORG |     138 |
+-----+---------+
| PER |     216 |
+-----+---------+

