# HIPE 2022 dataset statistics

# Imports

In [46]:
import os
import random
from typing import List, Dict
from hipe_commons.helpers.tsv import parse_tsv, ENTITY_TYPES, HipeDocument, HipeEntity
from hipe_commons.stats import describe_dataset, compute_entities_stats

# Functions

In [47]:
def collect_entities(documents: List[HipeDocument]) -> Dict[str, List[HipeEntity]]:
    """Gather all entities from the documents of a dataset, grouped by entity type.

    Only the types listed in `ENTITY_TYPES` are considered, and a type appears
    as a key in the result only if at least one document contains it.

    :param documents: Input documents in HIPE format
    :type documents: List[HipeDocument]
    :return: A dictionary mapping each entity type to the list of `HipeEntity`
        objects of that type, collected across all input documents
    :rtype: Dict[str, List[HipeEntity]]
    """
    all_entities: Dict[str, List[HipeEntity]] = {}

    # Iterate over the `documents` argument; the previous version looped over a
    # notebook-level global and silently ignored whatever the caller passed in.
    for doc in documents:
        for e_type in ENTITY_TYPES:
            if e_type in doc.entities:
                # Create the per-type bucket on first use, then append in place.
                all_entities.setdefault(e_type, []).extend(doc.entities[e_type])
    return all_entities

# HIPE 2022 Datasets

In [48]:
# Version tag of the HIPE 2022 data release; it is interpolated into both the
# data directory below and every TSV file name built in the path cells.
RELEASE_VERSION = "v2.1"
# Root directory of the release data, given relative to the current working directory.
HIPE2022_data_path = f"./data/{RELEASE_VERSION}/"

## ajmc

See the [README file](./documentation/README-ajmc.md) for detailed information about this dataset.

### File paths

In [49]:
# Paths to the ajmc TSV files (train/dev/test splits) for each language,
# all located under the release data directory.

# EN (English)
ajmc_train_en_path = os.path.join(HIPE2022_data_path, f"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-train-en.tsv")
ajmc_dev_en_path = os.path.join(HIPE2022_data_path, f"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-en.tsv")
ajmc_test_en_path = os.path.join(HIPE2022_data_path, f"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-test-en.tsv")

# DE (German)
ajmc_train_de_path = os.path.join(HIPE2022_data_path, f"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-train-de.tsv")
ajmc_dev_de_path = os.path.join(HIPE2022_data_path, f"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-de.tsv")
ajmc_test_de_path = os.path.join(HIPE2022_data_path, f"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-test-de.tsv")

# FR (French)
ajmc_train_fr_path = os.path.join(HIPE2022_data_path, f"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-train-fr.tsv")
ajmc_dev_fr_path = os.path.join(HIPE2022_data_path, f"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-fr.tsv")
ajmc_test_fr_path = os.path.join(HIPE2022_data_path, f"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-test-fr.tsv")

### ajmc EN

#### Dev

In [50]:
# Parse the ajmc English dev TSV into a list of `HipeDocument` objects
ajmc_dev_en_docs = parse_tsv(file_path=ajmc_dev_en_path)

In [51]:
# Print summary statistics for the ajmc English dev split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_dev_en_docs))


Path of the TSV file: ./data/v2.1/ajmc/en/HIPE-2022-v2.1-ajmc-dev-en.tsv 
Number of documents: 14 
Number of entities: {'coarse_lit': 416, 'fine_lit': 416} 
Number of tokens: 6506 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       5 |
+-------+---------+
| loc   |       3 |
+-------+---------+
| pers  |     130 |
+-------+---------+
| scope |     162 |
+-------+---------+
| work  |     116 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       5 |
+--------------+---------+
| loc          |       3 |
+--------------+---------+
| pers.author  |      69 |
+--------------+---------+
| pers.editor  |       9 |
+--------------+---------+
| pers.myth    |      50 |
+--------------+---------+
| pers.other   |       2 |
+--------------+---------+
| scope        |     162 |
+--------------+---------+
| work.fragm   |       1 |
+--------------+---------+
| work.primlit |     103 |
+--------------+---

#### Train

In [52]:
# Parse the ajmc English train TSV into a list of `HipeDocument` objects
ajmc_train_en_docs = parse_tsv(file_path=ajmc_train_en_path)

In [53]:
# Print summary statistics for the ajmc English train split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_train_en_docs))


Path of the TSV file: ./data/v2.1/ajmc/en/HIPE-2022-v2.1-ajmc-train-en.tsv 
Number of documents: 60 
Number of entities: {'coarse_lit': 1823, 'fine_lit': 1823, 'nested': 4} 
Number of tokens: 30932 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| date   |      12 |
+--------+---------+
| loc    |      39 |
+--------+---------+
| object |       3 |
+--------+---------+
| pers   |     618 |
+--------+---------+
| scope  |     684 |
+--------+---------+
| work   |     467 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| date           |      12 |
+----------------+---------+
| loc            |      39 |
+----------------+---------+
| object.manuscr |       3 |
+----------------+---------+
| pers.author    |     237 |
+----------------+---------+
| pers.editor    |      28 |
+----------------+---------+
| pers.myth      |     331 |
+----------------+---------+
| pers.other     |      22 |
+----------------+-------

#### Test

In [54]:
# Parse the ajmc English test TSV into a list of `HipeDocument` objects
ajmc_test_en_docs = parse_tsv(file_path=ajmc_test_en_path)

In [55]:
# Print summary statistics for the ajmc English test split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_test_en_docs))


Path of the TSV file: ./data/v2.1/ajmc/en/HIPE-2022-v2.1-ajmc-test-en.tsv 
Number of documents: 13 
Number of entities: {'coarse_lit': 348, 'fine_lit': 348} 
Number of tokens: 6052 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       3 |
+-------+---------+
| loc   |       3 |
+-------+---------+
| pers  |      96 |
+-------+---------+
| scope |     151 |
+-------+---------+
| work  |      95 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       3 |
+--------------+---------+
| loc          |       3 |
+--------------+---------+
| pers.author  |      47 |
+--------------+---------+
| pers.editor  |       4 |
+--------------+---------+
| pers.myth    |      41 |
+--------------+---------+
| pers.other   |       4 |
+--------------+---------+
| scope        |     151 |
+--------------+---------+
| work.fragm   |       2 |
+--------------+---------+
| work.primlit |      83 |
+--------------+--

### ajmc DE

#### Dev

In [56]:
# Parse the ajmc German dev TSV into a list of `HipeDocument` objects
ajmc_dev_de_docs = parse_tsv(file_path=ajmc_dev_de_path)

In [57]:
# Print summary statistics for the ajmc German dev split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_dev_de_docs))


Path of the TSV file: ./data/v2.1/ajmc/de/HIPE-2022-v2.1-ajmc-dev-de.tsv 
Number of documents: 14 
Number of entities: {'coarse_lit': 403, 'fine_lit': 403, 'nested': 2} 
Number of tokens: 4702 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| loc    |      10 |
+--------+---------+
| object |       4 |
+--------+---------+
| pers   |     162 |
+--------+---------+
| scope  |     157 |
+--------+---------+
| work   |      70 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |      10 |
+----------------+---------+
| object.manuscr |       4 |
+----------------+---------+
| pers.author    |      65 |
+----------------+---------+
| pers.editor    |       7 |
+----------------+---------+
| pers.myth      |      87 |
+----------------+---------+
| pers.other     |       3 |
+----------------+---------+
| scope          |     157 |
+----------------+---------+
| work.primlit   |      70 |
+-------------

#### Train

In [58]:
# Parse the ajmc German train TSV into a list of `HipeDocument` objects
ajmc_train_de_docs = parse_tsv(file_path=ajmc_train_de_path)


In [59]:
# Print summary statistics for the ajmc German train split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_train_de_docs))


Path of the TSV file: ./data/v2.1/ajmc/de/HIPE-2022-v2.1-ajmc-train-de.tsv 
Number of documents: 76 
Number of entities: {'coarse_lit': 1738, 'fine_lit': 1738, 'nested': 11} 
Number of tokens: 22695 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| date   |       2 |
+--------+---------+
| loc    |      31 |
+--------+---------+
| object |       6 |
+--------+---------+
| pers   |     620 |
+--------+---------+
| scope  |     758 |
+--------+---------+
| work   |     321 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| date           |       2 |
+----------------+---------+
| loc            |      31 |
+----------------+---------+
| object.manuscr |       6 |
+----------------+---------+
| pers.author    |     354 |
+----------------+---------+
| pers.editor    |      16 |
+----------------+---------+
| pers.myth      |     234 |
+----------------+---------+
| pers.other     |      16 |
+----------------+------

#### Test

In [60]:
# Parse the ajmc German test TSV into a list of `HipeDocument` objects
ajmc_test_de_docs = parse_tsv(file_path=ajmc_test_de_path)

In [61]:
# Print summary statistics for the ajmc German test split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_test_de_docs))


Path of the TSV file: ./data/v2.1/ajmc/de/HIPE-2022-v2.1-ajmc-test-de.tsv 
Number of documents: 16 
Number of entities: {'coarse_lit': 382, 'fine_lit': 382} 
Number of tokens: 4845 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| loc    |       2 |
+--------+---------+
| object |       2 |
+--------+---------+
| pers   |     128 |
+--------+---------+
| scope  |     176 |
+--------+---------+
| work   |      74 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |       2 |
+----------------+---------+
| object.manuscr |       2 |
+----------------+---------+
| pers.author    |      48 |
+----------------+---------+
| pers.editor    |       4 |
+----------------+---------+
| pers.myth      |      76 |
+----------------+---------+
| scope          |     176 |
+----------------+---------+
| work.primlit   |      74 |
+----------------+---------+



### ajmc FR

#### Dev

In [62]:
# Parse the ajmc French dev TSV into a list of `HipeDocument` objects
ajmc_dev_fr_docs = parse_tsv(file_path=ajmc_dev_fr_path)

In [63]:
# Print summary statistics for the ajmc French dev split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_dev_fr_docs))


Path of the TSV file: ./data/v2.1/ajmc/fr/HIPE-2022-v2.1-ajmc-dev-fr.tsv 
Number of documents: 17 
Number of entities: {'coarse_lit': 391, 'fine_lit': 391} 
Number of tokens: 5425 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| pers  |     123 |
+-------+---------+
| scope |     169 |
+-------+---------+
| work  |      99 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| pers.author  |      52 |
+--------------+---------+
| pers.editor  |      13 |
+--------------+---------+
| pers.myth    |      23 |
+--------------+---------+
| pers.other   |      35 |
+--------------+---------+
| scope        |     169 |
+--------------+---------+
| work.primlit |      99 |
+--------------+---------+



#### Train

In [64]:
# Parse the ajmc French train TSV into a list of `HipeDocument` objects
ajmc_train_fr_docs = parse_tsv(file_path=ajmc_train_fr_path)

In [65]:
# Print summary statistics for the ajmc French train split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_train_fr_docs))


Path of the TSV file: ./data/v2.1/ajmc/fr/HIPE-2022-v2.1-ajmc-train-fr.tsv 
Number of documents: 72 
Number of entities: {'coarse_lit': 1621, 'fine_lit': 1621, 'nested': 9} 
Number of tokens: 24669 
Entity breakdown by type: coarse_lit
+--------+---------+
|        |   count |
| date   |       2 |
+--------+---------+
| loc    |      15 |
+--------+---------+
| object |      10 |
+--------+---------+
| pers   |     577 |
+--------+---------+
| scope  |     639 |
+--------+---------+
| work   |     378 |
+--------+---------+
fine_lit
+----------------+---------+
|                |   count |
| date           |       2 |
+----------------+---------+
| loc            |      15 |
+----------------+---------+
| object.manuscr |      10 |
+----------------+---------+
| pers.author    |     294 |
+----------------+---------+
| pers.editor    |      57 |
+----------------+---------+
| pers.myth      |      83 |
+----------------+---------+
| pers.other     |     143 |
+----------------+-------

#### Test

In [66]:
# Parse the ajmc French test TSV into a list of `HipeDocument` objects
ajmc_test_fr_docs = parse_tsv(file_path=ajmc_test_fr_path)

In [67]:
# Print summary statistics for the ajmc French test split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=ajmc_test_fr_docs))


Path of the TSV file: ./data/v2.1/ajmc/fr/HIPE-2022-v2.1-ajmc-test-fr.tsv 
Number of documents: 15 
Number of entities: {'coarse_lit': 360, 'fine_lit': 360} 
Number of tokens: 5390 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       3 |
+-------+---------+
| loc   |       9 |
+-------+---------+
| pers  |     139 |
+-------+---------+
| scope |     129 |
+-------+---------+
| work  |      80 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       3 |
+--------------+---------+
| loc          |       9 |
+--------------+---------+
| pers.author  |      78 |
+--------------+---------+
| pers.editor  |      35 |
+--------------+---------+
| pers.myth    |      14 |
+--------------+---------+
| pers.other   |      12 |
+--------------+---------+
| scope        |     129 |
+--------------+---------+
| work.fragm   |       1 |
+--------------+---------+
| work.primlit |      78 |
+--------------+--

## hipe2020

See the [README file](./documentation/README-hipe2020.md) for detailed information about this dataset.

### File paths

In [68]:
# Paths to the hipe2020 TSV files for each language, all located under the
# release data directory.

# EN (English) -- only dev and test paths are defined for English in this cell
hipe2020_dev_en_path = os.path.join(HIPE2022_data_path, f"hipe2020/en/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-en.tsv")
hipe2020_test_en_path = os.path.join(HIPE2022_data_path, f"hipe2020/en/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-en.tsv")

# DE (German)
hipe2020_train_de_path = os.path.join(HIPE2022_data_path, f"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-de.tsv")
hipe2020_dev_de_path = os.path.join(HIPE2022_data_path, f"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-de.tsv")
hipe2020_test_de_path = os.path.join(HIPE2022_data_path, f"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-de.tsv")


# FR (French)
hipe2020_dev_fr_path = os.path.join(HIPE2022_data_path, f"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-fr.tsv")
hipe2020_train_fr_path = os.path.join(HIPE2022_data_path, f"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-fr.tsv")
hipe2020_test_fr_path = os.path.join(HIPE2022_data_path, f"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-fr.tsv")

### hipe2020 EN

#### Dev


In [69]:
# Parse the hipe2020 English dev TSV into a list of `HipeDocument` objects
hipe2020_dev_en_docs = parse_tsv(file_path=hipe2020_dev_en_path)

In [70]:
# Print summary statistics for the hipe2020 English dev split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=hipe2020_dev_en_docs))


Path of the TSV file: ./data/v2.1/hipe2020/en/HIPE-2022-v2.1-hipe2020-dev-en.tsv 
Number of documents: 80 
Number of entities: {'coarse_lit': 966, 'coarse_meto': 15} 
Number of tokens: 29063 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     384 |
+------+---------+
| org  |     118 |
+------+---------+
| pers |     402 |
+------+---------+
| prod |      33 |
+------+---------+
| time |      29 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       6 |
+-----+---------+
| org |       9 |
+-----+---------+



#### Test


In [71]:
# Parse the hipe2020 English test TSV into a list of `HipeDocument` objects
hipe2020_test_en_docs = parse_tsv(file_path=hipe2020_test_en_path)

In [72]:
# Print summary statistics for the hipe2020 English test split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=hipe2020_test_en_docs))


Path of the TSV file: ./data/v2.1/hipe2020/en/HIPE-2022-v2.1-hipe2020-test-en.tsv 
Number of documents: 46 
Number of entities: {'coarse_lit': 449, 'coarse_meto': 25} 
Number of tokens: 16634 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     181 |
+------+---------+
| org  |      76 |
+------+---------+
| pers |     156 |
+------+---------+
| prod |      19 |
+------+---------+
| time |      17 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       3 |
+-----+---------+
| org |      22 |
+-----+---------+



### hipe2020 DE

#### Dev

In [73]:
# Parse the hipe2020 German dev TSV into a list of `HipeDocument` objects
hipe2020_dev_de_docs = parse_tsv(file_path=hipe2020_dev_de_path)

In [74]:
# Print summary statistics for the hipe2020 German dev split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=hipe2020_dev_de_docs))


Path of the TSV file: ./data/v2.1/hipe2020/de/HIPE-2022-v2.1-hipe2020-dev-de.tsv 
Number of documents: 33 
Number of entities: {'coarse_lit': 1242, 'coarse_meto': 91, 'fine_lit': 1242, 'fine_meto': 91, 'fine_comp': 468, 'nested': 67} 
Number of tokens: 32671 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     588 |
+------+---------+
| org  |     164 |
+------+---------+
| pers |     372 |
+------+---------+
| prod |      49 |
+------+---------+
| time |      69 |
+------+---------+
coarse_meto
+------+---------+
|      |   count |
| loc  |      12 |
+------+---------+
| org  |      78 |
+------+---------+
| prod |       1 |
+------+---------+
fine_lit
+---------------------+---------+
|                     |   count |
| loc.adm.nat         |     135 |
+---------------------+---------+
| loc.adm.reg         |      53 |
+---------------------+---------+
| loc.adm.sup         |       8 |
+---------------------+---------+
| loc.adm.town        |     3

#### Train

In [75]:
# Parse the hipe2020 German train TSV into a list of `HipeDocument` objects
hipe2020_train_de_docs = parse_tsv(file_path=hipe2020_train_de_path)

In [76]:
# Print summary statistics for the hipe2020 German train split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=hipe2020_train_de_docs))


Path of the TSV file: ./data/v2.1/hipe2020/de/HIPE-2022-v2.1-hipe2020-train-de.tsv 
Number of documents: 103 
Number of entities: {'coarse_lit': 3494, 'coarse_meto': 325, 'fine_lit': 3494, 'fine_meto': 325, 'fine_comp': 1436, 'nested': 158} 
Number of tokens: 86445 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |    1740 |
+------+---------+
| org  |     358 |
+------+---------+
| pers |    1166 |
+------+---------+
| prod |     112 |
+------+---------+
| time |     118 |
+------+---------+
coarse_meto
+------+---------+
|      |   count |
| loc  |      17 |
+------+---------+
| org  |     306 |
+------+---------+
| pers |       2 |
+------+---------+
fine_lit
+---------------------+---------+
|                     |   count |
| loc.add.phys        |       2 |
+---------------------+---------+
| loc.adm.nat         |     563 |
+---------------------+---------+
| loc.adm.reg         |     199 |
+---------------------+---------+
| loc.adm.sup         

#### Test


In [77]:
# Parse the hipe2020 German test TSV into a list of `HipeDocument` objects
hipe2020_test_de_docs = parse_tsv(file_path=hipe2020_test_de_path)

In [78]:
# Print summary statistics for the hipe2020 German test split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=hipe2020_test_de_docs))


Path of the TSV file: ./data/v2.1/hipe2020/de/HIPE-2022-v2.1-hipe2020-test-de.tsv 
Number of documents: 49 
Number of entities: {'coarse_lit': 1147, 'coarse_meto': 118, 'fine_lit': 1147, 'fine_meto': 118, 'fine_comp': 431, 'nested': 73} 
Number of tokens: 30737 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     595 |
+------+---------+
| org  |     130 |
+------+---------+
| pers |     311 |
+------+---------+
| prod |      62 |
+------+---------+
| time |      49 |
+------+---------+
coarse_meto
+------+---------+
|      |   count |
| loc  |       1 |
+------+---------+
| org  |     116 |
+------+---------+
| pers |       1 |
+------+---------+
fine_lit
+------------------------+---------+
|                        |   count |
| loc.adm.nat            |     160 |
+------------------------+---------+
| loc.adm.reg            |      84 |
+------------------------+---------+
| loc.adm.sup            |      21 |
+------------------------+---------+
| 

### hipe2020 FR

#### Dev

In [79]:
# Parse the hipe2020 French dev TSV into a list of `HipeDocument` objects
hipe2020_dev_fr_docs = parse_tsv(file_path=hipe2020_dev_fr_path)

In [80]:
# Print summary statistics for the hipe2020 French dev split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=hipe2020_dev_fr_docs))


Path of the TSV file: ./data/v2.1/hipe2020/fr/HIPE-2022-v2.1-hipe2020-dev-fr.tsv 
Number of documents: 43 
Number of entities: {'coarse_lit': 1729, 'coarse_meto': 108, 'fine_lit': 1729, 'fine_meto': 108, 'fine_comp': 724, 'nested': 91} 
Number of tokens: 37953 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     774 |
+------+---------+
| org  |     159 |
+------+---------+
| pers |     679 |
+------+---------+
| prod |      49 |
+------+---------+
| time |      68 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       3 |
+-----+---------+
| org |     105 |
+-----+---------+
fine_lit
+------------------------+---------+
|                        |   count |
| loc.add.phys           |       1 |
+------------------------+---------+
| loc.adm.nat            |     258 |
+------------------------+---------+
| loc.adm.reg            |      73 |
+------------------------+---------+
| loc.adm.sup            |      27 |
+---------

#### Train

In [81]:
# Parse the hipe2020 French train TSV into a list of `HipeDocument` objects
hipe2020_train_fr_docs = parse_tsv(file_path=hipe2020_train_fr_path)

In [82]:
# Print summary statistics for the hipe2020 French train split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=hipe2020_train_fr_docs))


Path of the TSV file: ./data/v2.1/hipe2020/fr/HIPE-2022-v2.1-hipe2020-train-fr.tsv 
Number of documents: 158 
Number of entities: {'coarse_lit': 6926, 'coarse_meto': 450, 'fine_lit': 6926, 'fine_meto': 450, 'fine_comp': 3050, 'nested': 473} 
Number of tokens: 166220 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |    3089 |
+------+---------+
| org  |     836 |
+------+---------+
| pers |    2525 |
+------+---------+
| prod |     200 |
+------+---------+
| time |     276 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       7 |
+-----+---------+
| org |     443 |
+-----+---------+
fine_lit
+------------------------+---------+
|                        |   count |
| loc.add.elec           |       1 |
+------------------------+---------+
| loc.add.phys           |       3 |
+------------------------+---------+
| loc.adm.nat            |     648 |
+------------------------+---------+
| loc.adm.reg            |     376 |
+---

#### Test


In [83]:
# Parse the hipe2020 French test TSV into a list of `HipeDocument` objects
hipe2020_test_fr_docs = parse_tsv(file_path=hipe2020_test_fr_path)

In [84]:
# Print summary statistics for the hipe2020 French test split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=hipe2020_test_fr_docs))


Path of the TSV file: ./data/v2.1/hipe2020/fr/HIPE-2022-v2.1-hipe2020-test-fr.tsv 
Number of documents: 43 
Number of entities: {'coarse_lit': 1600, 'coarse_meto': 112, 'fine_lit': 1600, 'fine_meto': 112, 'fine_comp': 709, 'nested': 82} 
Number of tokens: 40854 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     854 |
+------+---------+
| org  |     130 |
+------+---------+
| pers |     502 |
+------+---------+
| prod |      61 |
+------+---------+
| time |      53 |
+------+---------+
coarse_meto
+------+---------+
|      |   count |
| org  |     111 |
+------+---------+
| time |       1 |
+------+---------+
fine_lit
+------------------------+---------+
|                        |   count |
| loc.adm.nat            |     151 |
+------------------------+---------+
| loc.adm.reg            |     147 |
+------------------------+---------+
| loc.adm.sup            |      19 |
+------------------------+---------+
| loc.adm.town           |     446 |
+--

## letemps

See the [README file](./documentation/README-letemps.md) for detailed information about this dataset.

### File paths

In [85]:
# Paths to the letemps TSV files (dev/train/test splits); only French paths are
# defined for this dataset in this cell.
letemps_dev_fr_path = os.path.join(HIPE2022_data_path, f"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-dev-fr.tsv")
letemps_train_fr_path = os.path.join(HIPE2022_data_path, f"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-train-fr.tsv")
letemps_test_fr_path = os.path.join(HIPE2022_data_path, f"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-test-fr.tsv")

#### Dev

In [86]:
# Parse the letemps French dev TSV into a list of `HipeDocument` objects
letemps_dev_fr_docs = parse_tsv(file_path=letemps_dev_fr_path)

In [87]:
# Print summary statistics for the letemps French dev split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=letemps_dev_fr_docs))


Path of the TSV file: ./data/v2.1/letemps/fr/HIPE-2022-v2.1-letemps-dev-fr.tsv 
Number of documents: 51 
Number of entities: {'coarse_lit': 869, 'fine_lit': 869, 'nested': 12} 
Number of tokens: 38649 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     516 |
+------+---------+
| org  |      41 |
+------+---------+
| pers |     312 |
+------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |      16 |
+----------------+---------+
| loc.add        |       2 |
+----------------+---------+
| loc.add.phys   |      21 |
+----------------+---------+
| loc.adm        |      20 |
+----------------+---------+
| loc.adm.nat    |     104 |
+----------------+---------+
| loc.adm.reg    |      45 |
+----------------+---------+
| loc.adm.town   |     270 |
+----------------+---------+
| loc.admin.sup  |       1 |
+----------------+---------+
| loc.phys.geo   |      25 |
+----------------+---------+
| loc.phys.hydro | 

#### Train

In [88]:
# Parse the letemps French train TSV into a list of `HipeDocument` objects
letemps_train_fr_docs = parse_tsv(file_path=letemps_train_fr_path)

In [89]:
# Print summary statistics for the letemps French train split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=letemps_train_fr_docs))


Path of the TSV file: ./data/v2.1/letemps/fr/HIPE-2022-v2.1-letemps-train-fr.tsv 
Number of documents: 414 
Number of entities: {'coarse_lit': 9159, 'fine_lit': 9159, 'nested': 69} 
Number of tokens: 379487 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |    5260 |
+------+---------+
| org  |     472 |
+------+---------+
| pers |    3427 |
+------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |     262 |
+----------------+---------+
| loc.add        |       3 |
+----------------+---------+
| loc.add.phys   |     169 |
+----------------+---------+
| loc.adm        |     111 |
+----------------+---------+
| loc.adm.nat    |    1097 |
+----------------+---------+
| loc.adm.reg    |     477 |
+----------------+---------+
| loc.adm.town   |    2713 |
+----------------+---------+
| loc.admin.sup  |      33 |
+----------------+---------+
| loc.oro        |      13 |
+----------------+---------+
| loc.phys   

#### Test

In [90]:
# Parse the letemps French test TSV into a list of `HipeDocument` objects
letemps_test_fr_docs = parse_tsv(file_path=letemps_test_fr_path)

In [91]:
# Print summary statistics for the letemps French test split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=letemps_test_fr_docs))


Path of the TSV file: ./data/v2.1/letemps/fr/HIPE-2022-v2.1-letemps-test-fr.tsv 
Number of documents: 51 
Number of entities: {'coarse_lit': 1017, 'fine_lit': 1017, 'nested': 12} 
Number of tokens: 48468 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     591 |
+------+---------+
| org  |      79 |
+------+---------+
| pers |     347 |
+------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |      16 |
+----------------+---------+
| loc.add.phys   |       8 |
+----------------+---------+
| loc.adm        |      10 |
+----------------+---------+
| loc.adm.nat    |     155 |
+----------------+---------+
| loc.adm.reg    |      45 |
+----------------+---------+
| loc.adm.town   |     311 |
+----------------+---------+
| loc.admin.sup  |       3 |
+----------------+---------+
| loc.phys       |       6 |
+----------------+---------+
| loc.phys.geo   |      26 |
+----------------+---------+
| loc.phys.hydro

## topRes19th

See the [README file](./documentation/README-topres19th.md) for detailed information about this dataset.

### File paths

In [92]:
# Paths to the topRes19th TSV files; only English paths are defined for this
# dataset in this cell.
topRes19th_dev_en_path = os.path.join(HIPE2022_data_path, f"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-dev-en.tsv")
topRes19th_train_en_path = os.path.join(HIPE2022_data_path, f"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-train-en.tsv")
# Test path is left disabled here; the note in the "Test" section ties it to the publication of the test files.
#topRes19th_test_en_path = os.path.join(HIPE2022_data_path, f"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-test-en.tsv")

#### Dev

In [93]:
# Parse the topRes19th English dev TSV into a list of `HipeDocument` objects
topRes19th_dev_en_docs = parse_tsv(file_path=topRes19th_dev_en_path)

In [94]:
# Print summary statistics for the topRes19th English dev split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=topRes19th_dev_en_docs))


Path of the TSV file: ./data/v2.1/topres19th/en/HIPE-2022-v2.1-topres19th-dev-en.tsv 
Number of documents: 34 
Number of entities: {'coarse_lit': 237} 
Number of tokens: 11917 
Entity breakdown by type: coarse_lit
+----------+---------+
|          |   count |
| BUILDING |      19 |
+----------+---------+
| LOC      |     201 |
+----------+---------+
| STREET   |      17 |
+----------+---------+



#### Train

In [95]:
# Parse the topRes19th English train TSV into a list of `HipeDocument` objects
topRes19th_train_en_docs = parse_tsv(file_path=topRes19th_train_en_path)

In [96]:
# Print summary statistics for the topRes19th English train split: file path, document,
# entity and token counts, plus the per-type entity breakdown
print(describe_dataset(documents=topRes19th_train_en_docs))


Path of the TSV file: ./data/v2.1/topres19th/en/HIPE-2022-v2.1-topres19th-train-en.tsv 
Number of documents: 309 
Number of entities: {'coarse_lit': 3182} 
Number of tokens: 123983 
Entity breakdown by type: coarse_lit
+----------+---------+
|          |   count |
| BUILDING |     359 |
+----------+---------+
| LOC      |    2580 |
+----------+---------+
| STREET   |     243 |
+----------+---------+



#### Test

(The lines below are to be uncommented once the test files for topRes19th are published.)

In [97]:
# parse the TSV into a list of `HipeDocument` objects
#topRes19th_test_en_docs = parse_tsv(file_path=topRes19th_test_en_path)

In [98]:
# print some basic stats for the TSV dataset (disabled until the test documents are loaded)
#print(describe_dataset(documents=topRes19th_test_en_docs))

## newseye

See the [README file](./documentation/README-newseye.md) for detailed information about this dataset.

### File paths

In [99]:
# Paths of the newseye TSV files (dev / train / test) for each language,
# all located under `HIPE2022_data_path` and named after `RELEASE_VERSION`.

# FR
newseye_dev_fr_path = os.path.join(HIPE2022_data_path, f"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-dev-fr.tsv")
newseye_train_fr_path = os.path.join(HIPE2022_data_path, f"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-train-fr.tsv")
newseye_test_fr_path = os.path.join(HIPE2022_data_path, f"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-test-fr.tsv")

# DE
newseye_dev_de_path = os.path.join(HIPE2022_data_path, f"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-dev-de.tsv")
newseye_train_de_path = os.path.join(HIPE2022_data_path, f"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-train-de.tsv")
newseye_test_de_path = os.path.join(HIPE2022_data_path, f"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-test-de.tsv")

# FI
newseye_dev_fi_path = os.path.join(HIPE2022_data_path, f"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-dev-fi.tsv")
newseye_train_fi_path = os.path.join(HIPE2022_data_path, f"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-train-fi.tsv")
newseye_test_fi_path = os.path.join(HIPE2022_data_path, f"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-test-fi.tsv")

# SV
newseye_dev_sv_path = os.path.join(HIPE2022_data_path, f"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-dev-sv.tsv")
newseye_train_sv_path = os.path.join(HIPE2022_data_path, f"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-train-sv.tsv")
newseye_test_sv_path = os.path.join(HIPE2022_data_path, f"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-test-sv.tsv")

### newseye FR

#### Dev

In [100]:
# Load the newseye French dev split: parse its TSV file into a list of `HipeDocument` objects.
newseye_dev_fr_docs = parse_tsv(file_path=newseye_dev_fr_path)

In [101]:
# Print summary statistics for the newseye FR dev split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_dev_fr_docs))


Path of the TSV file: ./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-dev-fr.tsv 
Number of documents: 35 
Number of entities: {'coarse_lit': 752, 'fine_lit': 3, 'nested': 32} 
Number of tokens: 21727 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      11 |
+-----------+---------+
| LOC       |     335 |
+-----------+---------+
| ORG       |     113 |
+-----------+---------+
| PER       |     293 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       3 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       1 |
+-----------+---------+
| LOC       |      18 |
+-----------+---------+
| ORG       |       7 |
+-----------+---------+
| PER       |       6 |
+-----------+---------+



#### Train

In [102]:
# Sanity check: display the resolved path of the newseye FR train file before parsing it.
newseye_train_fr_path

'./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-train-fr.tsv'

In [103]:
# Load the newseye French train split: parse its TSV file into a list of `HipeDocument` objects.
newseye_train_fr_docs = parse_tsv(file_path=newseye_train_fr_path)

In [104]:
# Print summary statistics for the newseye FR train split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
# NOTE(review): the recorded outputs report 35 documents for the FR dev, train and test splits
# alike, despite very different token counts — confirm what `describe_dataset` counts as a document.
print(describe_dataset(documents=newseye_train_fr_docs))


Path of the TSV file: ./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-train-fr.tsv 
Number of documents: 35 
Number of entities: {'coarse_lit': 10423, 'fine_lit': 99, 'nested': 522} 
Number of tokens: 255165 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     200 |
+-----------+---------+
| LOC       |    4055 |
+-----------+---------+
| ORG       |    1285 |
+-----------+---------+
| PER       |    4883 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      99 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       6 |
+-----------+---------+
| LOC       |     263 |
+-----------+---------+
| ORG       |     212 |
+-----------+---------+
| PER       |      41 |
+-----------+---------+



#### Test

In [105]:
# Load the newseye French test split: parse its TSV file into a list of `HipeDocument` objects.
newseye_test_fr_docs = parse_tsv(file_path=newseye_test_fr_path)

In [106]:
# Print summary statistics for the newseye FR test split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_test_fr_docs))


Path of the TSV file: ./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-test-fr.tsv 
Number of documents: 35 
Number of entities: {'coarse_lit': 2530, 'fine_lit': 34, 'nested': 136} 
Number of tokens: 70794 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      33 |
+-----------+---------+
| LOC       |    1112 |
+-----------+---------+
| ORG       |     360 |
+-----------+---------+
| PER       |    1025 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      34 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       5 |
+-----------+---------+
| LOC       |      85 |
+-----------+---------+
| ORG       |      31 |
+-----------+---------+
| PER       |      15 |
+-----------+---------+



### newseye DE

#### Dev

In [107]:
# Load the newseye German dev split: parse its TSV file into a list of `HipeDocument` objects.
newseye_dev_de_docs = parse_tsv(file_path=newseye_dev_de_path)

In [108]:
# Print summary statistics for the newseye DE dev split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_dev_de_docs))


Path of the TSV file: ./data/v2.1/newseye/de/HIPE-2022-v2.1-newseye-dev-de.tsv 
Number of documents: 12 
Number of entities: {'coarse_lit': 539, 'fine_lit': 5, 'nested': 29} 
Number of tokens: 40061 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |       4 |
+-----------+---------+
| LOC       |     263 |
+-----------+---------+
| ORG       |     123 |
+-----------+---------+
| PER       |     149 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       5 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      10 |
+-----+---------+
| ORG |      11 |
+-----+---------+
| PER |       8 |
+-----+---------+



#### Train

In [109]:
# Load the newseye German train split: parse its TSV file into a list of `HipeDocument` objects.
newseye_train_de_docs = parse_tsv(file_path=newseye_train_de_path)

In [110]:
# Print summary statistics for the newseye DE train split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_train_de_docs))


Path of the TSV file: ./data/v2.1/newseye/de/HIPE-2022-v2.1-newseye-train-de.tsv 
Number of documents: 7 
Number of entities: {'coarse_lit': 11381, 'fine_lit': 21, 'nested': 1133} 
Number of tokens: 374332 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      37 |
+-----------+---------+
| LOC       |    5135 |
+-----------+---------+
| ORG       |    3108 |
+-----------+---------+
| PER       |    3101 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      21 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       2 |
+-----------+---------+
| LOC       |     754 |
+-----------+---------+
| ORG       |     136 |
+-----------+---------+
| PER       |     241 |
+-----------+---------+



#### Test

In [111]:
# Load the newseye German test split: parse its TSV file into a list of `HipeDocument` objects.
newseye_test_de_docs = parse_tsv(file_path=newseye_test_de_path)

In [112]:
# Print summary statistics for the newseye DE test split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_test_de_docs))


Path of the TSV file: ./data/v2.1/newseye/de/HIPE-2022-v2.1-newseye-test-de.tsv 
Number of documents: 13 
Number of entities: {'coarse_lit': 2401, 'fine_lit': 13, 'nested': 100} 
Number of tokens: 99775 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      15 |
+-----------+---------+
| LOC       |    1222 |
+-----------+---------+
| ORG       |     353 |
+-----------+---------+
| PER       |     811 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      13 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       2 |
+-----------+---------+
| LOC       |      30 |
+-----------+---------+
| ORG       |      29 |
+-----------+---------+
| PER       |      39 |
+-----------+---------+



### newseye FI

#### Dev

In [113]:
# Load the newseye Finnish dev split: parse its TSV file into a list of `HipeDocument` objects.
newseye_dev_fi_docs = parse_tsv(file_path=newseye_dev_fi_path)

In [114]:
# Print summary statistics for the newseye FI dev split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_dev_fi_docs))


Path of the TSV file: ./data/v2.1/newseye/fi/HIPE-2022-v2.1-newseye-dev-fi.tsv 
Number of documents: 24 
Number of entities: {'coarse_lit': 223, 'fine_lit': 1, 'nested': 26} 
Number of tokens: 6350 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      12 |
+-----------+---------+
| LOC       |      97 |
+-----------+---------+
| ORG       |      37 |
+-----------+---------+
| PER       |      77 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       1 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      22 |
+-----+---------+
| ORG |       2 |
+-----+---------+
| PER |       2 |
+-----+---------+



#### Train

In [115]:
# Load the newseye Finnish train split: parse its TSV file into a list of `HipeDocument` objects.
newseye_train_fi_docs = parse_tsv(file_path=newseye_train_fi_path)

In [116]:
# Print summary statistics for the newseye FI train split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_train_fi_docs))


Path of the TSV file: ./data/v2.1/newseye/fi/HIPE-2022-v2.1-newseye-train-fi.tsv 
Number of documents: 24 
Number of entities: {'coarse_lit': 2146, 'fine_lit': 15, 'nested': 225} 
Number of tokens: 48222 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     126 |
+-----------+---------+
| LOC       |     979 |
+-----------+---------+
| ORG       |     259 |
+-----------+---------+
| PER       |     782 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      15 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       2 |
+-----------+---------+
| LOC       |     171 |
+-----------+---------+
| ORG       |      35 |
+-----------+---------+
| PER       |      17 |
+-----------+---------+



#### Test

In [117]:
# Load the newseye Finnish test split: parse its TSV file into a list of `HipeDocument` objects.
newseye_test_fi_docs = parse_tsv(file_path=newseye_test_fi_path)

In [118]:
# Print summary statistics for the newseye FI test split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_test_fi_docs))


Path of the TSV file: ./data/v2.1/newseye/fi/HIPE-2022-v2.1-newseye-test-fi.tsv 
Number of documents: 24 
Number of entities: {'coarse_lit': 691, 'fine_lit': 7, 'nested': 42} 
Number of tokens: 14963 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      22 |
+-----------+---------+
| LOC       |     262 |
+-----------+---------+
| ORG       |      54 |
+-----------+---------+
| PER       |     353 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       7 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      29 |
+-----+---------+
| ORG |      10 |
+-----+---------+
| PER |       3 |
+-----+---------+



### newseye SV

#### Dev

In [119]:
# Load the newseye Swedish dev split: parse its TSV file into a list of `HipeDocument` objects.
newseye_dev_sv_docs = parse_tsv(file_path=newseye_dev_sv_path)

In [120]:
# Print summary statistics for the newseye SV dev split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_dev_sv_docs))


Path of the TSV file: ./data/v2.1/newseye/sv/HIPE-2022-v2.1-newseye-dev-sv.tsv 
Number of documents: 21 
Number of entities: {'coarse_lit': 266, 'fine_lit': 1, 'nested': 9} 
Number of tokens: 6906 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      17 |
+-----------+---------+
| LOC       |     148 |
+-----------+---------+
| ORG       |      17 |
+-----------+---------+
| PER       |      84 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       1 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |       7 |
+-----+---------+
| ORG |       1 |
+-----+---------+
| PER |       1 |
+-----+---------+



#### Train

In [121]:
# Load the newseye Swedish train split: parse its TSV file into a list of `HipeDocument` objects.
newseye_train_sv_docs = parse_tsv(file_path=newseye_train_sv_path)

In [122]:
# Print summary statistics for the newseye SV train split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_train_sv_docs))


Path of the TSV file: ./data/v2.1/newseye/sv/HIPE-2022-v2.1-newseye-train-sv.tsv 
Number of documents: 21 
Number of entities: {'coarse_lit': 2141, 'fine_lit': 16, 'nested': 126} 
Number of tokens: 56306 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     164 |
+-----------+---------+
| LOC       |     985 |
+-----------+---------+
| ORG       |     153 |
+-----------+---------+
| PER       |     839 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      16 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       1 |
+-----------+---------+
| LOC       |     100 |
+-----------+---------+
| ORG       |      15 |
+-----------+---------+
| PER       |      10 |
+-----------+---------+



#### Test

In [123]:
# Load the newseye Swedish test split: parse its TSV file into a list of `HipeDocument` objects.
newseye_test_sv_docs = parse_tsv(file_path=newseye_test_sv_path)

In [124]:
# Print summary statistics for the newseye SV test split: source path, document/entity/token
# counts and the entity breakdown per annotation layer.
print(describe_dataset(documents=newseye_test_sv_docs))


Path of the TSV file: ./data/v2.1/newseye/sv/HIPE-2022-v2.1-newseye-test-sv.tsv 
Number of documents: 21 
Number of entities: {'coarse_lit': 604, 'nested': 26} 
Number of tokens: 16162 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      22 |
+-----------+---------+
| LOC       |     313 |
+-----------+---------+
| ORG       |      60 |
+-----------+---------+
| PER       |     209 |
+-----------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      20 |
+-----+---------+
| ORG |       2 |
+-----+---------+
| PER |       4 |
+-----+---------+



## sonar

See the [README file](./documentation/README-sonar.md) for detailed information about this dataset.

### File paths

In [125]:
# Paths of the sonar German (DE) TSV files; only the dev and test splits are defined in this cell.
sonar_dev_de_path = os.path.join(HIPE2022_data_path, f"sonar/de/HIPE-2022-{RELEASE_VERSION}-sonar-dev-de.tsv")
sonar_test_de_path = os.path.join(HIPE2022_data_path, f"sonar/de/HIPE-2022-{RELEASE_VERSION}-sonar-test-de.tsv")

#### Dev

In [126]:
# Load the sonar German dev split: parse its TSV file into a list of `HipeDocument` objects.
sonar_dev_de_docs = parse_tsv(file_path=sonar_dev_de_path)

In [127]:
# Print summary statistics for the sonar DE dev split: source path, document/entity/token
# counts and the per-type entity breakdown.
print(describe_dataset(documents=sonar_dev_de_docs))


Path of the TSV file: ./data/v2.1/sonar/de/HIPE-2022-v2.1-sonar-dev-de.tsv 
Number of documents: 10 
Number of entities: {'coarse_lit': 654} 
Number of tokens: 17476 
Entity breakdown by type: coarse_lit
+-----+---------+
|     |   count |
| LOC |     300 |
+-----+---------+
| ORG |     138 |
+-----+---------+
| PER |     216 |
+-----+---------+



#### Test

In [128]:
# Load the sonar German test split: parse its TSV file into a list of `HipeDocument` objects.
sonar_test_de_docs = parse_tsv(file_path=sonar_test_de_path)

In [129]:
# Print summary statistics for the sonar DE test split: source path, document/entity/token
# counts and the per-type entity breakdown.
print(describe_dataset(documents=sonar_test_de_docs))


Path of the TSV file: ./data/v2.1/sonar/de/HIPE-2022-v2.1-sonar-test-de.tsv 
Number of documents: 10 
Number of entities: {'coarse_lit': 471} 
Number of tokens: 15463 
Entity breakdown by type: coarse_lit
+-----+---------+
|     |   count |
| LOC |     177 |
+-----+---------+
| ORG |     111 |
+-----+---------+
| PER |     183 |
+-----+---------+

