# HIPE 2022 dataset statistics

# Imports

In [1]:
import os
import random
from typing import List, Dict
from hipe_commons.helpers.tsv import parse_tsv, ENTITY_TYPES, HipeDocument, HipeEntity
from hipe_commons.stats import describe_dataset

# Functions

In [2]:
def collect_entities(documents: List[HipeDocument]) -> Dict[str, List[HipeEntity]]:
    """Gather all entities from the documents of a dataset, grouped by entity type.

    Only the types listed in `ENTITY_TYPES` are considered, and a type appears in
    the result only if at least one document has it as a key of its `entities`
    mapping.

    :param documents: Input documents in HIPE format
    :type documents: List[HipeDocument]
    :return: A dictionary mapping each entity type to the `HipeEntity` objects of
        that type, concatenated in document order
    :rtype: Dict[str, List[HipeEntity]]
    """
    all_entities: Dict[str, List[HipeEntity]] = {}

    # Iterate over the `documents` argument rather than a notebook-level variable,
    # so the function works for any dataset and on a freshly started kernel.
    for doc in documents:
        for e_type in ENTITY_TYPES:
            if e_type in doc.entities:
                # `setdefault` creates the per-type list the first time a type is
                # seen; `extend` appends to that list without touching the
                # document's own list.
                all_entities.setdefault(e_type, []).extend(doc.entities[e_type])

    return all_entities

# HIPE 2022 Datasets

In [30]:
# Root directory of the HIPE 2022 v1.0 data; every dataset path in this notebook is joined onto it.
HIPE2022_data_path = "./data/v1.0/"

## ajmc

See the [README file](./documentation/README-ajmc.md) for detailed information about this dataset.

### File paths

In [18]:
# TSV file of the ajmc English (en) sample, under the data root.
ajmc_sample_en_path = os.path.join(HIPE2022_data_path, "ajmc/en/HIPE-2022-v1.0-ajmc-sample-en.tsv")

In [19]:
# TSV file of the ajmc German (de) sample, under the data root.
ajmc_sample_de_path = os.path.join(HIPE2022_data_path, "ajmc/de/HIPE-2022-v1.0-ajmc-sample-de.tsv")

### EN sample

In [25]:
# Parse the ajmc English sample TSV into a list of `HipeDocument` objects.
ajmc_sample_en_docs = parse_tsv(file_path=ajmc_sample_en_path)

In [27]:
# Print summary statistics for the ajmc English sample
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=ajmc_sample_en_docs))


Path of the TSV file: ./data/v1.0/ajmc/en/HIPE-2022-v1.0-ajmc-sample-en.tsv 
Number of documents: 5 
Number of entities: {'coarse_lit': 153, 'fine_lit': 153} 
Number of tokens: 2187 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       1 |
+-------+---------+
| loc   |       5 |
+-------+---------+
| pers  |      65 |
+-------+---------+
| scope |      51 |
+-------+---------+
| work  |      31 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       1 |
+--------------+---------+
| loc          |       5 |
+--------------+---------+
| pers.author  |      28 |
+--------------+---------+
| pers.editor  |       6 |
+--------------+---------+
| pers.myth    |      30 |
+--------------+---------+
| pers.other   |       1 |
+--------------+---------+
| scope        |      51 |
+--------------+---------+
| work.primlit |      30 |
+--------------+---------+
| work.seclit  |       1 |
+--------------+-

### DE sample

In [28]:
# Parse the ajmc German sample TSV into a list of `HipeDocument` objects.
ajmc_sample_de_docs = parse_tsv(file_path=ajmc_sample_de_path)

In [29]:
# Print summary statistics for the ajmc German sample
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=ajmc_sample_de_docs))


Path of the TSV file: ./data/v1.0/ajmc/de/HIPE-2022-v1.0-ajmc-sample-de.tsv 
Number of documents: 8 
Number of entities: {'coarse_lit': 202, 'fine_lit': 202, 'nested': 7} 
Number of tokens: 2584 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       2 |
+-------+---------+
| loc   |       3 |
+-------+---------+
| pers  |      92 |
+-------+---------+
| scope |      79 |
+-------+---------+
| work  |      26 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       2 |
+--------------+---------+
| loc          |       3 |
+--------------+---------+
| pers.author  |      36 |
+--------------+---------+
| pers.myth    |      54 |
+--------------+---------+
| pers.other   |       2 |
+--------------+---------+
| scope        |      79 |
+--------------+---------+
| work.primlit |      25 |
+--------------+---------+
| work.seclit  |       1 |
+--------------+---------+
nested
+-------+---------+
|   

## hipe2020

See the [README file](./documentation/README-hipe2020.md) for detailed information about this dataset.

### File paths

In [48]:
# TSV file of the hipe2020 English (en) dev split, under the data root.
hipe2020_dev_en_path = os.path.join(HIPE2022_data_path, "hipe2020/en/HIPE-2022-v1.0-hipe2020-dev-en.tsv")

In [42]:
# TSV file of the hipe2020 German (de) train split, under the data root.
hipe2020_train_de_path = os.path.join(HIPE2022_data_path, "hipe2020/de/HIPE-2022-v1.0-hipe2020-train-de.tsv")

In [43]:
# TSV file of the hipe2020 German (de) dev split, under the data root.
hipe2020_dev_de_path = os.path.join(HIPE2022_data_path, "hipe2020/de/HIPE-2022-v1.0-hipe2020-dev-de.tsv")

In [44]:
# TSV file of the hipe2020 French (fr) dev split, under the data root.
hipe2020_dev_fr_path = os.path.join(HIPE2022_data_path, "hipe2020/fr/HIPE-2022-v1.0-hipe2020-dev-fr.tsv")

In [45]:
# TSV file of the hipe2020 French (fr) train split, under the data root.
hipe2020_train_fr_path = os.path.join(HIPE2022_data_path, "hipe2020/fr/HIPE-2022-v1.0-hipe2020-train-fr.tsv")

### hipe2020 EN

#### Dev

In [49]:
# Parse the hipe2020 English dev TSV into a list of `HipeDocument` objects.
hipe2020_dev_en_docs = parse_tsv(file_path=hipe2020_dev_en_path)

In [51]:
# Print summary statistics for the hipe2020 English dev split
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=hipe2020_dev_en_docs))


Path of the TSV file: ./data/v1.0/hipe2020/en/HIPE-2022-v1.0-hipe2020-dev-en.tsv 
Number of documents: 80 
Number of entities: {'coarse_lit': 966, 'coarse_meto': 15} 
Number of tokens: 29063 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     384 |
+------+---------+
| org  |     118 |
+------+---------+
| pers |     402 |
+------+---------+
| prod |      33 |
+------+---------+
| time |      29 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       6 |
+-----+---------+
| org |       9 |
+-----+---------+



### hipe2020 DE

#### Dev

In [52]:
# Parse the hipe2020 German dev TSV into a list of `HipeDocument` objects.
hipe2020_dev_de_docs = parse_tsv(file_path=hipe2020_dev_de_path)

In [53]:
# Print summary statistics for the hipe2020 German dev split
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=hipe2020_dev_de_docs))


Path of the TSV file: ./data/v1.0/hipe2020/de/HIPE-2022-v1.0-hipe2020-dev-de.tsv 
Number of documents: 33 
Number of entities: {'coarse_lit': 1242, 'coarse_meto': 91, 'fine_lit': 1242, 'fine_meto': 91, 'fine_comp': 468, 'nested': 67} 
Number of tokens: 32671 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     588 |
+------+---------+
| org  |     164 |
+------+---------+
| pers |     372 |
+------+---------+
| prod |      49 |
+------+---------+
| time |      69 |
+------+---------+
coarse_meto
+------+---------+
|      |   count |
| loc  |      12 |
+------+---------+
| org  |      78 |
+------+---------+
| prod |       1 |
+------+---------+
fine_lit
+---------------------+---------+
|                     |   count |
| loc.adm.nat         |     135 |
+---------------------+---------+
| loc.adm.reg         |      53 |
+---------------------+---------+
| loc.adm.sup         |       8 |
+---------------------+---------+
| loc.adm.town        |     3

#### Train

In [54]:
# Parse the hipe2020 German train TSV into a list of `HipeDocument` objects.
hipe2020_train_de_docs = parse_tsv(file_path=hipe2020_train_de_path)

In [55]:
# Print summary statistics for the hipe2020 German train split
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=hipe2020_train_de_docs))


Path of the TSV file: ./data/v1.0/hipe2020/de/HIPE-2022-v1.0-hipe2020-train-de.tsv 
Number of documents: 103 
Number of entities: {'coarse_lit': 3494, 'coarse_meto': 325, 'fine_lit': 3494, 'fine_meto': 325, 'fine_comp': 1436, 'nested': 158} 
Number of tokens: 86445 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |    1740 |
+------+---------+
| org  |     358 |
+------+---------+
| pers |    1166 |
+------+---------+
| prod |     112 |
+------+---------+
| time |     118 |
+------+---------+
coarse_meto
+------+---------+
|      |   count |
| loc  |      17 |
+------+---------+
| org  |     306 |
+------+---------+
| pers |       2 |
+------+---------+
fine_lit
+---------------------+---------+
|                     |   count |
| loc.add.phys        |       2 |
+---------------------+---------+
| loc.adm.nat         |     563 |
+---------------------+---------+
| loc.adm.reg         |     199 |
+---------------------+---------+
| loc.adm.sup         

### hipe2020 FR

#### Dev

In [56]:
# Parse the hipe2020 French dev TSV into a list of `HipeDocument` objects.
hipe2020_dev_fr_docs = parse_tsv(file_path=hipe2020_dev_fr_path)

In [57]:
# Print summary statistics for the hipe2020 French dev split
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=hipe2020_dev_fr_docs))


Path of the TSV file: ./data/v1.0/hipe2020/fr/HIPE-2022-v1.0-hipe2020-dev-fr.tsv 
Number of documents: 43 
Number of entities: {'coarse_lit': 1729, 'coarse_meto': 108, 'fine_lit': 1729, 'fine_meto': 108, 'fine_comp': 724, 'nested': 91} 
Number of tokens: 37953 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     774 |
+------+---------+
| org  |     159 |
+------+---------+
| pers |     679 |
+------+---------+
| prod |      49 |
+------+---------+
| time |      68 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       3 |
+-----+---------+
| org |     105 |
+-----+---------+
fine_lit
+------------------------+---------+
|                        |   count |
| loc.add.phys           |       1 |
+------------------------+---------+
| loc.adm.nat            |     258 |
+------------------------+---------+
| loc.adm.reg            |      73 |
+------------------------+---------+
| loc.adm.sup            |      27 |
+---------

#### Train

In [58]:
# Parse the hipe2020 French train TSV into a list of `HipeDocument` objects.
hipe2020_train_fr_docs = parse_tsv(file_path=hipe2020_train_fr_path)

In [59]:
# Print summary statistics for the hipe2020 French train split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output lists a `comp` / `comp.name` type with a single
# occurrence, absent from the other hipe2020 splits shown here - possibly
# annotation noise; verify against the dataset README.
print(describe_dataset(documents=hipe2020_train_fr_docs))


Path of the TSV file: ./data/v1.0/hipe2020/fr/HIPE-2022-v1.0-hipe2020-train-fr.tsv 
Number of documents: 158 
Number of entities: {'coarse_lit': 6926, 'coarse_meto': 451, 'fine_lit': 6926, 'fine_meto': 451, 'fine_comp': 3051, 'nested': 473} 
Number of tokens: 166220 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| comp |       1 |
+------+---------+
| loc  |    3088 |
+------+---------+
| org  |     836 |
+------+---------+
| pers |    2525 |
+------+---------+
| prod |     200 |
+------+---------+
| time |     276 |
+------+---------+
coarse_meto
+-----+---------+
|     |   count |
| loc |       7 |
+-----+---------+
| org |     444 |
+-----+---------+
fine_lit
+------------------------+---------+
|                        |   count |
| comp.name              |       1 |
+------------------------+---------+
| loc.add.elec           |       1 |
+------------------------+---------+
| loc.add.phys           |       3 |
+------------------------+---------+
| l

## letemps

See the [README file](./documentation/README-letemps.md) for detailed information about this dataset.

### File paths

In [60]:
# TSV file of the letemps French (fr) dev split, under the data root.
letemps_dev_fr_path = os.path.join(HIPE2022_data_path, "letemps/fr/HIPE-2022-v1.0-letemps-dev-fr.tsv")

In [61]:
# TSV file of the letemps French (fr) train split, under the data root.
letemps_train_fr_path = os.path.join(HIPE2022_data_path, "letemps/fr/HIPE-2022-v1.0-letemps-train-fr.tsv")

#### Dev

In [62]:
# Parse the letemps French dev TSV into a list of `HipeDocument` objects.
letemps_dev_fr_docs = parse_tsv(file_path=letemps_dev_fr_path)

In [63]:
# Print summary statistics for the letemps French dev split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the fine_lit breakdown lists `loc.admin.sup` next to the
# `loc.adm.*` tags - possibly a label inconsistency in the data; verify.
print(describe_dataset(documents=letemps_dev_fr_docs))


Path of the TSV file: ./data/v1.0/letemps/fr/HIPE-2022-v1.0-letemps-dev-fr.tsv 
Number of documents: 51 
Number of entities: {'coarse_lit': 869, 'fine_lit': 869, 'nested': 12} 
Number of tokens: 38649 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |     516 |
+------+---------+
| org  |      41 |
+------+---------+
| pers |     312 |
+------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |      16 |
+----------------+---------+
| loc.add        |       2 |
+----------------+---------+
| loc.add.phys   |      21 |
+----------------+---------+
| loc.adm        |      20 |
+----------------+---------+
| loc.adm.nat    |     104 |
+----------------+---------+
| loc.adm.reg    |      45 |
+----------------+---------+
| loc.adm.town   |     270 |
+----------------+---------+
| loc.admin.sup  |       1 |
+----------------+---------+
| loc.phys.geo   |      25 |
+----------------+---------+
| loc.phys.hydro | 

#### Train

In [64]:
# Parse the letemps French train TSV into a list of `HipeDocument` objects.
letemps_train_fr_docs = parse_tsv(file_path=letemps_train_fr_path)

In [65]:
# Print summary statistics for the letemps French train split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the fine_lit breakdown lists `loc.admin.sup` next to the
# `loc.adm.*` tags - possibly a label inconsistency in the data; verify.
print(describe_dataset(documents=letemps_train_fr_docs))


Path of the TSV file: ./data/v1.0/letemps/fr/HIPE-2022-v1.0-letemps-train-fr.tsv 
Number of documents: 414 
Number of entities: {'coarse_lit': 9159, 'fine_lit': 9159, 'nested': 69} 
Number of tokens: 379487 
Entity breakdown by type: coarse_lit
+------+---------+
|      |   count |
| loc  |    5260 |
+------+---------+
| org  |     472 |
+------+---------+
| pers |    3427 |
+------+---------+
fine_lit
+----------------+---------+
|                |   count |
| loc            |     262 |
+----------------+---------+
| loc.add        |       3 |
+----------------+---------+
| loc.add.phys   |     169 |
+----------------+---------+
| loc.adm        |     111 |
+----------------+---------+
| loc.adm.nat    |    1097 |
+----------------+---------+
| loc.adm.reg    |     477 |
+----------------+---------+
| loc.adm.town   |    2713 |
+----------------+---------+
| loc.admin.sup  |      33 |
+----------------+---------+
| loc.oro        |      13 |
+----------------+---------+
| loc.phys   

## topRes19th

See the [README file](./documentation/README-topres19th.md) for detailed information about this dataset.

### File paths

In [66]:
# TSV file of the topRes19th English (en) dev split, under the data root.
topRes19th_dev_en_path = os.path.join(HIPE2022_data_path, "topRes19th/en/HIPE-2022-v1.0-topRes19th-dev-en.tsv")

In [67]:
# TSV file of the topRes19th English (en) train split, under the data root.
topRes19th_train_en_path = os.path.join(HIPE2022_data_path, "topRes19th/en/HIPE-2022-v1.0-topRes19th-train-en.tsv")

#### Dev

In [68]:
# Parse the topRes19th English dev TSV into a list of `HipeDocument` objects.
topRes19th_dev_en_docs = parse_tsv(file_path=topRes19th_dev_en_path)

In [69]:
# Print summary statistics for the topRes19th English dev split
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=topRes19th_dev_en_docs))


Path of the TSV file: ./data/v1.0/topRes19th/en/HIPE-2022-v1.0-topRes19th-dev-en.tsv 
Number of documents: 34 
Number of entities: {'coarse_lit': 237} 
Number of tokens: 11917 
Entity breakdown by type: coarse_lit
+----------+---------+
|          |   count |
| BUILDING |      19 |
+----------+---------+
| LOC      |     201 |
+----------+---------+
| STREET   |      17 |
+----------+---------+



#### Train

In [70]:
# Parse the topRes19th English train TSV into a list of `HipeDocument` objects.
topRes19th_train_en_docs = parse_tsv(file_path=topRes19th_train_en_path)

In [71]:
# Print summary statistics for the topRes19th English train split
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=topRes19th_train_en_docs))


Path of the TSV file: ./data/v1.0/topRes19th/en/HIPE-2022-v1.0-topRes19th-train-en.tsv 
Number of documents: 309 
Number of entities: {'coarse_lit': 3182} 
Number of tokens: 123983 
Entity breakdown by type: coarse_lit
+----------+---------+
|          |   count |
| BUILDING |     359 |
+----------+---------+
| LOC      |    2580 |
+----------+---------+
| STREET   |     243 |
+----------+---------+



## newseye

See the [README file](./documentation/README-newseye.md) for detailed information about this dataset.

### File paths

In [76]:
# TSV file of the newseye French (fr) dev split, under the data root.
newseye_dev_fr_path = os.path.join(HIPE2022_data_path, "newseye/fr/HIPE-2022-v1.0-newseye-dev-fr.tsv")

In [80]:
# TSV file of the newseye French (fr) train split, under the data root.
newseye_train_fr_path = os.path.join(HIPE2022_data_path, "newseye/fr/HIPE-2022-v1.0-newseye-train-fr.tsv")

In [81]:
# TSV file of the newseye German (de) dev split, under the data root.
newseye_dev_de_path = os.path.join(HIPE2022_data_path, "newseye/de/HIPE-2022-v1.0-newseye-dev-de.tsv")

In [82]:
# TSV file of the newseye German (de) train split, under the data root.
newseye_train_de_path = os.path.join(HIPE2022_data_path, "newseye/de/HIPE-2022-v1.0-newseye-train-de.tsv")

In [95]:
# TSV file of the newseye Finnish (fi) dev split, under the data root.
newseye_dev_fi_path = os.path.join(HIPE2022_data_path, "newseye/fi/HIPE-2022-v1.0-newseye-dev-fi.tsv")

In [96]:
# TSV file of the newseye Finnish (fi) train split, under the data root.
newseye_train_fi_path = os.path.join(HIPE2022_data_path, "newseye/fi/HIPE-2022-v1.0-newseye-train-fi.tsv")

In [97]:
# TSV file of the newseye Swedish (sv) dev split, under the data root.
newseye_dev_sv_path = os.path.join(HIPE2022_data_path, "newseye/sv/HIPE-2022-v1.0-newseye-dev-sv.tsv")

In [103]:
# TSV file of the newseye Swedish (sv) train split, under the data root.
newseye_train_sv_path = os.path.join(HIPE2022_data_path, "newseye/sv/HIPE-2022-v1.0-newseye-train-sv.tsv")

### newseye FR

#### Dev

In [78]:
# Parse the newseye French dev TSV into a list of `HipeDocument` objects.
newseye_dev_fr_docs = parse_tsv(file_path=newseye_dev_fr_path)

In [79]:
# Print summary statistics for the newseye French dev split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output reports a single document for this split - confirm
# that document boundaries in the newseye files are parsed as intended.
print(describe_dataset(documents=newseye_dev_fr_docs))


Path of the TSV file: ./data/v1.0/newseye/fr/HIPE-2022-v1.0-newseye-dev-fr.tsv 
Number of documents: 1 
Number of entities: {'coarse_lit': 752, 'fine_lit': 3, 'nested': 32} 
Number of tokens: 21727 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      11 |
+-----------+---------+
| LOC       |     335 |
+-----------+---------+
| ORG       |     113 |
+-----------+---------+
| PER       |     293 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       3 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       1 |
+-----------+---------+
| LOC       |      18 |
+-----------+---------+
| ORG       |       7 |
+-----------+---------+
| PER       |       6 |
+-----------+---------+



#### Train

In [83]:
# Parse the newseye French train TSV into a list of `HipeDocument` objects.
newseye_train_fr_docs = parse_tsv(file_path=newseye_train_fr_path)

In [84]:
# Print summary statistics for the newseye French train split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output reports a single document for this split - confirm
# that document boundaries in the newseye files are parsed as intended.
print(describe_dataset(documents=newseye_train_fr_docs))


Path of the TSV file: ./data/v1.0/newseye/fr/HIPE-2022-v1.0-newseye-train-fr.tsv 
Number of documents: 1 
Number of entities: {'coarse_lit': 10423, 'fine_lit': 99, 'nested': 522} 
Number of tokens: 255165 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     200 |
+-----------+---------+
| LOC       |    4055 |
+-----------+---------+
| ORG       |    1285 |
+-----------+---------+
| PER       |    4883 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      99 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       6 |
+-----------+---------+
| LOC       |     263 |
+-----------+---------+
| ORG       |     212 |
+-----------+---------+
| PER       |      41 |
+-----------+---------+



### newseye DE

#### Dev

In [88]:
# Parse the newseye German dev TSV into a list of `HipeDocument` objects.
newseye_dev_de_docs = parse_tsv(file_path=newseye_dev_de_path)

In [89]:
# Print summary statistics for the newseye German dev split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output reports a single document for this split - confirm
# that document boundaries in the newseye files are parsed as intended.
print(describe_dataset(documents=newseye_dev_de_docs))


Path of the TSV file: ./data/v1.0/newseye/de/HIPE-2022-v1.0-newseye-dev-de.tsv 
Number of documents: 1 
Number of entities: {'coarse_lit': 539, 'fine_lit': 5, 'nested': 29} 
Number of tokens: 40061 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |       4 |
+-----------+---------+
| LOC       |     263 |
+-----------+---------+
| ORG       |     123 |
+-----------+---------+
| PER       |     149 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       5 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      10 |
+-----+---------+
| ORG |      11 |
+-----+---------+
| PER |       8 |
+-----+---------+



#### Train

In [90]:
# Parse the newseye German train TSV into a list of `HipeDocument` objects.
newseye_train_de_docs = parse_tsv(file_path=newseye_train_de_path)

In [91]:
# Print summary statistics for the newseye German train split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output reports a single document for this split - confirm
# that document boundaries in the newseye files are parsed as intended.
print(describe_dataset(documents=newseye_train_de_docs))


Path of the TSV file: ./data/v1.0/newseye/de/HIPE-2022-v1.0-newseye-train-de.tsv 
Number of documents: 1 
Number of entities: {'coarse_lit': 11397, 'fine_lit': 21, 'nested': 1134} 
Number of tokens: 448243 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      37 |
+-----------+---------+
| LOC       |    5144 |
+-----------+---------+
| ORG       |    3110 |
+-----------+---------+
| PER       |    3106 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      21 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       2 |
+-----------+---------+
| LOC       |     755 |
+-----------+---------+
| ORG       |     136 |
+-----------+---------+
| PER       |     241 |
+-----------+---------+



### newseye FI

#### Dev

In [99]:
# Parse the newseye Finnish dev TSV into a list of `HipeDocument` objects.
newseye_dev_fi_docs = parse_tsv(file_path=newseye_dev_fi_path)

In [100]:
# Print summary statistics for the newseye Finnish dev split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output reports a single document for this split - confirm
# that document boundaries in the newseye files are parsed as intended.
print(describe_dataset(documents=newseye_dev_fi_docs))


Path of the TSV file: ./data/v1.0/newseye/fi/HIPE-2022-v1.0-newseye-dev-fi.tsv 
Number of documents: 1 
Number of entities: {'coarse_lit': 223, 'fine_lit': 1, 'nested': 26} 
Number of tokens: 6350 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      12 |
+-----------+---------+
| LOC       |      97 |
+-----------+---------+
| ORG       |      37 |
+-----------+---------+
| PER       |      77 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       1 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      22 |
+-----+---------+
| ORG |       2 |
+-----+---------+
| PER |       2 |
+-----+---------+



#### Train

In [104]:
# Parse the newseye Finnish train TSV into a list of `HipeDocument` objects.
newseye_train_fi_docs = parse_tsv(file_path=newseye_train_fi_path)

In [105]:
# Print summary statistics for the newseye Finnish train split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output reports a single document for this split - confirm
# that document boundaries in the newseye files are parsed as intended.
print(describe_dataset(documents=newseye_train_fi_docs))


Path of the TSV file: ./data/v1.0/newseye/fi/HIPE-2022-v1.0-newseye-train-fi.tsv 
Number of documents: 1 
Number of entities: {'coarse_lit': 2146, 'fine_lit': 15, 'nested': 225} 
Number of tokens: 48222 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     126 |
+-----------+---------+
| LOC       |     979 |
+-----------+---------+
| ORG       |     259 |
+-----------+---------+
| PER       |     782 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      15 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       2 |
+-----------+---------+
| LOC       |     171 |
+-----------+---------+
| ORG       |      35 |
+-----------+---------+
| PER       |      17 |
+-----------+---------+



### newseye SV

#### Dev

In [109]:
# Parse the newseye Swedish dev TSV into a list of `HipeDocument` objects.
newseye_dev_sv_docs = parse_tsv(file_path=newseye_dev_sv_path)

In [110]:
# Print summary statistics for the newseye Swedish dev split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output reports a single document for this split - confirm
# that document boundaries in the newseye files are parsed as intended.
print(describe_dataset(documents=newseye_dev_sv_docs))


Path of the TSV file: ./data/v1.0/newseye/sv/HIPE-2022-v1.0-newseye-dev-sv.tsv 
Number of documents: 1 
Number of entities: {'coarse_lit': 266, 'fine_lit': 1, 'nested': 9} 
Number of tokens: 6906 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      17 |
+-----------+---------+
| LOC       |     148 |
+-----------+---------+
| ORG       |      17 |
+-----------+---------+
| PER       |      84 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |       1 |
+------------+---------+
nested
+-----+---------+
|     |   count |
| LOC |       7 |
+-----+---------+
| ORG |       1 |
+-----+---------+
| PER |       1 |
+-----+---------+



#### Train

In [111]:
# Parse the newseye Swedish train TSV into a list of `HipeDocument` objects.
newseye_train_sv_docs = parse_tsv(file_path=newseye_train_sv_path)

In [112]:
# Print summary statistics for the newseye Swedish train split
# (document, entity and token counts plus a per-type entity breakdown).
# NOTE(review): the output reports a single document for this split - confirm
# that document boundaries in the newseye files are parsed as intended.
print(describe_dataset(documents=newseye_train_sv_docs))


Path of the TSV file: ./data/v1.0/newseye/sv/HIPE-2022-v1.0-newseye-train-sv.tsv 
Number of documents: 1 
Number of entities: {'coarse_lit': 2141, 'fine_lit': 16, 'nested': 126} 
Number of tokens: 56306 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |     164 |
+-----------+---------+
| LOC       |     985 |
+-----------+---------+
| ORG       |     153 |
+-----------+---------+
| PER       |     839 |
+-----------+---------+
fine_lit
+------------+---------+
|            |   count |
| PER.author |      16 |
+------------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       1 |
+-----------+---------+
| LOC       |     100 |
+-----------+---------+
| ORG       |      15 |
+-----------+---------+
| PER       |      10 |
+-----------+---------+



## sonar

See the [README file](./documentation/README-sonar.md) for detailed information about this dataset.

### File paths

In [72]:
# TSV file of the sonar German (de) dev split, under the data root.
sonar_dev_de_path = os.path.join(HIPE2022_data_path, "sonar/de/HIPE-2022-v1.0-sonar-dev-de.tsv")

#### Dev

In [74]:
# Parse the sonar German dev TSV into a list of `HipeDocument` objects.
sonar_dev_de_docs = parse_tsv(file_path=sonar_dev_de_path)

In [75]:
# Print summary statistics for the sonar German dev split
# (document, entity and token counts plus a per-type entity breakdown).
print(describe_dataset(documents=sonar_dev_de_docs))


Path of the TSV file: ./data/v1.0/sonar/de/HIPE-2022-v1.0-sonar-dev-de.tsv 
Number of documents: 10 
Number of entities: {'coarse_lit': 1316} 
Number of tokens: 29554 
Entity breakdown by type: coarse_lit
+-----+---------+
|     |   count |
| LOC |     606 |
+-----+---------+
| ORG |     215 |
+-----+---------+
| PER |     495 |
+-----+---------+

