# Imports

In [1]:
import os
import random
from typing import List, Dict
from hipe_commons.helpers.tsv import parse_tsv, ENTITY_TYPES, HipeDocument, HipeEntity
from hipe_commons.stats import describe_dataset

# Functions

In [2]:
def collect_entities(documents: List[HipeDocument]) -> Dict[str, List[HipeEntity]]:
    """Gather all entities from the documents of a dataset, grouped by entity type.

    Only the types listed in `ENTITY_TYPES` are considered, and a type appears as
    a key of the result only if at least one document has an entry for it.

    :param documents: Input documents in HIPE format
    :type documents: List[HipeDocument]
    :return: A mapping from entity type to the `HipeEntity` objects of that type,
        accumulated across all input documents in order
    :rtype: Dict[str, List[HipeEntity]]
    """
    all_entities: Dict[str, List[HipeEntity]] = {}

    # Iterate over the `documents` argument rather than a notebook-level variable,
    # so the function works for any dataset and does not depend on which cells
    # have already been executed.
    for doc in documents:
        for e_type in ENTITY_TYPES:
            if e_type in doc.entities:
                # Create the per-type bucket on first sight, then append this
                # document's entities to it.
                all_entities.setdefault(e_type, []).extend(doc.entities[e_type])
    return all_entities

# HIPE 2022 Datasets

In [30]:
# Root directory of the HIPE 2022 v1.0 data; the per-dataset TSV paths below are joined onto it.
HIPE2022_data_path = "./data/v1.0/"

## ajmc

### Paths

In [18]:
# Path to the TSV file holding the English (en) sample of the ajmc dataset.
ajmc_sample_en_path = os.path.join(HIPE2022_data_path, "ajmc/en/HIPE-2022-v1.0-ajmc-sample-en.tsv")

In [19]:
# Path to the TSV file holding the German (de) sample of the ajmc dataset.
ajmc_sample_de_path = os.path.join(HIPE2022_data_path, "ajmc/de/HIPE-2022-v1.0-ajmc-sample-de.tsv")

### EN sample

In [25]:
# Parse the English ajmc sample TSV into a list of `HipeDocument` objects.
ajmc_sample_en_docs = parse_tsv(file_path=ajmc_sample_en_path)

In [27]:
# Print basic statistics for the English ajmc sample
# (document, entity and token counts, plus an entity breakdown by type).
print(describe_dataset(documents=ajmc_sample_en_docs))


Path of the TSV file: ./data/v1.0/ajmc/en/HIPE-2022-v1.0-ajmc-sample-en.tsv 
Number of documents: 5 
Number of entities: {'coarse_lit': 153, 'fine_lit': 153} 
Number of tokens: 2187 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       1 |
+-------+---------+
| loc   |       5 |
+-------+---------+
| pers  |      65 |
+-------+---------+
| scope |      51 |
+-------+---------+
| work  |      31 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       1 |
+--------------+---------+
| loc          |       5 |
+--------------+---------+
| pers.author  |      28 |
+--------------+---------+
| pers.editor  |       6 |
+--------------+---------+
| pers.myth    |      30 |
+--------------+---------+
| pers.other   |       1 |
+--------------+---------+
| scope        |      51 |
+--------------+---------+
| work.primlit |      30 |
+--------------+---------+
| work.seclit  |       1 |
+--------------+---------+

### DE sample

In [28]:
# Parse the German ajmc sample TSV into a list of `HipeDocument` objects.
ajmc_sample_de_docs = parse_tsv(file_path=ajmc_sample_de_path)

In [29]:
# Print basic statistics for the German ajmc sample
# (document, entity and token counts, plus an entity breakdown by type).
print(describe_dataset(documents=ajmc_sample_de_docs))


Path of the TSV file: ./data/v1.0/ajmc/de/HIPE-2022-v1.0-ajmc-sample-de.tsv 
Number of documents: 8 
Number of entities: {'coarse_lit': 202, 'fine_lit': 202, 'nested': 7} 
Number of tokens: 2584 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       2 |
+-------+---------+
| loc   |       3 |
+-------+---------+
| pers  |      92 |
+-------+---------+
| scope |      79 |
+-------+---------+
| work  |      26 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       2 |
+--------------+---------+
| loc          |       3 |
+--------------+---------+
| pers.author  |      36 |
+--------------+---------+
| pers.myth    |      54 |
+--------------+---------+
| pers.other   |       2 |
+--------------+---------+
| scope        |      79 |
+--------------+---------+
| work.primlit |      25 |
+--------------+---------+
| work.seclit  |       1 |
+--------------+---------+
nested
+-------+---------+
|   

## hipe2020

## letemps

## topres19th

## newseye

## sonar