# Imports

In [1]:
import random
from typing import List, Dict
from hipe_commons.helpers.tsv import parse_tsv, ENTITY_TYPES, HipeDocument, HipeEntity
from hipe_commons.stats import describe_dataset

# Functions

In [2]:
def collect_entities(documents: List[HipeDocument]) -> Dict[str, List[HipeEntity]]:
    """Simple function to gather all entities from documents in a dataset, divided by type.

    :param documents: Input documents in HIPE format
    :type documents: List[HipeDocument]
    :return: A list of `HipeEntity` objects
    :rtype: Dict[str, List[HipeEntity]]
    """
    all_entities = {}

    for doc in ajmc_sample_en_docs:
        for e_type in ENTITY_TYPES:
            
            if e_type in doc.entities:

                if e_type not in all_entities:
                    all_entities[e_type] = []

                all_entities[e_type] += doc.entities[e_type]
    return all_entities

# English sample

## Read TSV

In [8]:
newseye_dev_fr_path = "../data/newseye/v0.9/fr/HIPE-2022-newseye-v0.9-dev-fr.tsv"

In [9]:
# parse the TSV into a list of `HipeDocument` objects
newseye_dev_fr_docs = parse_tsv(file_path=newseye_dev_fr_path)

In [10]:
# number of documents in the ajmc EN sample file
len(newseye_dev_fr_docs)

699

## Basic stats

In [11]:
print(describe_dataset(documents=newseye_dev_fr_docs))


Path of the TSV file: ../data/newseye/v0.9/fr/HIPE-2022-newseye-v0.9-dev-fr.tsv 
Number of documents: 699 
Number of entities: {'coarse_lit': 752, 'nested': 32} 
Number of tokens: 21727 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |      11 |
+-----------+---------+
| LOC       |     335 |
+-----------+---------+
| ORG       |     113 |
+-----------+---------+
| PER       |     293 |
+-----------+---------+
nested
+-----------+---------+
|           |   count |
| HumanProd |       1 |
+-----------+---------+
| LOC       |      18 |
+-----------+---------+
| ORG       |       7 |
+-----------+---------+
| PER       |       6 |
+-----------+---------+



## Inspect sample entities

Let's now have a look at a bunch of random entities for each entity type in this dataset.

In [12]:
# first, we need to gather all entities from documents in the dataset
all_entities = collect_entities(newseye_dev_fr_docs)

In [13]:
random.sample(all_entities['coarse_lit'], 5)

[[ORG] République (Q70802),
 [LOC] Auray  (Q62958),
 [LOC] Memel (Q161334),
 [ORG] P.T.T.  (Q3399837),
 [PER] Mes Jacquemont  (NIL)]

# German sample

## Read TSV

In [19]:
newseye_dev_de_path =  "../data/newseye/v0.9/de/HIPE-2022-newseye-v0.9-dev-de.tsv"

In [20]:
newseye_dev_de_docs = parse_tsv(file_path=newseye_dev_de_path)

## Basic stats

In [21]:
print(describe_dataset(documents=newseye_dev_de_docs))


Path of the TSV file: ../data/newseye/v0.9/de/HIPE-2022-newseye-v0.9-dev-de.tsv 
Number of documents: 1124 
Number of entities: {'coarse_lit': 539, 'nested': 29} 
Number of tokens: 40061 
Entity breakdown by type: coarse_lit
+-----------+---------+
|           |   count |
| HumanProd |       4 |
+-----------+---------+
| LOC       |     263 |
+-----------+---------+
| ORG       |     123 |
+-----------+---------+
| PER       |     149 |
+-----------+---------+
nested
+-----+---------+
|     |   count |
| LOC |      10 |
+-----+---------+
| ORG |      11 |
+-----+---------+
| PER |       8 |
+-----+---------+



Let's now have a look at a bunch of random entities for each entity type in this dataset.

In [47]:
# first, we need to gather all entities from documents in the dataset
all_entities = collect_entities(newseye_dev_de_docs)

In [44]:
random.sample(all_entities['coarse_lit'], 5)

[[scope] 3. 59) (None),
 [scope] 1. 15 § 13 (None),
 [scope] 2. 557 f.: (None),
 [work] Pk.  (Q1415903),
 [pers] Meineke  (Q77628)]

In [45]:
random.sample(all_entities['fine_lit'], 5)

[[pers.author] Aesch.  (Q40939),
 [pers.author] Sophocles  (Q11950683),
 [pers.author] Sophocles  (Q11950683),
 [scope] p. 221) (None),
 [work.primlit] Little Iliad (Q2087365)]