# Imports

In [32]:
import random
from typing import List, Dict
from hipe_commons.helpers.tsv import parse_tsv, ENTITY_TYPES, HipeDocument, HipeEntity
from hipe_commons.stats import describe_dataset

# Functions

In [42]:
def collect_entities(documents: List[HipeDocument]) -> Dict[str, List[HipeEntity]]:
    """Gather all entities from the documents of a dataset, grouped by entity type.

    Only the entity types listed in `ENTITY_TYPES` are considered; a type that
    does not occur in any document is absent from the result.

    :param documents: Input documents in HIPE format
    :type documents: List[HipeDocument]
    :return: A dictionary mapping each entity type found in `documents` to the
        list of `HipeEntity` objects of that type, in document order
    :rtype: Dict[str, List[HipeEntity]]
    """
    all_entities: Dict[str, List[HipeEntity]] = {}

    # Iterate over the `documents` argument (not a notebook-level variable), so
    # the result depends only on the input and the function works on any dataset.
    for doc in documents:
        for e_type in ENTITY_TYPES:
            if e_type in doc.entities:
                # Accumulate into a list owned by the result, so the per-document
                # entity lists are never mutated.
                all_entities.setdefault(e_type, []).extend(doc.entities[e_type])
    return all_entities

# English sample

## Read TSV

In [5]:
# relative path to the English (en) sample TSV of the ajmc dataset, v0.1
ajmc_sample_en_path = "../data/ajmc/v0.1/en/HIPE-2022-ajmc-v0.1-sample-en.tsv"

In [6]:
# parse the English sample TSV into a list of `HipeDocument` objects
ajmc_sample_en_docs = parse_tsv(file_path=ajmc_sample_en_path)

In [46]:
# number of documents parsed from the ajmc EN sample file
len(ajmc_sample_en_docs)

5

## Basic stats

In [10]:
# print a textual summary of the English sample: source path, number of documents,
# entities and tokens, and an entity breakdown by type (see the output below)
print(describe_dataset(documents=ajmc_sample_en_docs))


Path of the TSV file: ../data/ajmc/v0.1/en/HIPE-2022-ajmc-v0.1-sample-en.tsv 
Number of documents: 5 
Number of entities: {'coarse_lit': 153, 'fine_lit': 153} 
Number of tokens: 2187 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       1 |
+-------+---------+
| loc   |       5 |
+-------+---------+
| pers  |      65 |
+-------+---------+
| scope |      51 |
+-------+---------+
| work  |      31 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       1 |
+--------------+---------+
| loc          |       5 |
+--------------+---------+
| pers.author  |      28 |
+--------------+---------+
| pers.editor  |       6 |
+--------------+---------+
| pers.myth    |      30 |
+--------------+---------+
| pers.other   |       1 |
+--------------+---------+
| scope        |      51 |
+--------------+---------+
| work.primlit |      30 |
+--------------+---------+
| work.seclit  |       1 |
+--------------+---------+

## Inspect sample entities

Let's now have a look at a bunch of random entities for each entity type in this dataset.

In [48]:
# first, we need to gather all entities from documents in the dataset
# NOTE: the name `all_entities` is reassigned in the German section further down,
# so re-run this cell before sampling from the English entities again
all_entities = collect_entities(ajmc_sample_en_docs)

In [40]:
# show 5 randomly chosen entities from the 'coarse_lit' list; the draw is random,
# so the entities shown change between runs unless the `random` module is seeded
random.sample(all_entities['coarse_lit'], 5)

[[scope] Append.  (None),
 [loc] Salamis (Q202422),
 [scope] 411  (None),
 [scope] § 9.p. 12, 2 (None),
 [scope] 27 (None)]

In [41]:
# show 5 randomly chosen entities from the 'fine_lit' list; the draw is random,
# so the entities shown change between runs unless the `random` module is seeded
random.sample(all_entities['fine_lit'], 5)

[[scope] 27 (None),
 [work.primlit] “42.  (Q241077),
 [pers.other] Photius (Q243187),
 [work.primlit] Miad  (Q8275),
 [work.primlit] Trach.  (Q945342)]

# German sample

## Read TSV

In [12]:
# relative path to the German (de) sample TSV of the ajmc dataset, v0.1
ajmc_sample_de_path = "../data/ajmc/v0.1/de/HIPE-2022-ajmc-v0.1-sample-de.tsv"

In [13]:
# parse the German sample TSV, same call as for the English sample above
ajmc_sample_de_docs = parse_tsv(file_path=ajmc_sample_de_path)

## Basic stats

In [15]:
# print a textual summary of the German sample: source path, number of documents,
# entities and tokens, and an entity breakdown by type (see the output below)
print(describe_dataset(documents=ajmc_sample_de_docs))


Path of the TSV file: ../data/ajmc/v0.1/de/HIPE-2022-ajmc-v0.1-sample-de.tsv 
Number of documents: 8 
Number of entities: {'coarse_lit': 202, 'fine_lit': 202, 'nested': 7} 
Number of tokens: 2584 
Entity breakdown by type: coarse_lit
+-------+---------+
|       |   count |
| date  |       2 |
+-------+---------+
| loc   |       3 |
+-------+---------+
| pers  |      92 |
+-------+---------+
| scope |      79 |
+-------+---------+
| work  |      26 |
+-------+---------+
fine_lit
+--------------+---------+
|              |   count |
| date         |       2 |
+--------------+---------+
| loc          |       3 |
+--------------+---------+
| pers.author  |      36 |
+--------------+---------+
| pers.myth    |      54 |
+--------------+---------+
| pers.other   |       2 |
+--------------+---------+
| scope        |      79 |
+--------------+---------+
| work.primlit |      25 |
+--------------+---------+
| work.seclit  |       1 |
+--------------+---------+
nested
+-------+---------+
|  

## Inspect sample entities

Let's now have a look at a bunch of random entities for each entity type in this dataset.

In [47]:
# first, we need to gather all entities from documents in the dataset
# NOTE: this reassigns `all_entities`, which held the English entities before;
# the sampling cells below therefore depend on this cell having been run last
all_entities = collect_entities(ajmc_sample_de_docs)

In [44]:
# show 5 randomly chosen entities from the 'coarse_lit' list; the draw is random,
# so the entities shown change between runs unless the `random` module is seeded
random.sample(all_entities['coarse_lit'], 5)

[[scope] 3. 59) (None),
 [scope] 1. 15 § 13 (None),
 [scope] 2. 557 f.: (None),
 [work] Pk.  (Q1415903),
 [pers] Meineke  (Q77628)]

In [45]:
# show 5 randomly chosen entities from the 'fine_lit' list; the draw is random,
# so the entities shown change between runs unless the `random` module is seeded
random.sample(all_entities['fine_lit'], 5)

[[pers.author] Aesch.  (Q40939),
 [pers.author] Sophocles  (Q11950683),
 [pers.author] Sophocles  (Q11950683),
 [scope] p. 221) (None),
 [work.primlit] Little Iliad (Q2087365)]