In [1]:
import json
import os
from collections import Counter, defaultdict

import pandas as pd

In [6]:
# Entity labels that are counted; any other label found in a .ann file is ignored.
ENTITY_CLASSES = {
    "ART", "DATE", "DOC", "JOB", "LOC", "MISC", "MON",
    "ORG", "PCT", "PERIOD", "PERS", "QUANT", "TIME"
}

def count_all_entities(directory_path):
    """Count entity annotations per class across the ``.ann`` files of a directory.

    Only lines starting with ``T`` are considered. The second tab-separated
    field of such a line is expected to begin with the entity label
    (e.g. ``ORG 0 5``); labels that are not in ``ENTITY_CLASSES`` are ignored,
    and lines whose second field is missing or blank are skipped.

    Args:
        directory_path: Directory whose ``*.ann`` files are scanned
            (non-recursive).

    Returns:
        A plain ``dict`` mapping entity label to its number of occurrences.
        Classes that never occur are absent from the result.
    """
    entity_counts = defaultdict(int)

    # os.listdir returns names in arbitrary order; sorting keeps the insertion
    # order of the returned dict stable from run to run.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".ann"):
            continue
        filepath = os.path.join(directory_path, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                if not line.startswith("T"):
                    continue
                parts = line.strip().split("\t")
                if len(parts) < 2:
                    continue
                tokens = parts[1].split()
                # A blank second field yields no tokens; skip the line rather
                # than fail with IndexError on tokens[0].
                if not tokens:
                    continue
                entity_type = tokens[0]
                if entity_type in ENTITY_CLASSES:
                    entity_counts[entity_type] += 1
    return dict(entity_counts)

In [13]:
# Per-class entity counts for the "ng" directory of the v2.0-swapped_filtering
# data, ordered alphabetically by class label.
directory = "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng"
result_ng_swapped = dict(sorted(count_all_entities(directory).items()))
print(result_ng_swapped)

{'ART': 25, 'DATE': 329, 'DOC': 12, 'JOB': 1248, 'LOC': 250, 'MISC': 14, 'MON': 108, 'ORG': 1167, 'PCT': 48, 'PERIOD': 77, 'PERS': 1003, 'QUANT': 27, 'TIME': 1}


In [15]:
# Per-class entity counts for the "bruk" directory of the v2.0-swapped_filtering
# data, ordered alphabetically by class label.
directory = "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk"
result_bruk_swapped = dict(sorted(count_all_entities(directory).items()))
print(result_bruk_swapped)

{'ART': 23, 'DATE': 45, 'DOC': 6, 'JOB': 485, 'LOC': 91, 'MISC': 21, 'ORG': 100, 'PERIOD': 11, 'PERS': 280, 'QUANT': 13, 'TIME': 2}


In [14]:
# Element-wise sum of the per-class counts of both swapped sources. Counter
# addition keeps classes that occur in only one of the two inputs.
combined = Counter(result_bruk_swapped) + Counter(result_ng_swapped)

print(dict(combined))

{'ART': 48, 'DATE': 374, 'DOC': 18, 'JOB': 1733, 'LOC': 341, 'MISC': 35, 'ORG': 1267, 'PERIOD': 88, 'PERS': 1283, 'QUANT': 40, 'TIME': 3, 'MON': 108, 'PCT': 48}


In [17]:
# Total number of counted entities over all classes in result_bruk_swapped.
total = sum(result_bruk_swapped.values())

print(total)

1077


In [18]:
# Total number of counted entities over all classes in result_ng_swapped.
total = sum(result_ng_swapped.values())

print(total)

4309


In [16]:
# Total number of counted entities over all classes in the combined counts.
total = sum(combined.values())

print(total)

5386


## Balanced dataset

In [20]:
# Per-class entity counts for the "ng" directory of the v2.0-balanced_filtering
# data, ordered alphabetically by class label.
directory = "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-balanced_filtering/data/ng"
result_ng = dict(sorted(count_all_entities(directory).items()))
print(result_ng)

{'ART': 344, 'DATE': 1825, 'DOC': 120, 'JOB': 2592, 'LOC': 1630, 'MISC': 116, 'MON': 1005, 'ORG': 5598, 'PCT': 234, 'PERIOD': 418, 'PERS': 2823, 'QUANT': 303, 'TIME': 5}


In [21]:
# Per-class entity counts for the "bruk" directory of the v2.0-balanced_filtering
# data, ordered alphabetically by class label.
directory = "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-balanced_filtering/data/bruk"
result_bruk = dict(sorted(count_all_entities(directory).items()))
print(result_bruk)

{'ART': 339, 'DATE': 596, 'DOC': 40, 'JOB': 1123, 'LOC': 1711, 'MISC': 434, 'MON': 46, 'ORG': 882, 'PCT': 77, 'PERIOD': 266, 'PERS': 4695, 'QUANT': 119, 'TIME': 38}


In [22]:
# Element-wise sum of the per-class counts of both balanced sources. Counter
# addition keeps classes that occur in only one of the two inputs.
combined = Counter(result_bruk) + Counter(result_ng)

print(dict(combined))

{'ART': 683, 'DATE': 2421, 'DOC': 160, 'JOB': 3715, 'LOC': 3341, 'MISC': 550, 'MON': 1051, 'ORG': 6480, 'PCT': 311, 'PERIOD': 684, 'PERS': 7518, 'QUANT': 422, 'TIME': 43}


In [24]:
# Total number of counted entities over all classes in result_ng.
total = sum(result_ng.values())

print(total)

17013


In [25]:
# Total number of counted entities over all classes in result_bruk.
total = sum(result_bruk.values())

print(total)

10366


In [23]:
# Total number of counted entities over all classes in the combined counts.
total = sum(combined.values())

print(total)

27379


## Original

In [29]:
# Per-class entity counts for the "bruk" directory of the ner-uk v2.0 data,
# printed in the order the function returns them (not sorted).
directory = "/Users/linndfors/study/diploma/ner-uk/v2.0/data/bruk"
result = count_all_entities(directory_path=directory)
print(result)

{'ORG': 782, 'LOC': 1620, 'MISC': 413, 'PERS': 4415, 'PCT': 77, 'MON': 46, 'PERIOD': 255, 'DATE': 551, 'JOB': 638, 'ART': 316, 'QUANT': 106, 'TIME': 36, 'DOC': 34}


In [30]:
# Per-class entity counts for the "ng" directory of the ner-uk v2.0 data,
# printed in the order the function returns them (not sorted).
directory = "/Users/linndfors/study/diploma/ner-uk/v2.0/data/ng"
result = count_all_entities(directory_path=directory)
print(result)

{'ORG': 4431, 'DATE': 1496, 'PERIOD': 341, 'MON': 897, 'ART': 319, 'MISC': 102, 'LOC': 1380, 'PERS': 1820, 'JOB': 1344, 'PCT': 186, 'QUANT': 276, 'DOC': 108, 'TIME': 4}
