In [1]:
import pandas as pd
import os
from collections import defaultdict
import json

In [2]:


# Entity classes that are counted. Only the class name is matched; whatever
# follows it inside the annotation field is ignored.
ENTITY_CLASSES = {
    "ART", "DATE", "DOC", "JOB", "LOC", "MISC", "MON",
    "ORG", "PCT", "PERIOD", "PERS", "QUANT", "TIME"
}

def count_all_entities(directory_path):
    """Count entity annotations per class over every ``.ann`` file in a directory.

    Only files located directly in ``directory_path`` whose name ends with
    ``.ann`` are read (no recursion). Inside a file, only lines starting with
    ``T`` are inspected (presumably brat text-bound annotations). The first
    whitespace-separated token of the second tab-separated field is taken as
    the entity class and counted when it is a member of ``ENTITY_CLASSES``.

    Args:
        directory_path: Directory that contains the ``.ann`` files.

    Returns:
        A plain ``dict`` mapping entity class to its number of occurrences.
        Classes that never occur are absent from the result.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If an ``.ann`` file is not valid UTF-8.
    """
    entity_counts = defaultdict(int)

    # os.listdir() yields entries in arbitrary order; sorting makes the key
    # order of the returned dict reproducible across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".ann"):
            continue
        filepath = os.path.join(directory_path, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                if not line.startswith("T"):
                    continue
                parts = line.strip().split("\t")
                if len(parts) < 2:
                    continue
                # A blank second field produces no tokens; skip such malformed
                # lines instead of failing with IndexError.
                tokens = parts[1].split()
                if tokens and tokens[0] in ENTITY_CLASSES:
                    entity_counts[tokens[0]] += 1
    return dict(entity_counts)

In [6]:
# Entity counts for the "bruk" part of the v2.0-swapped_filtering data.
# `result` is read again further down by the cell that builds `combined`.
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
directory = "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk"
result = count_all_entities(directory)
print(result)

{'JOB': 485, 'PERS': 280, 'ORG': 100, 'LOC': 91, 'DATE': 45, 'MISC': 21, 'PERIOD': 11, 'ART': 23, 'DOC': 6, 'QUANT': 13, 'TIME': 2}


In [7]:
# Entity counts for the "ng" part of the v2.0-swapped_filtering data.
# `result_1` is read again further down by the cell that builds `combined`.
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
directory = "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng"
result_1 = count_all_entities(directory)
print(result_1)

{'DATE': 329, 'JOB': 1248, 'PERS': 1003, 'ORG': 1167, 'LOC': 250, 'QUANT': 27, 'MON': 108, 'PCT': 48, 'PERIOD': 77, 'ART': 25, 'DOC': 12, 'MISC': 14, 'TIME': 1}


In [None]:
from collections import Counter

# Hard-coded copies of the two results printed by the preceding cells.
# NOTE(review): dict1/dict2 are never read by the sum below, which uses the
# live `result_1` / `result` variables instead — TODO confirm which was intended.
dict1 = {'JOB': 485, 'PERS': 280, 'ORG': 100, 'LOC': 91, 'DATE': 45, 'MISC': 21, 'PERIOD': 11, 'ART': 23, 'DOC': 6, 'QUANT': 13, 'TIME': 2}
dict2 = {'DATE': 329, 'JOB': 1248, 'PERS': 1003, 'ORG': 1167, 'LOC': 250, 'QUANT': 27, 'MON': 108, 'PCT': 48, 'PERIOD': 77, 'ART': 25, 'DOC': 12, 'MISC': 14, 'TIME': 1}

# Counter addition sums the counts key by key; a class present in only one
# of the two results keeps its single count.
combined = Counter(result_1) + Counter(result)

print(dict(combined))

{'DATE': 374, 'JOB': 1733, 'PERS': 1283, 'ORG': 1267, 'LOC': 341, 'QUANT': 40, 'MON': 108, 'PCT': 48, 'PERIOD': 88, 'ART': 48, 'DOC': 18, 'MISC': 35, 'TIME': 3}


In [9]:
# Grand total of entity annotations over all classes in `combined`.
# sum() over the values replaces the manual accumulation loop and avoids
# leaving throw-away loop variables in the notebook namespace.
total = sum(combined.values())

In [10]:
# Bare expression so the notebook displays the value of `total`.
total

5386

## Balanced dataset

In [3]:
# Entity counts for the "bruk" part of the v2.0-balanced_filtering data.
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
directory = "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-balanced_filtering/data/bruk"
result_bruk = count_all_entities(directory)
print(result_bruk)

{'JOB': 1123, 'PERS': 4695, 'ORG': 882, 'LOC': 1711, 'MISC': 434, 'DATE': 596, 'PERIOD': 266, 'PCT': 77, 'MON': 46, 'ART': 339, 'QUANT': 119, 'TIME': 38, 'DOC': 40}


In [4]:
# Entity counts for the "ng" part of the v2.0-balanced_filtering data.
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
directory = "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-balanced_filtering/data/ng"
result_ng = count_all_entities(directory)
print(result_ng)

{'ORG': 5598, 'DATE': 1825, 'PERIOD': 418, 'MON': 1005, 'ART': 344, 'MISC': 116, 'LOC': 1630, 'PERS': 2823, 'JOB': 2592, 'PCT': 234, 'QUANT': 303, 'DOC': 120, 'TIME': 5}


In [5]:
from collections import Counter

# Per-class totals for the balanced dataset: the "bruk" and "ng" counts are
# summed key by key. This rebinds `combined`.
combined = Counter(result_bruk) + Counter(result_ng)

print(dict(combined))

{'JOB': 3715, 'PERS': 7518, 'ORG': 6480, 'LOC': 3341, 'MISC': 550, 'DATE': 2421, 'PERIOD': 684, 'PCT': 311, 'MON': 1051, 'ART': 683, 'QUANT': 422, 'TIME': 43, 'DOC': 160}


In [None]:
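# Manual per-class check of the combined balanced counts printed above.
# NOTE(review): "+" appears to mark a confirmed count and "-" a mismatch — TODO confirm.
# 'ART': 683, +
# 'DATE': 2421, +
# 'DOC': 160, +
# 'JOB': 3715, +
# 'LOC': 3341, +
# 'MISC': 550, +
# 'MON': 1051, +
# 'ORG': 6480, +
# 'PCT': 311, +
# 'PERIOD': 684, +
# 'PERS': 7518, - -> 7517
# 'QUANT': 422, +
# 'TIME': 43, +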

## Original

In [29]:
# Entity counts for the "bruk" part of the ner-uk v2.0 data.
# NOTE(review): rebinds `result`, the same name the swapped-dataset "bruk" cell
# assigns and the first `combined` cell reads — running cells out of order
# would mix datasets.
directory = "/Users/linndfors/study/diploma/ner-uk/v2.0/data/bruk"
result = count_all_entities(directory)
print(result)

{'ORG': 782, 'LOC': 1620, 'MISC': 413, 'PERS': 4415, 'PCT': 77, 'MON': 46, 'PERIOD': 255, 'DATE': 551, 'JOB': 638, 'ART': 316, 'QUANT': 106, 'TIME': 36, 'DOC': 34}


In [30]:
# Entity counts for the "ng" part of the ner-uk v2.0 data.
# NOTE(review): rebinds `result` again, so the "bruk" counts computed by the
# previous cell are no longer reachable by name.
directory = "/Users/linndfors/study/diploma/ner-uk/v2.0/data/ng"
result = count_all_entities(directory)
print(result)

{'ORG': 4431, 'DATE': 1496, 'PERIOD': 341, 'MON': 897, 'ART': 319, 'MISC': 102, 'LOC': 1380, 'PERS': 1820, 'JOB': 1344, 'PCT': 186, 'QUANT': 276, 'DOC': 108, 'TIME': 4}
