In [1]:
import json
from collections import Counter, defaultdict
from pathlib import Path

import spacy
from tqdm import tqdm

In [2]:
# -----------------------------
# Input / output locations
# -----------------------------
PROJECT_ROOT = Path.cwd().parent
corrected_json = PROJECT_ROOT / "data" / "processed" / "HP_diary_entries.json"
entities_output = PROJECT_ROOT / "outputs" / "entities" / "HP_ner.json"
# Sibling file in the same output folder as the per-entry NER results
unique_entities_output = entities_output.with_name("HP_unique_entities.json")

# Create the output folder (and any missing parents) up front
entities_output.parent.mkdir(parents=True, exist_ok=True)

# -----------------------------
# Read the corrected diary entries
# -----------------------------
diary_entries = json.loads(corrected_json.read_text(encoding="utf-8"))

print(f"Loaded {len(diary_entries)} diary entries")

# -----------------------------
# Initialise the spaCy pipeline
# -----------------------------
nlp = spacy.load("en_core_web_sm")
print("spaCy model loaded")

# -----------------------------
# Function to extract entities
# -----------------------------
def extract_entities(text, labels=("PERSON", "ORG", "EVENT", "GPE")):
    """
    Extract named entities of selected types from a piece of text.

    Parameters
    ----------
    text : str
        Text to run through the module-level spaCy pipeline ``nlp``.
    labels : str or iterable of str, optional
        Entity labels to keep. Defaults to PERSON, ORG, EVENT and GPE.

    Returns
    -------
    list of dict
        One ``{"text": ..., "label": ...}`` dict per kept entity, in the
        order the entities appear in ``doc.ents``.
    """
    # A bare string would otherwise be split into single characters
    if isinstance(labels, str):
        labels = (labels,)
    wanted = frozenset(labels)

    return [
        {"text": ent.text, "label": ent.label_}
        for ent in nlp(text).ents
        if ent.label_ in wanted
    ]

# -----------------------------
# Run NER over all entries
# -----------------------------
for entry in tqdm(diary_entries, desc="Extracting entities"):
    # Entries are annotated in place, so diary_entries carries an "entities"
    # list on every entry from here on.
    entry["entities"] = extract_entities(entry["text"])

# Save diary entries with entities
with open(entities_output, "w", encoding="utf-8") as f:
    json.dump(diary_entries, f, indent=2)

print(f"Diary entries with entities saved: {entities_output}")

# -----------------------------
# Build a deduplicated entity list
# -----------------------------
# Tally every label seen for each (whitespace-stripped) entity text. The same
# text can be tagged with different labels in different entries; keeping a
# tally lets us pick the most frequent label instead of whichever came last.
label_tallies = defaultdict(Counter)

for entry in diary_entries:
    for ent in entry["entities"]:
        label_tallies[ent["text"].strip()][ent["label"]] += 1

# One record per entity text: majority label (ties go to the label seen first)
# and the total number of mentions across all labels.
entity_counts = {
    name: {"label": tally.most_common(1)[0][0], "count": sum(tally.values())}
    for name, tally in label_tallies.items()
}

# Convert to a list for easy JSON export
unique_entities = [
    {"name": name, "label": info["label"], "count": info["count"]}
    for name, info in entity_counts.items()
]

# Save deduplicated entities
with open(unique_entities_output, "w", encoding="utf-8") as f:
    json.dump(unique_entities, f, indent=2)

print(f"Unique entity list saved: {unique_entities_output}")
print(f"Total unique entities: {len(unique_entities)}")

Loaded 20 diary entries
spaCy model loaded


Extracting entities: 100%|██████████████████████| 20/20 [00:02<00:00,  8.53it/s]

Diary entries with entities saved: /Users/joenockels/douglass-kg/outputs/entities/HP_ner.json
Unique entity list saved: /Users/joenockels/douglass-kg/outputs/entities/HP_unique_entities.json
Total unique entities: 164





In [None]:
# use fuzzy matching to reduce the amount of manual normalisation / correction, uses character len to find similar
# variants and then maps them to a singular canonical name, ready for manual correction and wikidata linking

!pip install rapidfuzz

In [3]:
from rapidfuzz import process, fuzz
import json
from pathlib import Path

# -----------------------------
# Paths
# -----------------------------
PROJECT_ROOT = Path.cwd().parent
unique_entities_file = PROJECT_ROOT / "outputs" / "entities" / "HP_unique_entities.json"
normalized_entities_file = PROJECT_ROOT / "outputs" / "entities" / "HP_normalized_entities.json"

# Load unique entities
with open(unique_entities_file, "r", encoding="utf-8") as f:
    unique_entities = json.load(f)

# -----------------------------
# Group similar entity names
# -----------------------------
threshold = 85  # similarity threshold for fuzzy matching
canonical_map = {}  # maps every name -> its canonical name (canonicals map to themselves)

# Sort entities by length descending (longest first to keep more complete names)
unique_entities_sorted = sorted(unique_entities, key=lambda x: len(x["name"]), reverse=True)
all_names = [e["name"] for e in unique_entities_sorted]

# NOTE: matching compares names only and ignores entity labels, so similar
# names with different labels can end up in the same group.
for name in all_names:
    # Skip names already claimed as a variant of a longer canonical name
    if name in canonical_map:
        continue

    # This name becomes a canonical form; recording it immediately means it
    # can never be re-pointed at a shorter name later in the loop.
    canonical_map[name] = name

    # Only still-unassigned names may become variants, so an existing
    # assignment is never overwritten.
    candidates = [other for other in all_names if other not in canonical_map]

    matches = process.extract(
        name,
        candidates,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=threshold,
        limit=None,  # default limit is 5, which would silently drop further variants
    )

    # Map all variants to this canonical name
    for match_name, _score, _index in matches:
        canonical_map[match_name] = name

# -----------------------------
# Build normalized list
# -----------------------------
normalized_entities = [
    {
        "canonical_name": canonical_map[entity["name"]],
        "original_name": entity["name"],
        "label": entity["label"],
        "count": entity["count"],
    }
    for entity in unique_entities_sorted
]

# Save
with open(normalized_entities_file, "w", encoding="utf-8") as f:
    json.dump(normalized_entities, f, indent=2)

print(f"Normalized entities saved: {normalized_entities_file}")

Normalized entities saved: /Users/joenockels/douglass-kg/outputs/entities/HP_normalized_entities.json


In [4]:
# Convert to a .csv file for easy data handling and editing

import json
import pandas as pd
from pathlib import Path

# The CSV is written next to the JSON it is derived from
PROJECT_ROOT = Path.cwd().parent
normalized_json = PROJECT_ROOT / "outputs" / "entities" / "HP_normalized_entities.json"
csv_path = normalized_json.with_suffix(".csv")

# Read the normalized entity records
entities = json.loads(normalized_json.read_text(encoding="utf-8"))

# One row per record
df = pd.DataFrame(entities)

# Write without the index column
df.to_csv(csv_path, index=False)
print(f"Normalized entities exported to CSV: {csv_path}")

Normalized entities exported to CSV: /Users/joenockels/douglass-kg/outputs/entities/HP_normalized_entities.csv


In [None]:
# extract standard_dates to aid the mentions table construction 

import pandas as pd

# Tabulate the diary entries, parse standard_date into datetimes, and order
# the rows chronologically with a fresh 0..n-1 index.
entries_df = (
    pd.DataFrame(diary_entries)
    .assign(standard_date=lambda frame: pd.to_datetime(frame["standard_date"]))
    .sort_values("standard_date")
    .reset_index(drop=True)
)

entries_df.head()