In [None]:
import json
from pathlib import Path
import os

# The project root is taken to be two directories above the current working directory.
ROOT = Path.cwd().parent.parent.resolve()
processed_data_path = ROOT.joinpath("data", "processed")
print(f"Processed data path: {processed_data_path}")

# Last expression of the cell: displays whether the directory exists.
processed_data_path.exists()

### Load files

In [None]:
chapters_file = processed_data_path / "processed_chapters.jsonl"
episodes_file = processed_data_path / "processed_episodes.jsonl"
characters_file = processed_data_path / "processed_characters.jsonl"


def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of being handed to the parser, which would raise
    ``json.JSONDecodeError`` on them.

    Args:
        path: Location of the ``.jsonl`` file; it is opened as UTF-8 text.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


# Load each file with the same helper so all three are parsed identically.
chapters = load_jsonl(chapters_file)
episodes = load_jsonl(episodes_file)
characters = load_jsonl(characters_file)

print(f"Loaded {len(chapters)} chapters")
print(f"Loaded {len(episodes)} episodes")
print(f"Loaded {len(characters)} characters")

### Combine and verify

In [None]:
# Merge the three lists into one: chapters first, then episodes, then characters.
all_documents = [*chapters, *episodes, *characters]
print(f"Total documents: {len(all_documents)}")

In [None]:
# Sanity check: the merged list must be exactly as long as its three parts combined.
total_count = sum(len(group) for group in (characters, episodes, chapters))
assert len(all_documents) == total_count, "Counts do not match!"

In [None]:
# Collect the "source_type" stored in each document's metadata.
# A document with no "metadata" key, or with a null/empty metadata value,
# is recorded as "unknown"; so is metadata without a "source_type" key.
# The `or {}` guard matters because `doc.get("metadata", {})` would return
# None for an explicit JSON null, and None has no `.get`.
metadata_source_count = [
    (doc.get("metadata") or {}).get("source_type", "unknown")
    for doc in all_documents
]

In [None]:
from collections import Counter

# Tally how many times each source type appears in the collected list.
source_counter = Counter(metadata_source_count)

print("Document counts by source type:")
for source_name, n_docs in source_counter.items():
    print(f"{source_name}: {n_docs}")

### Save all documents into a single file

In [None]:
combined_file_path = processed_data_path / "all_documents.jsonl"

# Write one JSON-encoded document per line (JSON Lines layout).
with open(combined_file_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(doc) + "\n" for doc in all_documents)

print(f"Combined documents saved to {combined_file_path}")

In [None]:
# Reload the combined file and actually verify it: the number of records
# read back must equal the number of documents in `all_documents`.
with open(combined_file_path, "r", encoding="utf-8") as f:
    combined_docs = [json.loads(line) for line in f]

assert len(combined_docs) == len(all_documents), (
    f"Reloaded {len(combined_docs)} documents, expected {len(all_documents)}"
)
print(f"Verified loading {len(combined_docs)} combined documents")