# Testing Incremental HPS Analyzer

This notebook tests the incremental knowledge-building system that:
- Remembers entities across chunks
- Updates entities with new information
- Tracks relationships and mentions

In [1]:
import json
import os
from pathlib import Path

import outlines
from transformers import AutoTokenizer

from incremental_hps_analyzer import IncrementalHPSAnalyzer, KnowledgeBase

  \   Later: "Ong said"  /


In [2]:
# Load your model
# The cluster checkpoint below is only the default: set the HPS_MODEL_PATH
# environment variable to use a different local copy without editing this cell.
DEFAULT_MODEL_PATH = "/gpfs1/llm/llama-3.2-hf/Meta-Llama-3.2-3B-Instruct"
model_path = os.environ.get("HPS_MODEL_PATH") or DEFAULT_MODEL_PATH

# Load the checkpoint through outlines onto the CUDA device.
model = outlines.models.transformers(
    model_path,
    device="cuda"
)

# The tokenizer comes from the same checkpoint path as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)
print("Model loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!


In [3]:
# Build the incremental analyzer from the model/tokenizer loaded above.
analyzer_settings = {
    "model": model,
    "tokenizer": tokenizer,
    "token_max": 900,
    "chunk_size": 400,  # Smaller chunks for better incremental tracking
}
analyzer = IncrementalHPSAnalyzer(**analyzer_settings)

print("Incremental HPS Analyzer initialized!")
# A fresh analyzer is expected to report an empty knowledge base here.
known_entity_count = len(analyzer.knowledge_base.entities)
print(f"Starting with {known_entity_count} known entities")

Incremental HPS Analyzer initialized!
Starting with 0 known entities


In [4]:
# Test with multiple pages to see incremental learning
DATA_DIR = Path("./data/gleick2011/pages")

# Load several pages in order; the list order is the order of the combined text.
pages_to_analyze = ["19.md", "20.md", "21.md", "22.md", "23.md", "24.md", "25.md", "26.md"]  # Add more as needed

page_sections = []  # one "=== PAGE <file> ===" header plus body per page found
missing_pages = []  # requested pages that are not present under DATA_DIR

for page_file in pages_to_analyze:
    page_path = DATA_DIR / page_file
    if not page_path.is_file():
        # Record the gap instead of skipping silently, so a wrong DATA_DIR
        # is visible before the (expensive) analysis runs on partial text.
        missing_pages.append(page_file)
        continue
    page_content = page_path.read_text(encoding="utf-8")
    page_sections.append(f"\n\n=== PAGE {page_file} ===\n\n" + page_content)
    print(f"Loaded {page_file}: {len(page_content)} characters")

# Join once rather than growing a string inside the loop.
combined_text = "".join(page_sections)

if missing_pages:
    print(f"WARNING: {len(missing_pages)} page(s) not found in {DATA_DIR}: {missing_pages}")

print(f"\nTotal text length: {len(combined_text)} characters")
print(f"Estimated word count: {len(combined_text.split())} words")

Loaded 19.md: 2275 characters
Loaded 20.md: 3132 characters
Loaded 21.md: 3152 characters
Loaded 22.md: 2962 characters
Loaded 23.md: 3042 characters
Loaded 24.md: 3135 characters
Loaded 25.md: 3097 characters
Loaded 26.md: 3854 characters

Total text length: 24825 characters
Estimated word count: 3868 words


In [None]:
# Run incremental analysis
banner_rule = "=" * 60
print("\n" + banner_rule, "STARTING INCREMENTAL ANALYSIS", banner_rule, sep="\n")

# This will process chunks sequentially, building knowledge as it goes
analysis_options = {
    "text": combined_text,
    "format_output": True,  # Shows progress as it builds knowledge
}
results = analyzer.analyze_text_incrementally(**analysis_options)

chunk_total = len(results)
print(f"\nCompleted analysis of {chunk_total} chunks")


STARTING INCREMENTAL ANALYSIS


In [None]:
# Examine the knowledge base that was built
kb = analyzer.knowledge_base

section_rule = "=" * 50
print("\n" + section_rule, "KNOWLEDGE BASE ANALYSIS", section_rule, sep="\n")

print(f"\nTotal entities discovered: {len(kb.entities)}")


def _entities_of_type(kind):
    """Return the knowledge-base entities whose ``entity_type`` equals ``kind``."""
    return [candidate for candidate in kb.entities.values() if candidate.entity_type == kind]


# Group by type
people = _entities_of_type("person")
works = _entities_of_type("work")
topics = _entities_of_type("topic")

print(f"\n📚 People: {len(people)}")
for person in people[:5]:  # Show first 5
    print(f"  • {person.canonical_name}")
    print(f"    Mentions: {len(person.mentions)}")
    print(f"    Name variants: {list(person.name_variants)}")
    if person.attributes:
        # Only the first three attributes are previewed.
        person_preview = dict(list(person.attributes.items())[:3])
        print(f"    Key info: {person_preview}")
    print()

print(f"\n📖 Works: {len(works)}")
for work in works[:3]:
    print(f"  • {work.canonical_name}")
    print(f"    Mentions: {len(work.mentions)}")
    if work.attributes:
        # Only the first two attributes are previewed.
        work_preview = dict(list(work.attributes.items())[:2])
        print(f"    Info: {work_preview}")
    print()

print(f"\n🧠 Topics: {len(topics)}")
for topic in topics[:3]:
    print(f"  • {topic.canonical_name}")
    print(f"    Mentions: {len(topic.mentions)}")
    print()

In [None]:
# Demonstrate incremental knowledge building
section_rule = "=" * 50
print("\n" + section_rule, "INCREMENTAL KNOWLEDGE EXAMPLE", section_rule, sep="\n")

# Keep only entities that were mentioned more than once
multi_mention_entities = [
    candidate for candidate in kb.entities.values() if len(candidate.mentions) > 1
]

if not multi_mention_entities:
    print("No entities found with multiple mentions in this sample.")
else:
    # Focus on the entity with the most recorded mentions
    top_entity = max(multi_mention_entities, key=lambda candidate: len(candidate.mentions))

    print(f"\n🎯 FOCUS: {top_entity.canonical_name}")
    print(f"Total mentions: {len(top_entity.mentions)}")
    print(f"Name variants: {list(top_entity.name_variants)}")
    print(f"Final attributes: {top_entity.attributes}")

    print("\nMention progression:")
    for ordinal, mention in enumerate(top_entity.mentions, start=1):
        context_preview = mention.context[:100]  # first 100 characters only
        print(f"  {ordinal}. {mention.chunk_id}: '{mention.mention_text}'")
        print(f"     Context: {context_preview}...")
        print()

In [None]:
# Save the knowledge base for future use
analyzer.save_knowledge_base("gleick_knowledge_base.json")

# You can also export to examine the structure
kb_data = kb.export_knowledge_base()
exported_entities = kb_data['entities']

# Guard the average: an empty knowledge base would otherwise raise
# ZeroDivisionError here, before the empty-check further down is reached.
total_mentions = sum(e['mention_count'] for e in exported_entities.values())
average_mentions = total_mentions / len(exported_entities) if exported_entities else 0.0

print("Knowledge base statistics:")
print(f"  Total entities: {kb_data['total_entities']}")
print(f"  Chunks processed: {kb_data['total_chunks_processed']}")
print(f"  Average mentions per entity: {average_mentions:.1f}")

# Show the structure
print("\nSample entity structure:")
if exported_entities:
    sample_key = next(iter(exported_entities))
    sample_entity = exported_entities[sample_key]
    # Truncate the JSON dump to its first 500 characters to keep the output short.
    print(json.dumps(sample_entity, indent=2)[:500] + "...")

In [None]:
# Test name matching (the key feature)
section_rule = "=" * 50
print("\n" + section_rule, "NAME MATCHING TEST", section_rule, sep="\n")

# Different surface forms of names to look up in the knowledge base
test_names = [
    "Walter J. Ong",
    "Ong",
    "Walter Ong",
    "W. J. Ong",
    "Jonathan Miller",
    "Miller",
    "Plato",
    "Socrates",
]

print("Testing name resolution:")
for test_name in test_names:
    entity = kb.find_entity_by_name(test_name)
    if not entity:
        # No entity was returned for this name form.
        print(f"  '{test_name}' -> NOT FOUND")
        continue
    print(f"  '{test_name}' -> {entity.canonical_name} ({len(entity.mentions)} mentions)")