# Testing HPS Analyzer

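This notebook exercises the HPS analyzer on existing page data from Gleick (2011): it loads the model, analyzes a single page, prints a summary of the extracted authors, works, and topics, tries a chunked analysis of several combined pages, and exports the single-page result to JSON.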

In [2]:
# Import the HPS analyzer
import json
from pathlib import Path

import outlines
from transformers import AutoTokenizer, AutoModelForCausalLM

from hps_analyzer import HPSAnalyzer

In [3]:
# Model checkpoint location (same checkpoint as in earlier experiments)
model_path = "/gpfs1/llm/llama-3.2-hf/Meta-Llama-3.2-3B-Instruct"

# Tokenizer and model are both loaded from the same checkpoint directory;
# the two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Outlines wrapper around the transformers model, placed on the GPU
model = outlines.models.transformers(model_path, device="cuda")

print("Model loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!


In [5]:
# Build the HPS analyzer around the loaded model and tokenizer.
#   token_max  - adjust based on the model's capabilities
#   chunk_size - words per chunk when long texts are split
analyzer = HPSAnalyzer(model=model, tokenizer=tokenizer, token_max=1500, chunk_size=300)

print("HPS Analyzer initialized!")

HPS Analyzer initialized!


In [6]:
# Directory that holds the existing page files
DATA_DIR = Path("./data/gleick2011/pages")

# Read the whole of page 20 as UTF-8 text
page_20_content = (DATA_DIR / "20.md").read_text(encoding="utf-8")

# Quick sanity check: total size and a preview of the beginning
print(f"Loaded page 20: {len(page_20_content)} characters")
print(f"First 300 characters: {page_20_content[:300]}...")

Loaded page 20: 3132 characters
First 300 characters: <p>The Information</p>
<p>horizon. Whether Ong would have seen cyberspace as fundamentally oral or literary, he would surely have recognized it as transformative: not just a revitalization of older forms, not just an amplification, but something wholly new. He might have sensed a coming discontinuit...


In [7]:
# Banner, then a single-pass (unchunked) analysis of page 20
print("\n" + "=" * 60, "ANALYZING PAGE 20 FROM GLEICK 2011", "=" * 60, sep="\n")

# format_output=True asks the analyzer to print its formatted report;
# chunking is off because a single page is assumed to be short enough.
result = analyzer.analyze_chapter(
    text=page_20_content,
    use_chunking=False,
    format_output=True,
)

print("\nAnalysis completed!")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



ANALYZING PAGE 20 FROM GLEICK 2011



Analysis completed!


In [8]:
# Read individual fields off the analysis result and print a compact summary
print("\n" + "=" * 40, "EXTRACTED INFORMATION SUMMARY", "=" * 40, sep="\n")

# Authors: name, role (enum value) and time period
found_authors = result.authors
print(f"\nAuthors found: {len(found_authors)}")
for person in found_authors:
    print(f"  - {person.name} ({person.role.value}, {person.time_period})")

# Works: title, attributed author and work type (enum value)
found_works = result.works
print(f"\nWorks found: {len(found_works)}")
for item in found_works:
    print(f"  - {item.title} by {item.author} ({item.work_type.value})")

# Topics: name and field (enum value)
found_topics = result.topics
print(f"\nTopics found: {len(found_topics)}")
for subject in found_topics:
    print(f"  - {subject.name} ({subject.field.value})")

# Free-form lists reported by the analyzer
print(f"\nTime periods: {result.time_periods_covered}")
print(f"Locations: {result.geographical_locations}")


EXTRACTED INFORMATION SUMMARY

Authors found: 4
  - W.J.T. Ong (HISTORIAN, not specified)
  - Jonathan Miller (PHILOSOPHER, not specified)
  - Plato (PHILOSOPHER, not specified)
  - Socrates (PHILOSOPHER, not specified)

Works found: 3
  - The Information by W.J.T. Ong (UNKNOWN)
  - no specific title mentioned by Jonathan Miller (UNKNOWN)
  - The Republic by Plato (UNKNOWN)

Topics found: 4
  - Information Horizon (UNKNOWN)
  - Literacy (UNKNOWN)
  - Philosophy of Language (UNKNOWN)
  - Philosophy of Technology (UNKNOWN)

Time periods: ['Ancient Greece', 'Modern Era']
Locations: ['Ancient Greece', 'Modern Era']


In [9]:
# Test with multiple pages (if you want to try a longer text)
#
# NOTE: in the recorded run this cell failed inside analyze_chapter with a
# pydantic ValidationError for ChapterAnalysis (complexity_score=8 rejected by
# a "less than or equal to 1" constraint). That constraint is not defined in
# this notebook, so it is not handled here - TODO fix in hps_analyzer.
pages_to_combine = ["20.md", "21.md"]  # Add more pages as needed
CHUNKING_WORD_THRESHOLD = 500  # Texts with more words than this are chunked

# Load and combine the pages, each preceded by a "=== PAGE <file> ===" marker
page_sections = []
for page_file in pages_to_combine:
    page_path = DATA_DIR / page_file
    if not page_path.exists():
        # Continue with the pages that do exist, but make the gap visible
        print(f"Warning: {page_path} not found - skipping")
        continue
    page_sections.append(
        f"\n\n=== PAGE {page_file} ===\n\n" + page_path.read_text(encoding="utf-8")
    )
combined_text = "".join(page_sections)

# Whitespace-delimited word count, computed once and reused below
word_count = len(combined_text.split())
print(f"Combined text length: {len(combined_text)} characters")
print(f"Word count: approximately {word_count} words")

if word_count > CHUNKING_WORD_THRESHOLD:
    # Plain text on purpose: builtin print() does not interpret rich-style
    # markup such as [yellow]...[/], it would be shown literally.
    print("\nText is long - will use chunking...")

    # Analyze with chunking enabled
    combined_result = analyzer.analyze_chapter(
        text=combined_text,
        format_output=True,
        use_chunking=True
    )
else:
    print("Text is short enough for single analysis")
    combined_result = analyzer.analyze_chapter(combined_text)

Combined text length: 6328 characters
Word count: approximately 1023 words

[yellow]Text is long - will use chunking...[/]


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


ValidationError: 1 validation error for ChapterAnalysis
complexity_score
  Input should be less than or equal to 1 [type=less_than_equal, input_value=8, input_type=int]
    For further information visit https://errors.pydantic.dev/2.11/v/less_than_equal

In [None]:
# Export results to JSON for further processing
output_path = Path("hps_analysis_page20.json")

# Convert the result to a plain dict. mode="json" makes pydantic emit only
# JSON-compatible values (enum members such as role / work_type / field are
# converted to their values), so json.dump does not fail on enum objects.
result_dict = result.model_dump(mode="json")

with output_path.open("w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file
    json.dump(result_dict, f, indent=2, ensure_ascii=False)

print(f"Results saved to '{output_path}'")
print("\nAnalysis contains:")
print(f"  - {len(result.authors)} authors")
print(f"  - {len(result.works)} works")
print(f"  - {len(result.topics)} topics")
print(f"  - Complexity score: {result.complexity_score:.2f}")