In [None]:
# Extend the import search path with the parent directory before importing
# from `core` (presumably `core` lives one level above this notebook's
# working directory - confirm against the repository layout).
import sys
sys.path.append('..')

from core import PathwayDocumentStore, chunk_text_semantic, load_csv_data, print_section
import os
import numpy as np

# Mark the start of this module's output using the project's print_section
# helper (presumably prints a section banner - verify in core).
print_section("PATHWAY INGESTION LAYER")

## Step 1: Initialize Document Store

We'll use the Pathway-based document store for efficient retrieval.

In [None]:
# Build the retrieval store.
# No embedding model is supplied here, so the store will use simple fallback
# embeddings; in production pass an actual embedding model
# (e.g., sentence-transformers) instead of None.
store_options = {
    'embedding_model': None,
    'chunk_size': 1000,
}
document_store = PathwayDocumentStore(**store_options)

print("✓ Document store initialized")

## Step 2: Load and Ingest Sample Novel

For demonstration, we create a short sample novel in memory and also save it to `../data/novels/` so it can be loaded from file later.

In [None]:
# Create a sample novel for testing.
# This is a short, self-contained text fixture: the cells below ingest it into
# the document store and query it (e.g. for Elizabeth, Thomas, the library).
# The literal is kept verbatim, including its leading newline, because the
# exact text is what gets written to disk and ingested.
sample_novel = """
The Chronicles of Evermoor: Chapter 1

In the year 1847, young Elizabeth Hartwell arrived at Evermoor Manor. 
The old estate stood on a windswept hill overlooking the moors. Her uncle, 
Lord Edmund Hartwell, had invited her to stay after her parents' death.

Elizabeth was a spirited young woman of twenty-two, with dark hair and 
intelligent eyes. She had been educated in London and spoke three languages.
The servants whispered that she resembled her late mother, Lady Catherine.

One evening in late October, Elizabeth met Thomas Blackwood in the library.
He was her uncle's ward, a mysterious man who rarely spoke of his past.
Thomas had lived at Evermoor for five years, working as Lord Edmund's secretary.

"Miss Hartwell," Thomas said, bowing slightly. "I trust you are settling in well."
Elizabeth noticed his piercing blue eyes and the scar on his left hand.

The autumn of 1847 brought unusual storms to the moors. The villagers spoke 
of strange lights seen near the old abbey ruins. Lord Edmund seemed troubled, 
often locking himself in his study late at night.

In November, Elizabeth discovered a hidden room behind the library's bookshelf.
Inside, she found old letters and a portrait of a woman who looked exactly like her.
The letters were dated 1820, written by her grandmother to an unknown recipient.

As winter approached, Elizabeth began to suspect that her uncle was hiding
something about her family's history. Thomas seemed to know more than he 
would say. One snowy evening in December, everything would change.

The manor held many secrets, and Elizabeth was determined to uncover them all.
"""

# Persist the sample text to disk, creating the target folder if it is missing
novels_dir = '../data/novels'
os.makedirs(novels_dir, exist_ok=True)

sample_path = '../data/novels/evermoor_sample.txt'
with open(sample_path, mode='w', encoding='utf-8') as novel_file:
    novel_file.write(sample_novel)

print("✓ Sample novel created")

In [None]:
# Ingest the novel.
# The in-memory sample text is passed to the document store together with an
# identifier and some descriptive metadata. `novel_id` is reused by the search
# cells below to restrict queries to this novel.
novel_text = sample_novel
novel_id = "evermoor_sample"

# ingest_novel returns the IDs of the chunks it created (a sized, indexable
# collection, as used below).
chunk_ids = document_store.ingest_novel(
    novel_text=novel_text,
    novel_id=novel_id,
    metadata={
        'title': 'The Chronicles of Evermoor',
        'author': 'Sample Author',
        'year': 1847
    }
)

print(f"\n✓ Ingested {len(chunk_ids)} chunks")
# Only preview the first ID when there is one: indexing an empty result would
# raise IndexError and mask the actual problem (nothing was ingested).
if len(chunk_ids) > 0:
    print(f"  First chunk ID: {chunk_ids[0]}")
else:
    print("  ⚠ No chunks were produced; check the input text and store settings")

## Step 3: Test Retrieval

Test vector similarity search to ensure ingestion worked correctly.

In [None]:
# Run a similarity query restricted to the novel ingested above
query = "Elizabeth Hartwell arrived at the manor"
results = document_store.search_similar(query=query, top_k=3, novel_id=novel_id)

print(f"\nQuery: '{query}'")
print(f"\nTop {len(results)} results:")
print("=" * 60)

# One numbered entry per hit: score, a 150-character preview, and the chunk ID
for rank, hit in enumerate(results, start=1):
    print(f"\n{rank}. Score: {hit['score']:.3f}")
    preview = hit['text'][:150]
    print(f"   Text: {preview}...")
    print(f"   Chunk ID: {hit['chunk_id']}")

## Step 4: Test Keyword Search

In [None]:
# Run a keyword lookup restricted to the novel ingested above
keywords = ['Thomas', 'library', 'secret']
keyword_results = document_store.keyword_search(keywords=keywords, novel_id=novel_id)

print(f"\nKeyword search for: {keywords}")
print(f"Found {len(keyword_results)} results")
print("=" * 60)

# Show at most the first three hits with a 150-character preview
top_hits = keyword_results[:3]
for position, hit in enumerate(top_hits, start=1):
    print(f"\n{position}. Score: {hit['score']}")
    snippet = hit['text'][:150]
    print(f"   Text: {snippet}...")

## Step 5: Get Store Statistics

In [None]:
# Fetch the store's statistics and list them one per line
stats = document_store.get_statistics()

print("\nDocument Store Statistics:", "=" * 60, sep="\n")
for stat_name, stat_value in stats.items():
    print(f"{stat_name}: {stat_value}")

## Step 6: Export Index (for persistence)

In [None]:
# Write the index to a JSON file, creating the results folder if it is missing
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

index_path = '../results/document_index.json'
document_store.export_index(index_path)

print("\n✓ Module 1 Complete: Pathway ingestion successful!")