In [None]:
import sys
sys.path.append('..')

from core import (
    PathwayDocumentStore, HierarchicalNarrativeMemory,
    ClaimExtractor, ConstraintBuilder, MultiHopRetriever,
    CausalReasoningEngine, TemporalReasoningEngine,
    InconsistencyScorer, ConsistencyClassifier,
    load_csv_data, save_results, print_section
)
import os
import pandas as pd
from tqdm import tqdm

# Announce the start of the run using the print_section helper imported from core
# (presumably prints a formatted section header — confirm in core).
print_section("COMPLETE PIPELINE")

## Configuration

In [None]:
# Pipeline settings: every path and tunable value read by the cells below.
CONFIG = dict(
    data_dir='../data',
    novels_dir='../data/novels',
    results_dir='../results',
    chunk_size=1000,
    max_hops=3,
    top_k_evidence=5,
    inconsistency_threshold=0.5,
)

# Echo the active settings so the run is self-describing.
print("Configuration:")
for option, setting in CONFIG.items():
    print(f"  {option}: {setting}")

## Initialize Components

In [None]:
# Initialize all components
# These module-level instances are shared by the cells below; process_example
# reads document_store, claim_extractor, constraint_builder, scorer and
# classifier as globals.
# NOTE(review): embedding_model=None presumably selects a default/fallback
# embedding inside PathwayDocumentStore — confirm in core.
document_store = PathwayDocumentStore(embedding_model=None, chunk_size=CONFIG['chunk_size'])
# NOTE(review): process_example builds its own HierarchicalNarrativeMemory per
# call; this global instance is not referenced by the visible cells.
memory = HierarchicalNarrativeMemory()
claim_extractor = ClaimExtractor()
constraint_builder = ConstraintBuilder()
retriever = None  # Placeholder only; process_example constructs a MultiHopRetriever per call
scorer = InconsistencyScorer()
# The decision threshold is taken from CONFIG['inconsistency_threshold'].
classifier = ConsistencyClassifier(threshold=CONFIG['inconsistency_threshold'])

print("✓ All components initialized")

## Load Data

In [None]:
# Load the train and test splits, tolerating missing files
def _load_split(csv_path, split_label):
    """
    Load one CSV split with load_csv_data, or fall back to an empty list.

    Args:
        csv_path: Location of the CSV file to read.
        split_label: Word used in the status message (e.g. 'training').

    Returns:
        Whatever load_csv_data returns for an existing file, else [].
    """
    if not os.path.exists(csv_path):
        print(f"⚠ No {os.path.basename(csv_path)} found")
        return []

    rows = load_csv_data(csv_path)
    print(f"✓ Loaded {len(rows)} {split_label} examples")
    return rows


train_path = os.path.join(CONFIG['data_dir'], 'train.csv')
test_path = os.path.join(CONFIG['data_dir'], 'test.csv')

train_data = _load_split(train_path, 'training')
test_data = _load_split(test_path, 'test')

## Pipeline Function

In [None]:
def _chunks_for_novel(novel_file):
    """Return the stored chunks whose owning document is ``novel_file``, as plain dicts."""
    return [
        {'chunk_id': chunk_id, 'text': doc.text, 'metadata': doc.metadata}
        for chunk_id, doc in document_store.documents.items()
        if document_store.chunk_to_doc.get(chunk_id) == novel_file
    ]


def process_example(story_id, novel_file, backstory, novel_cache=None):
    """
    Process a single example through the pipeline.

    Args:
        story_id: Identifier copied into the returned record.
        novel_file: File name of the novel inside CONFIG['novels_dir']; also
            used as the novel id in the document store.
        backstory: Backstory text whose claims are checked against the novel.
        novel_cache: Optional dict used as a "seen" set of novel file names.
            It is updated in place. When omitted, a fresh dict is used for
            this call only (no state is shared through a default argument).

    Returns:
        dict with story_id, prediction, confidence, rationale. If any step
        raises, the error is printed and a fallback record is returned with
        prediction 0, confidence 0.5 and the error text as rationale.
    """
    if novel_cache is None:
        novel_cache = {}

    try:
        # Step 1: Load/ingest novel (with caching).
        # The global document store is the source of truth: if it already
        # holds chunks for this novel (for instance because the caller started
        # with a fresh cache), do not ingest the same text a second time.
        chunks = _chunks_for_novel(novel_file)

        if novel_file not in novel_cache:
            if not chunks:
                novel_path = os.path.join(CONFIG['novels_dir'], novel_file)
                with open(novel_path, 'r', encoding='utf-8') as f:
                    novel_text = f.read()

                document_store.ingest_novel(
                    novel_text=novel_text,
                    novel_id=novel_file,
                    metadata={'filename': novel_file}
                )
                # Step 2 input: pick up the chunks that were just ingested.
                chunks = _chunks_for_novel(novel_file)
            novel_cache[novel_file] = True

        # Step 2: Build memory (simplified for speed)
        local_memory = HierarchicalNarrativeMemory()
        local_memory.extract_narrative_from_chunks(chunks, novel_file)

        # Step 3: Extract claims
        claims = claim_extractor.extract_claims(backstory)

        # Step 4: Build constraints
        constraint_graph = constraint_builder.build_graph(claims)

        # Step 5: Retrieve evidence
        local_retriever = MultiHopRetriever(document_store, max_hops=CONFIG['max_hops'])
        evidence_map = local_retriever.retrieve_for_claims(
            claims=claims,
            novel_id=novel_file,
            top_k_per_claim=CONFIG['top_k_evidence']
        )

        # Step 6: Reasoning
        causal_engine = CausalReasoningEngine(local_memory, constraint_graph)
        temporal_engine = TemporalReasoningEngine(local_memory, constraint_graph)

        # The timeline is built before the temporal consistency check, as in
        # the original ordering.
        temporal_engine.build_timeline(claims, evidence_map)
        temporal_conflicts = temporal_engine.check_temporal_consistency(claims, evidence_map)
        causal_conflicts = causal_engine.check_causal_consistency(claims, evidence_map)

        # Step 7: Scoring
        score_result = scorer.score_backstory(
            claims=claims,
            evidence_map=evidence_map,
            temporal_conflicts=temporal_conflicts,
            causal_conflicts=causal_conflicts,
            memory=local_memory
        )

        # Step 8: Classification
        classification = classifier.classify(
            inconsistency_score=score_result['overall_inconsistency'],
            temporal_conflicts=temporal_conflicts,
            causal_conflicts=causal_conflicts,
            evidence_map=evidence_map,
            claims=claims
        )

        return {
            'story_id': story_id,
            'prediction': classification['prediction'],
            'confidence': classification['confidence'],
            'rationale': classification['rationale']
        }

    except Exception as e:
        # Deliberate best-effort: one bad example must not abort the batch.
        print(f"Error processing {story_id}: {str(e)}")
        return {
            'story_id': story_id,
            'prediction': 0,
            'confidence': 0.5,
            'rationale': f"Error: {str(e)}"
        }

print("✓ Pipeline function defined")

## Run on Test Set

In [None]:
# Run the pipeline over a small demo slice of the test set
novel_cache = {}  # passed to every call so process_example can skip already-ingested novels
results = []

print("\nProcessing test examples...")
print("=" * 60)

demo_examples = test_data[:5]  # Limit for demo

for row in tqdm(demo_examples):
    outcome = process_example(
        row['story_id'],
        row['novel_file'],
        row['backstory'],
        novel_cache,
    )
    results.append(outcome)

    # One status line per example: id, predicted label, confidence.
    print(f"\n{outcome['story_id']}: {outcome['prediction']} (conf: {outcome['confidence']:.2f})")

print(f"\n✓ Processed {len(results)} examples")

## Save Results

In [None]:
# Save to CSV
# Create the results directory up front. save_results comes from core and is
# not visible here, so do not rely on it to create a missing directory
# (a fresh checkout may not have CONFIG['results_dir'] yet).
os.makedirs(CONFIG['results_dir'], exist_ok=True)
output_path = os.path.join(CONFIG['results_dir'], 'results.csv')
save_results(results, output_path)

print(f"\n✓ Results saved to {output_path}")

# Display the collected records as a table
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print("=" * 60)
print(results_df)

## Summary

In [None]:
print_section("PIPELINE COMPLETE")

# Tally the predictions once so the report lines stay simple.
n_total = len(results)
n_consistent = sum(1 for r in results if r['prediction'] == 1)
n_inconsistent = sum(1 for r in results if r['prediction'] == 0)

print("Results Summary:")
print(f"  Total processed: {n_total}")
print(f"  Consistent (1): {n_consistent}")
print(f"  Inconsistent (0): {n_inconsistent}")
if n_total:
    mean_confidence = sum(r['confidence'] for r in results) / n_total
    print(f"  Average confidence: {mean_confidence:.2%}")
else:
    # results is empty when no test examples were loaded; avoid dividing by zero.
    print("  Average confidence: n/a (no examples processed)")

print("\nNext steps:")
print("1. Review results.csv")
print("2. Analyze rationales for errors")
print("3. Calibrate on full training set")
print("4. Run on complete test set")