# 🏆 WINNING PIPELINE - KDSH COMPETITION

Complete pipeline with Pathway RAG and aggressive inconsistency detection.

## Features:
- ✅ Pathway document store with vector embeddings
- ✅ Multi-stage RAG retrieval
- ✅ Enhanced claim extraction (age/date focus)
- ✅ Comprehensive conflict detection
- ✅ Competition-optimized threshold (0.30)

In [None]:
# Cell 1: Setup
# Standard-library and third-party imports first; the project package comes
# last because it is looked up through the search-path entry added below.
import os
import sys

import pandas as pd
from tqdm.notebook import tqdm

# Make the parent directory importable so that `core` can be resolved
# (presumably the notebook lives one level below the project root).
sys.path.append('..')

from core import (
    CausalReasoningEngine,
    ClaimExtractor,
    ConsistencyClassifier,
    ConstraintBuilder,
    HierarchicalNarrativeMemory,
    InconsistencyScorer,
    MultiHopRetriever,
    PathwayDocumentStore,
    TemporalReasoningEngine,
    load_csv_data,
    print_section,
    save_results,
)

# Startup banner confirming that every import above succeeded.
for banner_line in (
    "üèÜ WINNING PIPELINE - PATHWAY RAG COMPETITION",
    "="*60,
    "‚úì All modules imported successfully!",
):
    print(banner_line)

In [None]:
# Cell 2: Configuration
# Pipeline-wide settings read by the later cells.
CONFIG = dict(
    chunk_size=1000,    # forwarded to PathwayDocumentStore
    max_hops=3,         # forwarded to MultiHopRetriever
    top_k_evidence=5,   # used as top_k_per_hop when retrieving evidence
    threshold=0.30,     # CRITICAL: decision threshold of ConsistencyClassifier
)

print("üéØ COMPETITION CONFIGURATION:")
print("="*60)
for setting_name in CONFIG:
    print(f"  {setting_name}: {CONFIG[setting_name]}")

# Human-readable summary of what this pipeline variant emphasises.
print("\nüìä KEY FEATURES:")
for feature_line in (
    "  ‚úì Pathway vector search",
    "  ‚úì Multi-hop RAG retrieval",
    "  ‚úì Enhanced age/date detection",
    "  ‚úì Boosted conflict severities (0.88-0.95)",
    "  ‚úì Lower threshold for higher recall",
):
    print(feature_line)

In [None]:
# Cell 3: Initialize Components
# Creates the shared, notebook-global objects used by the ingestion and
# processing cells. Construction order is kept as-is in case the constructors
# emit output or depend on one another.
print_section("INITIALIZING COMPONENTS")

store_options = {
    'embedding_model': None,  # NOTE(review): None presumably selects the store's default model - confirm
    'chunk_size': CONFIG['chunk_size'],
}
document_store = PathwayDocumentStore(**store_options)
print("‚úì Pathway document store initialized")

memory = HierarchicalNarrativeMemory()
claim_extractor = ClaimExtractor()
constraint_builder = ConstraintBuilder()
scorer = InconsistencyScorer()

decision_threshold = CONFIG['threshold']
classifier = ConsistencyClassifier(threshold=decision_threshold)

print("‚úì All reasoning engines initialized")

In [None]:
# Cell 4: Load Test Data
# Loads the test examples from CSV and prints a short preview of the first
# three so the expected fields (id, book_name, content) can be eyeballed.
print_section("LOADING TEST DATA")

test_path = '../data/test.csv'
test_data = load_csv_data(test_path)

print(f"‚úì Loaded {len(test_data)} test examples")
print(f"\nFirst 3 examples:")
print("="*60)
for i, example in enumerate(test_data[:3], 1):
    # A missing or empty 'content' field must not abort the preview: slicing
    # the None returned by .get() would raise TypeError, so fall back to ''.
    backstory_preview = str(example.get('content') or '')[:80]
    print(f"\n{i}. ID: {example.get('id')}")
    print(f"   Novel: {example.get('book_name')}")
    print(f"   Backstory: {backstory_preview}...")

In [None]:
# Cell 5: Ingest Novels (Pathway)
# Reads every novel referenced by the test set from ../data/novels and hands
# its text to the document store, recording the number of chunk ids returned
# per novel. Novels that cannot be found or read are reported and skipped.
print_section("PATHWAY INGESTION")

novels_ingested = {}
unique_novels = set(ex.get('book_name') for ex in test_data)
print(f"Unique novels to ingest: {len(unique_novels)}")

# Iterate in a stable order: the iteration order of a set of strings can
# differ between interpreter runs, which made the ingestion order
# non-reproducible. key=str keeps the sort well-defined even when a row has
# no book name (None).
for novel_file in sorted(unique_novels, key=str):
    print(f"\nüìñ Ingesting: {novel_file}")

    # The book name may or may not already carry the .txt extension.
    possible_paths = [
        f'../data/novels/{novel_file}.txt',
        f'../data/novels/{novel_file}',
    ]

    # isfile rather than exists: an empty book name would otherwise match the
    # novels directory itself and make open() fail.
    novel_path = next(
        (path for path in possible_paths if os.path.isfile(path)),
        None,
    )

    if novel_path is None:
        print(f"  ‚ö† Novel file not found: {novel_file}")
        continue

    # One unreadable or mis-encoded file should not abort ingestion of the
    # remaining novels.
    try:
        with open(novel_path, 'r', encoding='utf-8') as f:
            novel_text = f.read()
    except (OSError, UnicodeDecodeError) as exc:
        print(f"  ‚ö† Could not read {novel_path}: {exc}")
        continue

    chunk_ids = document_store.ingest_novel(
        novel_text=novel_text,
        novel_id=novel_file,
        metadata={'filename': novel_file}
    )

    novels_ingested[novel_file] = len(chunk_ids)
    print(f"  ‚úì Created {len(chunk_ids)} chunks")

print(f"\n‚úì Total novels ingested: {len(novels_ingested)}")
print(f"‚úì Total chunks: {len(document_store.documents)}")

In [None]:
# Cell 6: Process All Examples
# For every test example: build (or reuse) the narrative memory of its novel,
# extract claims from the backstory, retrieve evidence for each claim, run the
# temporal and causal checks, score the result and classify it. Any failure is
# logged and recorded as a fallback prediction so the output always contains
# one row per example.
print_section("PROCESSING ALL TEST EXAMPLES")

results = []
novel_cache = {}  # book name -> HierarchicalNarrativeMemory built from its chunks

for example in tqdm(test_data, desc="Processing"):
    story_id = example.get('id')
    novel_file = example.get('book_name')
    backstory = example.get('content')

    try:
        # Build the narrative memory only the first time a novel is seen.
        if novel_file not in novel_cache:
            novel_chunks = [
                {
                    'chunk_id': chunk_id,
                    'text': doc.text,
                    'metadata': doc.metadata,
                }
                for chunk_id, doc in document_store.documents.items()
                if document_store.chunk_to_doc.get(chunk_id) == novel_file
            ]

            fresh_memory = HierarchicalNarrativeMemory()
            fresh_memory.extract_narrative_from_chunks(novel_chunks, novel_file)
            novel_cache[novel_file] = fresh_memory
        local_memory = novel_cache[novel_file]

        # Claims and the constraint graph derived from them.
        claims = claim_extractor.extract_claims_aggressive(backstory)
        constraint_graph = constraint_builder.build_graph(claims)

        # Evidence per claim, keyed by claim id (Pathway RAG).
        retriever = MultiHopRetriever(document_store, max_hops=CONFIG['max_hops'])
        evidence_map = {
            claim.claim_id: retriever.retrieve_evidence(
                query=claim.text,
                novel_id=novel_file,
                top_k_per_hop=CONFIG['top_k_evidence'],
                rerank=True,
            )
            for claim in claims
        }

        # Reasoning engines share the novel memory and the constraint graph.
        causal_engine = CausalReasoningEngine(local_memory, constraint_graph)
        temporal_engine = TemporalReasoningEngine(local_memory, constraint_graph)

        # The timeline is built before the temporal check is run.
        temporal_engine.build_timeline(claims, evidence_map)
        temporal_conflicts = temporal_engine.check_temporal_consistency(claims, evidence_map)
        causal_conflicts = causal_engine.check_causal_consistency(claims, evidence_map)

        # Aggregate everything into a single inconsistency score ...
        scoring = scorer.score_backstory(
            claims=claims,
            evidence_map=evidence_map,
            temporal_conflicts=temporal_conflicts,
            causal_conflicts=causal_conflicts,
            memory=local_memory,
        )

        # ... and turn that score into the final prediction.
        verdict = classifier.classify(
            inconsistency_score=scoring['overall_inconsistency'],
            temporal_conflicts=temporal_conflicts,
            causal_conflicts=causal_conflicts,
            evidence_map=evidence_map,
            claims=claims,
        )

        results.append({
            'id': story_id,
            'prediction': verdict['prediction'],
            'confidence': verdict['confidence'],
            'rationale': verdict['rationale'],
        })

    except Exception as exc:
        # Keep going: report the failure and emit a fallback row (prediction 0,
        # confidence 0.5) carrying a truncated error message as rationale.
        print(f"\n‚ùå Error on {story_id}: {exc}")
        import traceback
        traceback.print_exc()
        results.append({
            'id': story_id,
            'prediction': 0,
            'confidence': 0.5,
            'rationale': f"Error: {str(exc)[:100]}",
        })

print(f"\n‚úì Processed {len(results)} examples")

In [None]:
# Cell 7: Save Results
# Writes the collected predictions to disk and shows the first rows.
print_section("SAVING RESULTS")

output_path = '../results/predictions.csv'
# Ensure the target directory exists before writing into it.
results_dir = os.path.dirname(output_path)
os.makedirs(results_dir, exist_ok=True)

save_results(results, output_path)
print(f"‚úì Results saved to: {output_path}")

# Tabular preview of the first ten predictions.
results_df = pd.DataFrame(results)
print("\nResults Preview:")
preview_rows = results_df.head(10)
print(preview_rows)

In [None]:
# Cell 8: Analysis
# Summarises the predictions (class counts, average confidence) and compares
# the share of "inconsistent" predictions with the ~36% competition baseline.
# With no results at all, only the total is reported instead of dividing by
# zero.
print_section("FINAL ANALYSIS")

total = len(results)
consistent = sum(1 for r in results if r['prediction'] == 1)
inconsistent = total - consistent

print(f"üìä SUMMARY STATISTICS")
print("="*60)
print(f"Total processed: {total}")

if total:
    inconsistent_rate = inconsistent / total

    print(f"Consistent (1): {consistent} ({consistent/total*100:.1f}%)")
    print(f"Inconsistent (0): {inconsistent} ({inconsistent_rate*100:.1f}%)")

    avg_conf = sum(r['confidence'] for r in results) / total
    print(f"Average confidence: {avg_conf:.2%}")

    print(f"\nüéØ TARGET CHECK")
    print("="*60)
    print(f"Competition baseline: ~36% inconsistent")
    print(f"Our detection rate: {inconsistent_rate*100:.1f}%")

    # Target band is baseline +/- 5 percentage points. The branches are
    # exhaustive, so a rate sitting exactly on a band edge still gets a
    # verdict instead of printing nothing.
    target_rate = 0.36
    tolerance = 0.05
    if abs(inconsistent_rate - target_rate) <= tolerance:
        print("‚úÖ Within target range!")
    elif inconsistent_rate < target_rate:
        print("‚ö† Detecting too few - tuning needed")
    else:
        print("‚ö† Detecting too many - tuning needed")

    print(f"\nüèÜ READY FOR SUBMISSION!")
    print("="*60)
    print(f"Submit: {output_path}")
else:
    # Nothing was processed: percentages and the target check are undefined.
    print("No results available - run the processing cell before analysis.")