# Self-Verification Chains for Hallucination-Free RAG

**CIS 6930: Special Topics in Large Language Models (Fall 2025)**  
**University of Florida**

## Overview

This notebook implements and evaluates the Self-Verification RAG pipeline with:
- Hybrid retrieval (FAISS + BM25 fusion)
- Cross-encoder reranking
- FLAN-T5 generation with QLoRA fine-tuning
- Entailment-based verification
- Adaptive revision strategies

## Objectives

1. **Retrieval**: Achieve Recall@20 ≥ 0.95 and Coverage ≥ 0.90
2. **Verification**: Achieve Factual Precision ≥ 0.90 and Hallucination Rate ≤ 0.10
3. **Composite**: Achieve Verified F1 ≥ 0.52
4. **Statistical Significance**: All improvements are statistically significant at p < 0.05



In [None]:
# Setup and Imports
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(''))))

import yaml
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any
from tqdm import tqdm
import json

# Plot appearance: matplotlib's 'seaborn-v0_8' style sheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Seed the PyTorch and NumPy random number generators with a fixed value
torch.manual_seed(42)
np.random.seed(42)

# Report the runtime environment; the device name is only queried when CUDA is usable
print(f"PyTorch version: {torch.__version__}")
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")



In [None]:
# Load Configuration
# Parse the YAML experiment configuration and echo the settings printed below.
config_path = "config/config.yaml"
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# locale encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty document; fail here with a clear
# message instead of an opaque TypeError on the first subscript below.
if not isinstance(config, dict):
    raise ValueError(
        f"{config_path} did not parse to a mapping (got {type(config).__name__})"
    )

print("Configuration loaded:")
fusion_weights = config['retrieval']['fusion']
print(f"  Retrieval: Dense={fusion_weights['dense_weight']}, "
      f"Sparse={fusion_weights['sparse_weight']}")
print(f"  Verification threshold (τ): {config['verification']['threshold']}")
print(f"  QLoRA enabled: {config['generation']['qlora']['training_enabled']}")



## 1. Dataset Loading

Load the dataset (Natural Questions, SQuAD, or TriviaQA).


In [None]:
# TODO: Load dataset
# Template using the Hugging Face `datasets` package (Natural Questions shown).
# NOTE(review): the field paths in this sketch are illustrative; confirm them
# against the chosen dataset's actual schema before uncommenting.
#
# from datasets import load_dataset
#
# dataset = load_dataset("natural_questions", split="train")
#
# # Extract queries, contexts, and answers
# queries = [item["question"]["text"] for item in dataset]
# contexts = [item["document"]["text"] for item in dataset]
# answers = [item["annotations"]["short_answers"][0]["text"] if item["annotations"]["short_answers"] else ""
#            for item in dataset]

# Until real loading code exists, just announce that this cell is a stub.
print(
    "Dataset loading placeholder",
    "Replace with actual dataset loading code",
    sep="\n",
)



## 2. Initialize Pipeline

Initialize the Self-Verification RAG pipeline with all components.


In [None]:
from src.pipeline import SelfVerificationRAGPipeline

# Pick the compute device: GPU when PyTorch reports CUDA as available, else CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# TODO: Replace with actual corpus
corpus = []  # Placeholder

# Build the pipeline; QLoRA usage follows the training flag in the loaded config
qlora_enabled = config["generation"]["qlora"]["training_enabled"]
pipeline = SelfVerificationRAGPipeline(
    corpus=corpus,
    device=device,
    enable_revision=True,
    use_qlora=qlora_enabled,
)

print("Pipeline initialized successfully!")



## 3. Experiment 1: Baseline Comparison

Compare standard RAG vs Self-Verification RAG.


In [None]:
# Experiment 1: baseline comparison (entry point lives in experiments/exp1_baseline.py)
from experiments.exp1_baseline import run_baseline_experiment

# TODO: Replace with actual data, then uncomment the call below
# aggregated, results = run_baseline_experiment(
#     queries, ground_truths, relevant_docs, corpus, config
# )

# The experiment is not executed here yet; point the reader at its implementation.
print(
    "Experiment 1: Baseline Comparison",
    "See experiments/exp1_baseline.py for implementation",
    sep="\n",
)



## 4. Experiment 3: Threshold Tuning

Find the optimal entailment threshold τ.


In [None]:
# Experiment 3: threshold tuning (entry point lives in experiments/exp3_threshold_tuning.py)
from experiments.exp3_threshold_tuning import run_threshold_tuning

# TODO: Replace with actual data, then uncomment the call below
# threshold_results, optimal_threshold = run_threshold_tuning(
#     queries, ground_truths, relevant_docs, corpus, config
# )

print("Experiment 3: Threshold Tuning")
# Candidate thresholds are read from the loaded configuration
sweep_values = config['verification']['threshold_sweep']
print(
    f"Thresholds to test: {sweep_values}",
    "See experiments/exp3_threshold_tuning.py for implementation",
    sep="\n",
)



## 5. Results Summary

Summarize results from all experiments.


In [None]:
# Load and summarize results
import json
import os

# Parsed JSON content of every results file found, keyed by experiment id
results_summary = {}

# Results file for each experiment, keyed by a short experiment id
experiment_files = {
    "exp1": "results/exp1_baseline.json",
    "exp2": "results/exp2_retrieval_comparison.json",
    "exp3": "results/exp3_threshold_tuning.json",
    "exp4": "results/exp4_revision_strategies.json",
    "exp5": "results/exp5_decoding_strategies.json",
    "exp6": "results/exp6_iterative_training.json",
    "exp7": "results/exp7_ablation_study.json",
    "exp8": "results/exp8_stress_test.json"
}

# Best-effort load: experiments whose results file does not exist are skipped.
for exp_name, exp_file in experiment_files.items():
    if os.path.exists(exp_file):
        # JSON text is UTF-8 by specification; decode it explicitly rather than
        # relying on the platform's locale encoding.
        with open(exp_file, 'r', encoding='utf-8') as f:
            results_summary[exp_name] = json.load(f)
        print(f"Loaded {exp_name} results")

print(f"\nLoaded {len(results_summary)} experiment result files")



## 6. Key Metrics Summary

Display key metrics in table format.


In [None]:
# Create summary table
metrics_table = []

# Key metrics to track
key_metrics = [
    "recall@20",
    "coverage",
    "factual_precision",
    "hallucination_rate",
    "verified_f1",
    "f1_score",
]

# TODO: Populate with actual results
# For now every "Achieved" cell is the literal placeholder "TBD".
divider = "=" * 60
row_template = "{:<20} {:<15} {:<15}"  # left-aligned columns: 20 / 15 / 15 wide
target_rows = [
    ("Recall@20", "≥0.95"),
    ("Coverage", "≥0.90"),
    ("Factual Precision", "≥0.90"),
    ("Hallucination Rate", "≤0.10"),
    ("Verified F1", "≥0.52"),
]

print("Key Metrics Summary:")
print(divider)
print(row_template.format("Metric", "Target", "Achieved"))
print(divider)
for metric_label, target_text in target_rows:
    print(row_template.format(metric_label, target_text, "TBD"))
print(divider)

