### Modifications from Original
- Changed LMModuleConfig to take in the registry (this is a bug in the README; the README itself has not been fixed yet)
- Created custom hallucination detection signatures and operators
- Fixed some refactorings and forgotten imports

In [1]:
import os
import logging
from typing import List, Dict, Any, Optional

# 1) Import our dataset registry tools:
from src.avior.registry.dataset.registry.metadata_registry import DatasetMetadataRegistry
from src.avior.registry.dataset.registry.loader_factory import DatasetLoaderFactory
from src.avior.registry.dataset.registry.initialization import initialize_dataset_registry

# 2) Import or define dataset loader/validator/sampler:
# If you have existing ones, import them. For now, we'll assume defaults or mocks.
from src.avior.registry.dataset.base.loaders import HuggingFaceDatasetLoader, IDatasetLoader
from src.avior.registry.dataset.base.validators import IDatasetValidator
from src.avior.registry.dataset.base.samplers import IDatasetSampler
from src.avior.registry.dataset.base.models import DatasetInfo, DatasetEntry, TaskType
from src.avior.registry.dataset.base.preppers import IDatasetPrepper
from src.avior.registry.dataset.datasets.mmlu import MMLUConfig
from src.avior.registry.dataset.base.validators import DatasetValidator
from src.avior.registry.dataset.base.samplers import DatasetSampler
from src.avior.registry.dataset.datasets.halueval import HaluEvalConfig

# 3) Import the DatasetService to actually use the pipeline:
from src.avior.registry.dataset.registry.service import DatasetService

In [2]:
from src.avior.registry.model.registry.model_registry import ModelRegistry
from src.avior.registry.model.schemas.model_info import ModelInfo
from src.avior.registry.model.schemas.provider_info import ProviderInfo
from src.avior.registry.model.schemas.cost import ModelCost, RateLimit
from src.avior.registry.model.services.usage_service import UsageService
from src.avior.registry.model.services.model_service import ModelService

In [3]:
from src.avior.registry.model.config import initialize_global_registry, GLOBAL_MODEL_REGISTRY
from src.avior.registry.model.services.model_service import ModelService

In [4]:
from src.avior.registry.model.registry.model_enum import OpenAIModelEnum as OME

In [5]:
from src.avior.registry.model.config import AviorSettings

settings = AviorSettings()

In [6]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [7]:
# 1) Create a metadata registry and loader factory:
metadata_registry = DatasetMetadataRegistry()
loader_factory = DatasetLoaderFactory()

In [8]:
# 2) Initialize the registry with known “built-in” datasets:
initialize_dataset_registry(metadata_registry, loader_factory)

INFO:src.avior.registry.dataset.registry.loader_factory:Registered loader prepper for dataset: truthful_qa
INFO:src.avior.registry.dataset.registry.loader_factory:Registered loader prepper for dataset: mmlu
INFO:src.avior.registry.dataset.registry.loader_factory:Registered loader prepper for dataset: commonsense_qa
INFO:src.avior.registry.dataset.registry.loader_factory:Registered loader prepper for dataset: halueval
INFO:src.avior.registry.dataset.registry.loader_factory:Registered loader prepper for dataset: my_shortanswer_ds
INFO:src.avior.registry.dataset.registry.loader_factory:Registered loader prepper for dataset: my_code_ds
INFO:src.avior.registry.dataset.registry.initialization:Initialized dataset registry with known datasets.


In [9]:
# 3) Optionally, discover any additional plugin-based preppers from pyproject.toml:
loader_factory.discover_and_register_plugins()

INFO:src.avior.registry.dataset.registry.loader_factory:Auto-registered plugin preppers: []


In [10]:
# Cursor suggestion

# 6) Construct the three collaborators the DatasetService is built from.
#    Each is annotated with its interface type and instantiated with defaults.
loader: IDatasetLoader = HuggingFaceDatasetLoader()
validator: IDatasetValidator = DatasetValidator()
sampler: IDatasetSampler = DatasetSampler()

# 7) Instantiate a DatasetService to handle load, validation, transform, sampling, and prep.
#    It is used further down via dataset_service.load_and_prepare(...).
dataset_service = DatasetService(
    loader=loader,
    validator=validator,
    sampler=sampler,
    transformers=[]  # No transformers for now; insert specialized ones here if needed
)

In [11]:
!pip install python-dotenv
%load_ext dotenv
%dotenv



In [12]:
# Example of using pydantic-based config or environment variables
openai_key = settings.openai_api_key or os.getenv("OPENAI_API_KEY", "")

In [13]:
from src.avior.registry.model.registry.model_registry import ModelRegistry
from src.avior.registry.operator.operator_registry import EnsembleOperator, GetAnswerOperator
from src.avior.registry.operator.operator_base import LMModuleConfig, LMModule

# 1) Register the models
registry = ModelRegistry()
# (Imagine we've done registry.register_model(...) for each: gemini, claude, gpt-4o, etc.)




In [14]:
# Provider record shared by every OpenAI model registered below.
openai_provider = ProviderInfo(
    name="OpenAI",
    default_api_key=openai_key,
    base_url="https://api.openai.com",
)


def _openai_model_info(
    model_name,
    input_cost_per_million,
    output_cost_per_million,
    tokens_per_minute,
    requests_per_minute,
):
    """Build the ModelInfo for one OpenAI model, with id "openai:<model_name>".

    Every model shares ``openai_provider`` and ``openai_key``; only the name,
    cost figures and rate limits vary.
    """
    return ModelInfo(
        model_id=f"openai:{model_name}",
        model_name=model_name,
        cost=ModelCost(
            input_cost_per_million=input_cost_per_million,
            output_cost_per_million=output_cost_per_million,
        ),
        rate_limit=RateLimit(
            tokens_per_minute=tokens_per_minute,
            requests_per_minute=requests_per_minute,
        ),
        provider=openai_provider,
        api_key=openai_key,
    )


# NOTE(review): the cost values below are 1000x the dollar amounts quoted next
# to them (e.g. "$5.00" -> 5000) -- confirm which unit ModelCost expects.

# GPT-4o: $5.00 / $15.00 per million input / output tokens.
gpt4o_info = _openai_model_info(
    "gpt-4o",
    input_cost_per_million=5000,
    output_cost_per_million=15000,
    tokens_per_minute=10000000,  # 10M tokens per minute
    requests_per_minute=1500,    # Tier 5 rate limit
)

# GPT-4o-mini: $0.15 / $0.60 per million input / output tokens.
gpt4o_mini_info = _openai_model_info(
    "gpt-4o-mini",
    input_cost_per_million=150,
    output_cost_per_million=600,
    tokens_per_minute=10000000,
    requests_per_minute=1500,
)

# O1: $10.00 / $20.00 per million input / output tokens.
o1_info = _openai_model_info(
    "o1",
    input_cost_per_million=10000,
    output_cost_per_million=20000,
    tokens_per_minute=5000000,
    requests_per_minute=1000,
)

# Register all models, in the same order they were defined.
for _model_info in (gpt4o_info, gpt4o_mini_info, o1_info):
    registry.register_model(_model_info)


#### README Example

In [15]:
# Model service built from the registry populated above plus a fresh UsageService.
usage_service = UsageService()
model_service = ModelService(usage_service=usage_service, registry=registry)

# 2) One LMModule per registered OpenAI model.
#    The README example also builds "gemini-1.5-pro" and "claude-3.5-sonnet"
#    modules; they are left out here because only OpenAI models are registered.
# TODO: possible README bug -- it may pass the registry where model_service belongs.
g4o_mod, g4omini_mod, go1_mod = [
    LMModule(LMModuleConfig(model_name=model_name), model_service)
    for model_name in ("gpt-4o", "gpt-4o-mini", "o1")
]

# 3) EnsembleOperator over all three modules.
ensemble_op = EnsembleOperator(lm_modules=[g4o_mod, g4omini_mod, go1_mod])

# 4) "Judge" operator (GetAnswerOperator here; MostCommonOperator is the alternative).
#    The README uses "o1-mini" for the judge; "o1" is used instead because no
#    "o1-mini" model is registered in this notebook.
judge_mod = LMModule(LMModuleConfig(model_name="o1"), model_service)
judge_op = GetAnswerOperator(lm_modules=[judge_mod])

In [16]:
from src.avior.core.graph_executor import NoNGraphData, GraphExecutorService

# Two-node graph: "ensemble" has no upstream dependencies and "judge" lists
# "ensemble" as its only input.
graph_data = NoNGraphData()
graph_data.add_node(name="ensemble", operator=ensemble_op, inputs=[])
graph_data.add_node(name="judge", operator=judge_op, inputs=["ensemble"])

# 5) Graph-level input handed to the executor.
input_data = dict(
    query="Explain how to set up Avior for multi-model parallel usage",
)

# 6) Run the graph and print the entry stored under "final_answer".
executor_service = GraphExecutorService()
results = executor_service.run(input_data=input_data, graph_data=graph_data)

print("Final answer from the judge:", results["final_answer"])

INFO:GraphExecutorService:GraphExecutorService run invoked.
INFO:src.avior.registry.model.provider_registry.openai.openai_provider:OpenAI forward() invoked
INFO:src.avior.registry.model.provider_registry.openai.openai_provider:OpenAI forward() invoked
INFO:src.avior.registry.model.provider_registry.openai.openai_provider:OpenAI forward() invoked
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:src.avior.registry.model.provider_registry.openai.openai_provider:OpenAI forward() invoked
INFO:src.avior.registry.model.provider_registry.openai.openai_provider:OpenAI forward() invoked
INFO:src.avior.registry.model.provider_registry.openai.openai_provider:OpenAI forward() invoked
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Final answer from the judge: Setting up Avior for multi-model parallel usage involves several steps to ensure proper configuration and efficient execution of models in parallel. Avior, much like other parallel computing frameworks, allows for distributing computational tasks across multiple nodes or processing units, which can vastly improve performance and efficiency in handling large machine learning models. Below is a general guide to setting it up:

### Step 1: Environment Setup

1. **Hardware Requirements**:
   - Ensure you have sufficient hardware resources, such as multiple CPUs or GPUs, to support parallel execution.
   - Confirm that your system supports AVX or similar instruction sets, as parallel processing often relies on such capabilities.

2. **Software Installation**:
   - Install any necessary dependencies for Avior, such as MPI (Message Passing Interface) or other parallel computing libraries.
   - Ensure you have the correct version of Python and necessary machine lea

#### HaluEval LLM as a judge

In [17]:
from src.avior.core.graph_executor import NoNGraphData, GraphExecutorService
from src.avior.registry.model.config import initialize_global_registry, AviorSettings
from src.avior.registry.model.services.model_service import ModelService
from src.avior.modules.lm_modules import LMModule, LMModuleConfig
from src.avior.registry.operator.hallucination_operators import (
    QAHallucinationOperator,
    DialogueHallucinationOperator,
    SummarizationHallucinationOperator
)
import os

def setup_hallucination_detection(
    env_path: str = "/Users/kunalagrawal/Desktop/Research/ember/.env",
) -> "NoNGraphData":
    """Build a single-node graph that runs the QA hallucination detector.

    The graph contains one node named "detector" wrapping a
    QAHallucinationOperator over three LMModules (gpt-4o, gpt-4o-mini, o1).
    Models are resolved through the module-level ``registry``.

    Args:
        env_path: Path of the .env file handed to AviorSettings. The default
            is the original author's machine-specific location, kept only for
            backward compatibility; pass your own path on any other machine.

    Returns:
        A NoNGraphData holding the single "detector" node with no inputs.
    """
    # Initialize settings with an explicit env file path.
    # NOTE(review): `settings` is not passed to anything below; the model
    # service is built from the module-level `registry` instead.
    settings = AviorSettings(_env_file=env_path)

    # Only override the key when the environment variable is actually set, so
    # a value loaded from the env file is not clobbered by an empty string.
    env_api_key = os.getenv("OPENAI_API_KEY")
    if env_api_key:
        settings.openai_api_key = env_api_key

    # Create a model service using the registry.
    usage_service = UsageService()
    model_service = ModelService(registry=registry, usage_service=usage_service)

    # Only OpenAI models for now; the task is hard-coded to QA.
    lm_modules = [
        LMModule(LMModuleConfig(model_name=model_name), model_service)
        for model_name in ("gpt-4o", "gpt-4o-mini", "o1")
    ]
    operator = QAHallucinationOperator(lm_modules)

    # Single detector node with no upstream dependencies.
    graph_data = NoNGraphData()
    graph_data.add_node(
        name="detector",
        operator=operator,
        inputs=[],
    )

    return graph_data

In [18]:
def run_hallucination_detection(**kwargs):
    """Run the hallucination-detection graph on a single example.

    A fresh graph is built with setup_hallucination_detection() on every call.

    Keyword Args:
        query: The prompt to judge; forwarded as-is (None when absent).
        choices: The answer choices; forwarded as-is (None when absent).
        Any other keys (e.g. "metadata") are accepted and ignored.

    Returns:
        ``results["detector"]`` when the executor's result contains a
        "detector" entry, otherwise the whole result object unchanged.
    """
    graph_data = setup_hallucination_detection()

    # Only "query" and "choices" are forwarded to the graph.
    input_data = {
        "query": kwargs.get("query"),
        "choices": kwargs.get("choices"),
    }

    executor_service = GraphExecutorService()
    results = executor_service.run(graph_data=graph_data, input_data=input_data)

    return results["detector"] if "detector" in results else results

# Example usage
if __name__ == "__main__":
    # One hand-written example in the same shape the function receives for
    # dataset entries: "query", "choices" and "metadata" keys.
    qa_example = dict(
        query=(
            "Knowledge: Arthur's Magazine (1844–1846) was an American literary "
            "periodical published in Philadelphia in the 19th century."
            "First for Women is a woman's magazine published by Bauer Media Group in the USA.\n"
            "Question: Which magazine was started first Arthur's Magazine or First for Women?\n"
            "Candidate Answer: First for Women was started first.. "
            "Is this candidate answer supported by the provided knowledge?"
        ),
        choices=dict(A="Not Hallucinated", B="Hallucinated"),
        metadata=dict(correct_answer="B"),
    )

    qa_result = run_hallucination_detection(**qa_example)

    print("\nQA Hallucination Check:")
    print("Query: " + qa_example["query"])
    print(f"Result: {qa_result}")

NameError: name 'awards' is not defined

In [None]:
# Main Evaluation Logic
import pandas as pd
# Result table. It is defined up front so `halu_df` exists (empty) even if the
# registry checks below raise; it is rebuilt once from `halu_rows` at the end.
halu_columns = ["query", "judgement", "correct_answer"]
halu_df = pd.DataFrame(columns=halu_columns)
halu_rows: List[Dict[str, Any]] = []

# 9) Let's do the same for HaluEval:
halu_info: Optional[DatasetInfo] = metadata_registry.get("halueval")
if not halu_info:
    raise ValueError("HaluEval dataset not properly registered.")

halu_prepper_class = loader_factory.get_prepper_class("halueval")
if not halu_prepper_class:
    raise ValueError("No HaluEval prepper found. Make sure it's registered.")

# Create config & prepper, defaulting to config_name="qa", split="data"
halu_config = HaluEvalConfig()
halu_prepper: IDatasetPrepper = halu_prepper_class(config=halu_config)

logger.info("Loading and preparing dataset: %s", halu_info.name)
try:
    halu_dataset_entries: List[DatasetEntry] = dataset_service.load_and_prepare(
        dataset_info=halu_info,
        prepper=halu_prepper,
        config=halu_config,
        num_samples=3,
    )
    logger.info(
        "Received %d prepared entries for '%s'.",
        len(halu_dataset_entries),
        halu_info.name,
    )
    for entry_number, entry in enumerate(halu_dataset_entries, start=1):
        data_entry = entry.model_dump()
        result = run_hallucination_detection(**data_entry)
        print("\nQA Hallucination:")
        print(f"[HaluEval] Entry #{entry_number}:\n{data_entry}")
        print(f"Result: {result}")
        # NOTE(review): run_hallucination_detection returns the raw results
        # object when it has no "detector" entry; `.judgement` would then
        # fail and be reported by the handler below.
        halu_rows.append(
            {
                "query": data_entry["query"],
                "judgement": result.judgement,
                "correct_answer": data_entry["metadata"]["correct_answer"],
            }
        )
except Exception as e:
    # Notebook-level boundary: log with the traceback instead of losing it.
    logger.exception("Error during HaluEval dataset preparation: %s", e)
finally:
    # Build the frame once from whatever rows were collected, so partial
    # results survive a failure part-way through the loop.
    if halu_rows:
        halu_df = pd.DataFrame(halu_rows, columns=halu_columns)

In [None]:
halu_df