diff --git a/ldai/__init__.py b/ldai/__init__.py index cb7e545..78125d7 100644 --- a/ldai/__init__.py +++ b/ldai/__init__.py @@ -1 +1,41 @@ __version__ = "0.10.1" # x-release-please-version + +# Export main client +# Export chat +from ldai.chat import TrackedChat +from ldai.client import LDAIClient +# Export judge +from ldai.judge import AIJudge +# Export models for convenience +from ldai.models import ( # Deprecated aliases for backward compatibility + AIAgentConfig, AIAgentConfigDefault, AIAgentConfigRequest, AIAgents, + AICompletionConfig, AICompletionConfigDefault, AIConfig, AIJudgeConfig, + AIJudgeConfigDefault, JudgeConfiguration, LDAIAgent, LDAIAgentConfig, + LDAIAgentDefaults, LDMessage, ModelConfig, ProviderConfig) +# Export judge types +from ldai.providers.types import EvalScore, JudgeResponse + +__all__ = [ + 'LDAIClient', + 'AIAgentConfig', + 'AIAgentConfigDefault', + 'AIAgentConfigRequest', + 'AIAgents', + 'AICompletionConfig', + 'AICompletionConfigDefault', + 'AIJudgeConfig', + 'AIJudgeConfigDefault', + 'AIJudge', + 'TrackedChat', + 'EvalScore', + 'JudgeConfiguration', + 'JudgeResponse', + 'LDMessage', + 'ModelConfig', + 'ProviderConfig', + # Deprecated exports + 'AIConfig', + 'LDAIAgent', + 'LDAIAgentConfig', + 'LDAIAgentDefaults', +] diff --git a/ldai/chat/__init__.py b/ldai/chat/__init__.py new file mode 100644 index 0000000..bcb4284 --- /dev/null +++ b/ldai/chat/__init__.py @@ -0,0 +1,188 @@ +"""TrackedChat implementation for managing AI chat conversations.""" + +import asyncio +from typing import Any, Dict, List, Optional + +from ldai.judge import AIJudge +from ldai.models import AICompletionConfig, LDMessage +from ldai.providers.ai_provider import AIProvider +from ldai.providers.types import ChatResponse, JudgeResponse +from ldai.tracker import LDAIConfigTracker + + +class TrackedChat: + """ + Concrete implementation of TrackedChat that provides chat functionality + by delegating to an AIProvider implementation. + + This class handles conversation management and tracking, while delegating + the actual model invocation to the provider. + """ + + def __init__( + self, + ai_config: AICompletionConfig, + tracker: LDAIConfigTracker, + provider: AIProvider, + judges: Optional[Dict[str, AIJudge]] = None, + logger: Optional[Any] = None, + ): + """ + Initialize the TrackedChat. + + :param ai_config: The completion AI configuration + :param tracker: The tracker for the completion configuration + :param provider: The AI provider to use for chat + :param judges: Optional dictionary of judge instances keyed by their configuration keys + :param logger: Optional logger for logging + """ + self._ai_config = ai_config + self._tracker = tracker + self._provider = provider + self._judges = judges or {} + self._logger = logger + self._messages: List[LDMessage] = [] + + async def invoke(self, prompt: str) -> ChatResponse: + """ + Invoke the chat model with a prompt string. + + This method handles conversation management and tracking, delegating to the provider's invoke_model method. 
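+
+        Example (illustrative sketch; assumes a ``TrackedChat`` named ``chat`` obtained from ``LDAIClient.create_chat``)::
+
+            response = await chat.invoke("What is the status of my order?")
+            print(response.message.content)
+
+            # Judge evaluations, when configured, run as background asyncio tasks:
+            if response.evaluations:
+                await asyncio.gather(*response.evaluations)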
+ + :param prompt: The user prompt to send to the chat model + :return: ChatResponse containing the model's response and metrics + """ + # Convert prompt string to LDMessage with role 'user' and add to conversation history + user_message: LDMessage = LDMessage(role='user', content=prompt) + self._messages.append(user_message) + + # Prepend config messages to conversation history for model invocation + config_messages = self._ai_config.messages or [] + all_messages = config_messages + self._messages + + # Delegate to provider-specific implementation with tracking + response = await self._tracker.track_metrics_of( + lambda result: result.metrics, + lambda: self._provider.invoke_model(all_messages), + ) + + # Start judge evaluations as async tasks (don't await them) + if ( + self._ai_config.judge_configuration + and self._ai_config.judge_configuration.judges + and len(self._ai_config.judge_configuration.judges) > 0 + ): + evaluation_tasks = self._start_judge_evaluations(self._messages, response) + response.evaluations = evaluation_tasks + + # Add the response message to conversation history + self._messages.append(response.message) + return response + + def _start_judge_evaluations( + self, + messages: List[LDMessage], + response: ChatResponse, + ) -> List[asyncio.Task[Optional[JudgeResponse]]]: + """ + Start judge evaluations as async tasks without awaiting them. + + Returns a list of async tasks that can be awaited later. + + :param messages: Array of messages representing the conversation history + :param response: The AI response to be evaluated + :return: List of async tasks that will return judge evaluation results + """ + if not self._ai_config.judge_configuration or not self._ai_config.judge_configuration.judges: + return [] + + judge_configs = self._ai_config.judge_configuration.judges + + # Start all judge evaluations as tasks + async def evaluate_judge(judge_config): + judge = self._judges.get(judge_config.key) + if not judge: + if self._logger: + self._logger.warn( + f"Judge configuration is not enabled: {judge_config.key}", + ) + return None + + eval_result = await judge.evaluate_messages( + messages, response, judge_config.sampling_rate + ) + + if eval_result and eval_result.success: + self._tracker.track_eval_scores(eval_result.evals) + + return eval_result + + # Create tasks for each judge evaluation + tasks = [ + asyncio.create_task(evaluate_judge(judge_config)) + for judge_config in judge_configs + ] + + return tasks + + def get_config(self) -> AICompletionConfig: + """ + Get the underlying AI configuration used to initialize this TrackedChat. + + :return: The AI completion configuration + """ + return self._ai_config + + def get_tracker(self) -> LDAIConfigTracker: + """ + Get the underlying AI configuration tracker used to initialize this TrackedChat. + + :return: The tracker instance + """ + return self._tracker + + def get_provider(self) -> AIProvider: + """ + Get the underlying AI provider instance. + + This provides direct access to the provider for advanced use cases. + + :return: The AI provider instance + """ + return self._provider + + def get_judges(self) -> Dict[str, AIJudge]: + """ + Get the judges associated with this TrackedChat. + + Returns a dictionary of judge instances keyed by their configuration keys. + + :return: Dictionary of judge instances + """ + return self._judges + + def append_messages(self, messages: List[LDMessage]) -> None: + """ + Append messages to the conversation history. 
+ + Adds messages to the conversation history without invoking the model, + which is useful for managing multi-turn conversations or injecting context. + + :param messages: Array of messages to append to the conversation history + """ + self._messages.extend(messages) + + def get_messages(self, include_config_messages: bool = False) -> List[LDMessage]: + """ + Get all messages in the conversation history. + + :param include_config_messages: Whether to include the config messages from the AIConfig. + Defaults to False. + :return: Array of messages. When include_config_messages is True, returns both config + messages and conversation history with config messages prepended. When False, + returns only the conversation history messages. + """ + if include_config_messages: + config_messages = self._ai_config.messages or [] + return config_messages + self._messages + return list(self._messages) diff --git a/ldai/client.py b/ldai/client.py index a8bd888..086e99b 100644 --- a/ldai/client.py +++ b/ldai/client.py @@ -1,245 +1,308 @@ -from dataclasses import dataclass -from typing import Any, Dict, List, Literal, Optional, Tuple +import logging +from typing import Any, Dict, List, Optional, Tuple import chevron from ldclient import Context from ldclient.client import LDClient +from ldai.chat import TrackedChat +from ldai.judge import AIJudge +from ldai.models import (AIAgentConfig, AIAgentConfigDefault, + AIAgentConfigRequest, AIAgents, AICompletionConfig, + AICompletionConfigDefault, AIJudgeConfig, + AIJudgeConfigDefault, JudgeConfiguration, LDMessage, + ModelConfig, ProviderConfig) +from ldai.providers.ai_provider_factory import (AIProviderFactory, + SupportedAIProvider) from ldai.tracker import LDAIConfigTracker -@dataclass -class LDMessage: - role: Literal['system', 'user', 'assistant'] - content: str +class LDAIClient: + """The LaunchDarkly AI SDK client object.""" + + def __init__(self, client: LDClient): + self._client = client + self._logger = logging.getLogger('ldclient.ai') - def to_dict(self) -> dict: + def completion_config( + self, + key: str, + context: Context, + default_value: AICompletionConfigDefault, + variables: Optional[Dict[str, Any]] = None, + ) -> AICompletionConfig: """ - Render the given message as a dictionary object. + Get the value of a completion configuration. + + :param key: The key of the completion configuration. + :param context: The context to evaluate the completion configuration in. + :param default_value: The default value of the completion configuration. + :param variables: Additional variables for the completion configuration. + :return: The completion configuration with a tracker used for gathering metrics. """ - return { - 'role': self.role, - 'content': self.content, - } + self._client.track('$ld:ai:config:function:single', context, key, 1) + model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate( + key, context, default_value.to_dict(), variables + ) -class ModelConfig: - """ - Configuration related to the model. - """ + config = AICompletionConfig( + enabled=bool(enabled), + model=model, + messages=messages, + provider=provider, + tracker=tracker, + judge_configuration=judge_configuration, + ) - def __init__(self, name: str, parameters: Optional[Dict[str, Any]] = None, custom: Optional[Dict[str, Any]] = None): - """ - :param name: The name of the model. - :param parameters: Additional model-specific parameters. - :param custom: Additional customer provided data. 
- """ - self._name = name - self._parameters = parameters - self._custom = custom + return config - @property - def name(self) -> str: - """ - The name of the model. + def config( + self, + key: str, + context: Context, + default_value: AICompletionConfigDefault, + variables: Optional[Dict[str, Any]] = None, + ) -> AICompletionConfig: """ - return self._name + Get the value of a model configuration. - def get_parameter(self, key: str) -> Any: - """ - Retrieve model-specific parameters. + .. deprecated:: Use :meth:`completion_config` instead. This method will be removed in a future version. - Accessing a named, typed attribute (e.g. name) will result in the call - being delegated to the appropriate property. + :param key: The key of the model configuration. + :param context: The context to evaluate the model configuration in. + :param default_value: The default value of the model configuration. + :param variables: Additional variables for the model configuration. + :return: The value of the model configuration along with a tracker used for gathering metrics. """ - if key == 'name': - return self.name - - if self._parameters is None: - return None - - return self._parameters.get(key) + return self.completion_config(key, context, default_value, variables) - def get_custom(self, key: str) -> Any: - """ - Retrieve customer provided data. + def judge_config( + self, + key: str, + context: Context, + default_value: AIJudgeConfigDefault, + variables: Optional[Dict[str, Any]] = None, + ) -> AIJudgeConfig: """ - if self._custom is None: - return None - - return self._custom.get(key) + Get the value of a judge configuration. - def to_dict(self) -> dict: - """ - Render the given model config as a dictionary object. + :param key: The key of the judge configuration. + :param context: The context to evaluate the judge configuration in. + :param default_value: The default value of the judge configuration. + :param variables: Additional variables for the judge configuration. + :return: The judge configuration with a tracker used for gathering metrics. """ - return { - 'name': self._name, - 'parameters': self._parameters, - 'custom': self._custom, - } + self._client.track('$ld:ai:judge:function:single', context, key, 1) + model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate( + key, context, default_value.to_dict(), variables + ) -class ProviderConfig: - """ - Configuration related to the provider. - """ + # Extract evaluation_metric_keys from the variation + variation = self._client.variation(key, context, default_value.to_dict()) + evaluation_metric_keys = variation.get('evaluationMetricKeys', default_value.evaluation_metric_keys or []) - def __init__(self, name: str): - self._name = name + config = AIJudgeConfig( + enabled=bool(enabled), + evaluation_metric_keys=evaluation_metric_keys, + model=model, + messages=messages, + provider=provider, + tracker=tracker, + ) - @property - def name(self) -> str: - """ - The name of the provider. - """ - return self._name + return config - def to_dict(self) -> dict: - """ - Render the given provider config as a dictionary object. + async def create_judge( + self, + key: str, + context: Context, + default_value: AIJudgeConfigDefault, + variables: Optional[Dict[str, Any]] = None, + default_ai_provider: Optional[SupportedAIProvider] = None, + ) -> Optional[AIJudge]: """ - return { - 'name': self._name, - } - + Creates and returns a new Judge instance for AI evaluation. 
-@dataclass(frozen=True) -class AIConfig: - enabled: Optional[bool] = None - model: Optional[ModelConfig] = None - messages: Optional[List[LDMessage]] = None - provider: Optional[ProviderConfig] = None - - def to_dict(self) -> dict: - """ - Render the given default values as an AIConfig-compatible dictionary object. - """ - return { - '_ldMeta': { - 'enabled': self.enabled or False, - }, - 'model': self.model.to_dict() if self.model else None, - 'messages': [message.to_dict() for message in self.messages] if self.messages else None, - 'provider': self.provider.to_dict() if self.provider else None, - } - - -@dataclass(frozen=True) -class LDAIAgent: - """ - Represents an AI agent configuration with instructions and model settings. - - An agent is similar to an AIConfig but focuses on instructions rather than messages, - making it suitable for AI assistant/agent use cases. - """ - enabled: Optional[bool] = None - model: Optional[ModelConfig] = None - provider: Optional[ProviderConfig] = None - instructions: Optional[str] = None - tracker: Optional[LDAIConfigTracker] = None - - def to_dict(self) -> Dict[str, Any]: - """ - Render the given agent as a dictionary object. - """ - result: Dict[str, Any] = { - '_ldMeta': { - 'enabled': self.enabled or False, - }, - 'model': self.model.to_dict() if self.model else None, - 'provider': self.provider.to_dict() if self.provider else None, - } - if self.instructions is not None: - result['instructions'] = self.instructions - return result + :param key: The key identifying the AI judge configuration to use + :param context: Standard Context used when evaluating flags + :param default_value: A default value representing a standard AI config result + :param variables: Dictionary of values for instruction interpolation. + The variables `message_history` and `response_to_evaluate` are reserved for the judge and will be ignored. + :param default_ai_provider: Optional default AI provider to use. + :return: Judge instance or None if disabled/unsupported + Example:: -@dataclass(frozen=True) -class LDAIAgentDefaults: - """ - Default values for AI agent configurations. + judge = client.create_judge( + "relevance-judge", + context, + AIJudgeConfigDefault( + enabled=True, + model=ModelConfig("gpt-4"), + provider=ProviderConfig("openai"), + evaluation_metric_keys=['$ld:ai:judge:relevance'], + messages=[LDMessage(role='system', content='You are a relevance judge.')] + ), + variables={'metric': "relevance"} + ) - Similar to LDAIAgent but without tracker and with optional enabled field, - used as fallback values when agent configurations are not available. 
- """ - enabled: Optional[bool] = None - model: Optional[ModelConfig] = None - provider: Optional[ProviderConfig] = None - instructions: Optional[str] = None + if judge: + result = await judge.evaluate("User question", "AI response") + if result and result.evals: + relevance_eval = result.evals.get('$ld:ai:judge:relevance') + if relevance_eval: + print('Relevance score:', relevance_eval.score) + """ + self._client.track('$ld:ai:judge:function:createJudge', context, key, 1) + + try: + # Warn if reserved variables are provided + if variables: + if 'message_history' in variables: + # Note: Python doesn't have a logger on the client, but we could add one + pass # Would log warning if logger available + if 'response_to_evaluate' in variables: + pass # Would log warning if logger available + + # Overwrite reserved variables to ensure they remain as placeholders for judge evaluation + extended_variables = dict(variables) if variables else {} + extended_variables['message_history'] = '{{message_history}}' + extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}' + + judge_config = self.judge_config(key, context, default_value, extended_variables) + + if not judge_config.enabled or not judge_config.tracker: + # Would log info if logger available + return None + + # Create AI provider for the judge + provider = await AIProviderFactory.create(judge_config, self._logger, default_ai_provider) + if not provider: + return None + + return AIJudge(judge_config, judge_config.tracker, provider, self._logger) + except Exception as error: + # Would log error if logger available + return None - def to_dict(self) -> Dict[str, Any]: - """ - Render the given agent defaults as a dictionary object. + async def _initialize_judges( + self, + judge_configs: List[JudgeConfiguration.Judge], + context: Context, + variables: Optional[Dict[str, Any]] = None, + default_ai_provider: Optional[SupportedAIProvider] = None, + ) -> Dict[str, AIJudge]: """ - result: Dict[str, Any] = { - '_ldMeta': { - 'enabled': self.enabled or False, - }, - 'model': self.model.to_dict() if self.model else None, - 'provider': self.provider.to_dict() if self.provider else None, - } - if self.instructions is not None: - result['instructions'] = self.instructions - return result - + Initialize judges from judge configurations. -@dataclass -class LDAIAgentConfig: - """ - Configuration for individual agent in batch requests. + :param judge_configs: List of judge configurations + :param context: Standard Context used when evaluating flags + :param variables: Dictionary of values for instruction interpolation + :param default_ai_provider: Optional default AI provider to use + :return: Dictionary of judge instances keyed by their configuration keys + """ + judges: Dict[str, AIJudge] = {} - Combines agent key with its specific default configuration and variables. 
- """ - key: str - default_value: LDAIAgentDefaults - variables: Optional[Dict[str, Any]] = None + async def create_judge_for_config(judge_key: str): + judge = await self.create_judge( + judge_key, + context, + AIJudgeConfigDefault(enabled=False), + variables, + default_ai_provider, + ) + return judge_key, judge + judge_promises = [ + create_judge_for_config(judge_config.key) + for judge_config in judge_configs + ] -# Type alias for multiple agents -LDAIAgents = Dict[str, LDAIAgent] + import asyncio + results = await asyncio.gather(*judge_promises, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + continue + judge_key, judge = result # type: ignore[misc] + if judge: + judges[judge_key] = judge -class LDAIClient: - """The LaunchDarkly AI SDK client object.""" + return judges - def __init__(self, client: LDClient): - self._client = client - - def config( + async def create_chat( self, key: str, context: Context, - default_value: AIConfig, + default_value: AICompletionConfigDefault, variables: Optional[Dict[str, Any]] = None, - ) -> Tuple[AIConfig, LDAIConfigTracker]: + default_ai_provider: Optional[SupportedAIProvider] = None, + ) -> Optional[TrackedChat]: """ - Get the value of a model configuration. + Creates and returns a new TrackedChat instance for AI chat conversations. - :param key: The key of the model configuration. - :param context: The context to evaluate the model configuration in. - :param default_value: The default value of the model configuration. - :param variables: Additional variables for the model configuration. - :return: The value of the model configuration along with a tracker used for gathering metrics. + :param key: The key identifying the AI completion configuration to use + :param context: Standard Context used when evaluating flags + :param default_value: A default value representing a standard AI config result + :param variables: Dictionary of values for instruction interpolation + :param default_ai_provider: Optional default AI provider to use + :return: TrackedChat instance or None if disabled/unsupported + + Example:: + + chat = await client.create_chat( + "customer-support-chat", + context, + AICompletionConfigDefault( + enabled=True, + model=ModelConfig("gpt-4"), + provider=ProviderConfig("openai"), + messages=[LDMessage(role='system', content='You are a helpful assistant.')] + ), + variables={'customerName': 'John'} + ) + + if chat: + response = await chat.invoke("I need help with my order") + print(response.message.content) + + # Access conversation history + messages = chat.get_messages() + print(f"Conversation has {len(messages)} messages") """ - self._client.track('$ld:ai:config:function:single', context, key, 1) + self._client.track('$ld:ai:config:function:createChat', context, key, 1) + if self._logger: + self._logger.debug(f"Creating chat for key: {key}") + config = self.completion_config(key, context, default_value, variables) - model, provider, messages, instructions, tracker, enabled = self.__evaluate(key, context, default_value.to_dict(), variables) + if not config.enabled or not config.tracker: + # Would log info if logger available + return None - config = AIConfig( - enabled=bool(enabled), - model=model, - messages=messages, - provider=provider, - ) + provider = await AIProviderFactory.create(config, self._logger, default_ai_provider) + if not provider: + return None - return config, tracker + judges = {} + if config.judge_configuration and config.judge_configuration.judges: + judges = await 
self._initialize_judges( + config.judge_configuration.judges, + context, + variables, + default_ai_provider, + ) - def agent( + return TrackedChat(config, config.tracker, provider, judges, self._logger) + + def agent_config( self, - config: LDAIAgentConfig, + key: str, context: Context, - ) -> LDAIAgent: + default_value: AIAgentConfigDefault, + variables: Optional[Dict[str, Any]] = None, + ) -> AIAgentConfig: """ Retrieve a single AI Config agent. @@ -248,39 +311,58 @@ def agent( Example:: - agent = client.agent(LDAIAgentConfig( - key='research_agent', - default_value=LDAIAgentDefaults( + agent = client.agent_config( + 'research_agent', + context, + AIAgentConfigDefault( enabled=True, model=ModelConfig('gpt-4'), instructions="You are a research assistant specializing in {{topic}}." ), variables={'topic': 'climate change'} - ), context) + ) if agent.enabled: research_result = agent.instructions # Interpolated instructions agent.tracker.track_success() - :param config: The agent configuration to use. + :param key: The agent configuration key. :param context: The context to evaluate the agent configuration in. - :return: Configured LDAIAgent instance. + :param default_value: Default agent values. + :param variables: Variables for interpolation. + :return: Configured AIAgentConfig instance. """ # Track single agent usage self._client.track( "$ld:ai:agent:function:single", context, - config.key, + key, 1 ) - return self.__evaluate_agent(config.key, context, config.default_value, config.variables) + return self.__evaluate_agent(key, context, default_value, variables) - def agents( + def agent( + self, + config: AIAgentConfigRequest, + context: Context, + ) -> AIAgentConfig: + """ + Retrieve a single AI Config agent. + + .. deprecated:: Use :meth:`agent_config` instead. This method will be removed in a future version. + + :param config: The agent configuration to use. + :param context: The context to evaluate the agent configuration in. + :return: Configured AIAgentConfig instance. + """ + return self.agent_config(config.key, context, config.default_value, config.variables) + + def agent_configs( self, - agent_configs: List[LDAIAgentConfig], + agent_configs: List[AIAgentConfigRequest], context: Context, - ) -> LDAIAgents: + ) -> AIAgents: """ Retrieve multiple AI agent configurations. @@ -290,18 +372,18 @@ def agents( Example:: - agents = client.agents([ - LDAIAgentConfig( + agents = client.agent_configs([ + AIAgentConfigRequest( key='research_agent', - default_value=LDAIAgentDefaults( + default_value=AIAgentConfigDefault( enabled=True, instructions='You are a research assistant.' ), variables={'topic': 'climate change'} ), - LDAIAgentConfig( + AIAgentConfigRequest( key='writing_agent', - default_value=LDAIAgentDefaults( + default_value=AIAgentConfigDefault( enabled=True, instructions='You are a writing assistant.' ), @@ -314,7 +396,7 @@ def agents( :param agent_configs: List of agent configurations to retrieve. :param context: The context to evaluate the agent configurations in. - :return: Dictionary mapping agent keys to their LDAIAgent configurations. + :return: Dictionary mapping agent keys to their AIAgentConfig configurations. 
""" # Track multiple agents usage agent_count = len(agent_configs) @@ -325,7 +407,7 @@ def agents( agent_count ) - result: LDAIAgents = {} + result: AIAgents = {} for config in agent_configs: agent = self.__evaluate_agent( @@ -338,13 +420,29 @@ def agents( return result + def agents( + self, + agent_configs: List[AIAgentConfigRequest], + context: Context, + ) -> AIAgents: + """ + Retrieve multiple AI agent configurations. + + .. deprecated:: Use :meth:`agent_configs` instead. This method will be removed in a future version. + + :param agent_configs: List of agent configurations to retrieve. + :param context: The context to evaluate the agent configurations in. + :return: Dictionary mapping agent keys to their AIAgentConfig configurations. + """ + return self.agent_configs(agent_configs, context) + def __evaluate( self, key: str, context: Context, default_dict: Dict[str, Any], variables: Optional[Dict[str, Any]] = None, - ) -> Tuple[Optional[ModelConfig], Optional[ProviderConfig], Optional[List[LDMessage]], Optional[str], LDAIConfigTracker, bool]: + ) -> Tuple[Optional[ModelConfig], Optional[ProviderConfig], Optional[List[LDMessage]], Optional[str], LDAIConfigTracker, bool, Optional[Any]]: """ Internal method to evaluate a configuration and extract components. @@ -411,15 +509,31 @@ def __evaluate( enabled = variation.get('_ldMeta', {}).get('enabled', False) - return model, provider_config, messages, instructions, tracker, enabled + # Extract judge configuration + judge_configuration = None + if 'judgeConfiguration' in variation and isinstance(variation['judgeConfiguration'], dict): + judge_config = variation['judgeConfiguration'] + if 'judges' in judge_config and isinstance(judge_config['judges'], list): + judges = [ + JudgeConfiguration.Judge( + key=judge['key'], + sampling_rate=judge['samplingRate'] + ) + for judge in judge_config['judges'] + if isinstance(judge, dict) and 'key' in judge and 'samplingRate' in judge + ] + if judges: + judge_configuration = JudgeConfiguration(judges=judges) + + return model, provider_config, messages, instructions, tracker, enabled, judge_configuration def __evaluate_agent( self, key: str, context: Context, - default_value: LDAIAgentDefaults, + default_value: AIAgentConfigDefault, variables: Optional[Dict[str, Any]] = None, - ) -> LDAIAgent: + ) -> AIAgentConfig: """ Internal method to evaluate an agent configuration. @@ -427,21 +541,22 @@ def __evaluate_agent( :param context: The evaluation context. :param default_value: Default agent values. :param variables: Variables for interpolation. - :return: Configured LDAIAgent instance. + :return: Configured AIAgentConfig instance. 
""" - model, provider, messages, instructions, tracker, enabled = self.__evaluate( + model, provider, messages, instructions, tracker, enabled, judge_configuration = self.__evaluate( key, context, default_value.to_dict(), variables ) # For agents, prioritize instructions over messages final_instructions = instructions if instructions is not None else default_value.instructions - return LDAIAgent( - enabled=bool(enabled) if enabled is not None else default_value.enabled, + return AIAgentConfig( + enabled=bool(enabled) if enabled is not None else (default_value.enabled or False), model=model or default_value.model, provider=provider or default_value.provider, instructions=final_instructions, tracker=tracker, + judge_configuration=judge_configuration or default_value.judge_configuration, ) def __interpolate_template(self, template: str, variables: Dict[str, Any]) -> str: diff --git a/ldai/judge/__init__.py b/ldai/judge/__init__.py new file mode 100644 index 0000000..3caad65 --- /dev/null +++ b/ldai/judge/__init__.py @@ -0,0 +1,230 @@ +"""Judge implementation for AI evaluation.""" + +import random +from typing import Any, Dict, Optional + +import chevron + +from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder +from ldai.models import AIJudgeConfig, LDMessage +from ldai.providers.ai_provider import AIProvider +from ldai.providers.types import (ChatResponse, EvalScore, JudgeResponse, + StructuredResponse) +from ldai.tracker import LDAIConfigTracker + + +class AIJudge: + """ + Judge implementation that handles evaluation functionality and conversation management. + + According to the AIEval spec, judges are AI Configs with mode: "judge" that evaluate + other AI Configs using structured output. + """ + + def __init__( + self, + ai_config: AIJudgeConfig, + ai_config_tracker: LDAIConfigTracker, + ai_provider: AIProvider, + logger: Optional[Any] = None, + ): + """ + Initialize the Judge. + + :param ai_config: The judge AI configuration + :param ai_config_tracker: The tracker for the judge configuration + :param ai_provider: The AI provider to use for evaluation + :param logger: Optional logger for logging + """ + self._ai_config = ai_config + self._ai_config_tracker = ai_config_tracker + self._ai_provider = ai_provider + self._logger = logger + self._evaluation_response_structure = EvaluationSchemaBuilder.build( + ai_config.evaluation_metric_keys + ) + + async def evaluate( + self, + input_text: str, + output_text: str, + sampling_rate: float = 1.0, + ) -> Optional[JudgeResponse]: + """ + Evaluates an AI response using the judge's configuration. 
+ + :param input_text: The input prompt or question that was provided to the AI + :param output_text: The AI-generated response to be evaluated + :param sampling_rate: Sampling rate (0-1) to determine if evaluation should be processed (defaults to 1) + :return: Evaluation results or None if not sampled + """ + try: + if not self._ai_config.evaluation_metric_keys or len(self._ai_config.evaluation_metric_keys) == 0: + if self._logger: + self._logger.warn( + 'Judge configuration is missing required evaluationMetricKeys' + ) + return None + + if not self._ai_config.messages: + if self._logger: + self._logger.warn('Judge configuration must include messages') + return None + + if random.random() > sampling_rate: + if self._logger: + self._logger.debug(f'Judge evaluation skipped due to sampling rate: {sampling_rate}') + return None + + messages = self._construct_evaluation_messages(input_text, output_text) + + # Track metrics of the structured model invocation + response = await self._ai_config_tracker.track_metrics_of( + lambda result: result.metrics, + lambda: self._ai_provider.invoke_structured_model(messages, self._evaluation_response_structure) + ) + + success = response.metrics.success + + evals = self._parse_evaluation_response(response.data) + + if len(evals) != len(self._ai_config.evaluation_metric_keys): + if self._logger: + self._logger.warn('Judge evaluation did not return all evaluations') + success = False + + return JudgeResponse( + evals=evals, + success=success, + ) + except Exception as error: + if self._logger: + self._logger.error(f'Judge evaluation failed: {error}') + return JudgeResponse( + evals={}, + success=False, + error=str(error) if isinstance(error, Exception) else 'Unknown error', + ) + + async def evaluate_messages( + self, + messages: list[LDMessage], + response: ChatResponse, + sampling_ratio: float = 1.0, + ) -> Optional[JudgeResponse]: + """ + Evaluates an AI response from chat messages and response. + + :param messages: Array of messages representing the conversation history + :param response: The AI response to be evaluated + :param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed (defaults to 1) + :return: Evaluation results or None if not sampled + """ + input_text = '\r\n'.join([msg.content for msg in messages]) if messages else '' + output_text = response.message.content + + return await self.evaluate(input_text, output_text, sampling_ratio) + + def get_ai_config(self) -> AIJudgeConfig: + """ + Returns the AI Config used by this judge. + + :return: The judge AI configuration + """ + return self._ai_config + + def get_tracker(self) -> LDAIConfigTracker: + """ + Returns the tracker associated with this judge. + + :return: The tracker for the judge configuration + """ + return self._ai_config_tracker + + def get_provider(self) -> AIProvider: + """ + Returns the AI provider used by this judge. + + :return: The AI provider + """ + return self._ai_provider + + def _construct_evaluation_messages(self, input_text: str, output_text: str) -> list[LDMessage]: + """ + Constructs evaluation messages by combining judge's config messages with input/output. 
+ + :param input_text: The input text + :param output_text: The output text to evaluate + :return: List of messages for evaluation + """ + if not self._ai_config.messages: + return [] + + messages: list[LDMessage] = [] + for msg in self._ai_config.messages: + # Interpolate message content with reserved variables + content = self._interpolate_message(msg.content, { + 'message_history': input_text, + 'response_to_evaluate': output_text, + }) + messages.append(LDMessage(role=msg.role, content=content)) + + return messages + + def _interpolate_message(self, content: str, variables: Dict[str, str]) -> str: + """ + Interpolates message content with variables using Mustache templating. + + :param content: The message content template + :param variables: Variables to interpolate + :return: Interpolated message content + """ + # Use chevron (Mustache) for templating, with no escaping + return chevron.render(content, variables) + + def _parse_evaluation_response(self, data: Dict[str, Any]) -> Dict[str, EvalScore]: + """ + Parses the structured evaluation response from the AI provider. + + :param data: The structured response data + :return: Dictionary of evaluation scores keyed by metric key + """ + results: Dict[str, EvalScore] = {} + + if not data.get('evaluations') or not isinstance(data['evaluations'], dict): + if self._logger: + self._logger.warn('Invalid response: missing or invalid evaluations object') + return results + + evaluations = data['evaluations'] + + for metric_key in self._ai_config.evaluation_metric_keys: + evaluation = evaluations.get(metric_key) + + if not evaluation or not isinstance(evaluation, dict): + if self._logger: + self._logger.warn(f'Missing evaluation for metric key: {metric_key}') + continue + + score = evaluation.get('score') + reasoning = evaluation.get('reasoning') + + if not isinstance(score, (int, float)) or score < 0 or score > 1: + if self._logger: + self._logger.warn( + f'Invalid score evaluated for {metric_key}: {score}. ' + 'Score must be a number between 0 and 1 inclusive' + ) + continue + + if not isinstance(reasoning, str): + if self._logger: + self._logger.warn( + f'Invalid reasoning evaluated for {metric_key}: {reasoning}. ' + 'Reasoning must be a string' + ) + continue + + results[metric_key] = EvalScore(score=float(score), reasoning=reasoning) + + return results diff --git a/ldai/judge/evaluation_schema_builder.py b/ldai/judge/evaluation_schema_builder.py new file mode 100644 index 0000000..c996f08 --- /dev/null +++ b/ldai/judge/evaluation_schema_builder.py @@ -0,0 +1,74 @@ +"""Internal class for building dynamic evaluation response schemas.""" + +from typing import Any, Dict + + +class EvaluationSchemaBuilder: + """ + Internal class for building dynamic evaluation response schemas. + Not exported - only used internally by Judge. + """ + + @staticmethod + def build(evaluation_metric_keys: list[str]) -> Dict[str, Any]: + """ + Build an evaluation response schema from evaluation metric keys. 
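+
+        Example (abridged, illustrative output shape)::
+
+            EvaluationSchemaBuilder.build(['$ld:ai:judge:relevance'])
+            # Produces a JSON-Schema-style dict roughly like:
+            # {'type': 'object',
+            #  'properties': {'evaluations': {'type': 'object',
+            #      'properties': {'$ld:ai:judge:relevance': {
+            #          'type': 'object',
+            #          'properties': {'score': {'type': 'number', ...},
+            #                         'reasoning': {'type': 'string', ...}},
+            #          'required': ['score', 'reasoning']}},
+            #      'required': ['$ld:ai:judge:relevance']}},
+            #  'required': ['evaluations']}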
+ + :param evaluation_metric_keys: List of evaluation metric keys + :return: Schema dictionary for structured output + """ + return { + 'title': 'EvaluationResponse', + 'description': f"Response containing evaluation results for {', '.join(evaluation_metric_keys)} metrics", + 'type': 'object', + 'properties': { + 'evaluations': { + 'type': 'object', + 'description': f"Object containing evaluation results for {', '.join(evaluation_metric_keys)} metrics", + 'properties': EvaluationSchemaBuilder._build_key_properties(evaluation_metric_keys), + 'required': evaluation_metric_keys, + 'additionalProperties': False, + }, + }, + 'required': ['evaluations'], + 'additionalProperties': False, + } + + @staticmethod + def _build_key_properties(evaluation_metric_keys: list[str]) -> Dict[str, Any]: + """ + Build properties for each evaluation metric key. + + :param evaluation_metric_keys: List of evaluation metric keys + :return: Dictionary of properties for each key + """ + result: Dict[str, Any] = {} + for key in evaluation_metric_keys: + result[key] = EvaluationSchemaBuilder._build_key_schema(key) + return result + + @staticmethod + def _build_key_schema(key: str) -> Dict[str, Any]: + """ + Build schema for a single evaluation metric key. + + :param key: Evaluation metric key + :return: Schema dictionary for the key + """ + return { + 'type': 'object', + 'properties': { + 'score': { + 'type': 'number', + 'minimum': 0, + 'maximum': 1, + 'description': f'Score between 0.0 and 1.0 for {key}', + }, + 'reasoning': { + 'type': 'string', + 'description': f'Reasoning behind the score for {key}', + }, + }, + 'required': ['score', 'reasoning'], + 'additionalProperties': False, + } diff --git a/ldai/models.py b/ldai/models.py new file mode 100644 index 0000000..c075dcf --- /dev/null +++ b/ldai/models.py @@ -0,0 +1,357 @@ +import warnings +from dataclasses import dataclass, field +from typing import Any, Dict, List, Literal, Optional, Union + +from ldai.tracker import LDAIConfigTracker + + +@dataclass +class LDMessage: + role: Literal['system', 'user', 'assistant'] + content: str + + def to_dict(self) -> dict: + """ + Render the given message as a dictionary object. + """ + return { + 'role': self.role, + 'content': self.content, + } + + +class ModelConfig: + """ + Configuration related to the model. + """ + + def __init__(self, name: str, parameters: Optional[Dict[str, Any]] = None, custom: Optional[Dict[str, Any]] = None): + """ + :param name: The name of the model. + :param parameters: Additional model-specific parameters. + :param custom: Additional customer provided data. + """ + self._name = name + self._parameters = parameters + self._custom = custom + + @property + def name(self) -> str: + """ + The name of the model. + """ + return self._name + + def get_parameter(self, key: str) -> Any: + """ + Retrieve model-specific parameters. + + Accessing a named, typed attribute (e.g. name) will result in the call + being delegated to the appropriate property. + """ + if key == 'name': + return self.name + + if self._parameters is None: + return None + + return self._parameters.get(key) + + def get_custom(self, key: str) -> Any: + """ + Retrieve customer provided data. + """ + if self._custom is None: + return None + + return self._custom.get(key) + + def to_dict(self) -> dict: + """ + Render the given model config as a dictionary object. + """ + return { + 'name': self._name, + 'parameters': self._parameters, + 'custom': self._custom, + } + + +class ProviderConfig: + """ + Configuration related to the provider. 
+ """ + + def __init__(self, name: str): + self._name = name + + @property + def name(self) -> str: + """ + The name of the provider. + """ + return self._name + + def to_dict(self) -> dict: + """ + Render the given provider config as a dictionary object. + """ + return { + 'name': self._name, + } + + +# ============================================================================ +# Judge Types +# ============================================================================ + +@dataclass(frozen=True) +class JudgeConfiguration: + """ + Configuration for judge attachment to AI Configs. + """ + + @dataclass(frozen=True) + class Judge: + """ + Configuration for a single judge attachment. + """ + key: str + sampling_rate: float + + def to_dict(self) -> dict: + """ + Render the judge as a dictionary object. + """ + return { + 'key': self.key, + 'samplingRate': self.sampling_rate, + } + + judges: List['JudgeConfiguration.Judge'] + + def to_dict(self) -> dict: + """ + Render the judge configuration as a dictionary object. + """ + return { + 'judges': [judge.to_dict() for judge in self.judges], + } + + +# ============================================================================ +# Base AI Config Types +# ============================================================================ + +@dataclass(frozen=True) +class AIConfigDefault: + """ + Base AI Config interface for default implementations with optional enabled property. + """ + enabled: Optional[bool] = None + model: Optional[ModelConfig] = None + provider: Optional[ProviderConfig] = None + + def _base_to_dict(self) -> Dict[str, Any]: + """ + Render the base config fields as a dictionary object. + """ + return { + '_ldMeta': { + 'enabled': self.enabled or False, + }, + 'model': self.model.to_dict() if self.model else None, + 'provider': self.provider.to_dict() if self.provider else None, + } + + +@dataclass(frozen=True) +class AIConfig: + """ + Base AI Config interface without mode-specific fields. + """ + enabled: bool + model: Optional[ModelConfig] = None + provider: Optional[ProviderConfig] = None + tracker: Optional[LDAIConfigTracker] = None + + def _base_to_dict(self) -> Dict[str, Any]: + """ + Render the base config fields as a dictionary object. + """ + return { + '_ldMeta': { + 'enabled': self.enabled, + }, + 'model': self.model.to_dict() if self.model else None, + 'provider': self.provider.to_dict() if self.provider else None, + } + + +# ============================================================================ +# Completion Config Types +# ============================================================================ + +@dataclass(frozen=True) +class AICompletionConfigDefault(AIConfigDefault): + """ + Default Completion AI Config (default mode). + """ + messages: Optional[List[LDMessage]] = None + judge_configuration: Optional[JudgeConfiguration] = None + + def to_dict(self) -> dict: + """ + Render the given default values as an AICompletionConfigDefault-compatible dictionary object. + """ + result = self._base_to_dict() + result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None + if self.judge_configuration is not None: + result['judgeConfiguration'] = self.judge_configuration.to_dict() + return result + + +@dataclass(frozen=True) +class AICompletionConfig(AIConfig): + """ + Completion AI Config (default mode). 
+ """ + messages: Optional[List[LDMessage]] = None + judge_configuration: Optional[JudgeConfiguration] = None + + def to_dict(self) -> dict: + """ + Render the given completion config as a dictionary object. + """ + result = self._base_to_dict() + result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None + if self.judge_configuration is not None: + result['judgeConfiguration'] = self.judge_configuration.to_dict() + return result + + +# ============================================================================ +# Agent Config Types +# ============================================================================ + +@dataclass(frozen=True) +class AIAgentConfigDefault(AIConfigDefault): + """ + Default Agent-specific AI Config with instructions. + """ + instructions: Optional[str] = None + judge_configuration: Optional[JudgeConfiguration] = None + + def to_dict(self) -> Dict[str, Any]: + """ + Render the given agent config default as a dictionary object. + """ + result = self._base_to_dict() + if self.instructions is not None: + result['instructions'] = self.instructions + if self.judge_configuration is not None: + result['judgeConfiguration'] = self.judge_configuration.to_dict() + return result + + +@dataclass(frozen=True) +class AIAgentConfig(AIConfig): + """ + Agent-specific AI Config with instructions. + """ + instructions: Optional[str] = None + judge_configuration: Optional[JudgeConfiguration] = None + + def to_dict(self) -> Dict[str, Any]: + """ + Render the given agent config as a dictionary object. + """ + result = self._base_to_dict() + if self.instructions is not None: + result['instructions'] = self.instructions + if self.judge_configuration is not None: + result['judgeConfiguration'] = self.judge_configuration.to_dict() + return result + + +# ============================================================================ +# Judge Config Types +# ============================================================================ + +@dataclass(frozen=True) +class AIJudgeConfigDefault(AIConfigDefault): + """ + Default Judge-specific AI Config with required evaluation metric key. + """ + messages: Optional[List[LDMessage]] = None + evaluation_metric_keys: Optional[List[str]] = None + + def to_dict(self) -> dict: + """ + Render the given judge config default as a dictionary object. + """ + result = self._base_to_dict() + result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None + if self.evaluation_metric_keys is not None: + result['evaluationMetricKeys'] = self.evaluation_metric_keys + return result + + +@dataclass(frozen=True) +class AIJudgeConfig(AIConfig): + """ + Judge-specific AI Config with required evaluation metric key. + """ + evaluation_metric_keys: List[str] = field(default_factory=list) + messages: Optional[List[LDMessage]] = None + + def to_dict(self) -> dict: + """ + Render the given judge config as a dictionary object. + """ + result = self._base_to_dict() + result['evaluationMetricKeys'] = self.evaluation_metric_keys + result['messages'] = [message.to_dict() for message in self.messages] if self.messages else None + return result + + +# ============================================================================ +# Agent Request Config +# ============================================================================ + +@dataclass +class AIAgentConfigRequest: + """ + Configuration for a single agent request. + + Combines agent key with its specific default configuration and variables. 
+ """ + key: str + default_value: AIAgentConfigDefault + variables: Optional[Dict[str, Any]] = None + + +# Type alias for multiple agents +AIAgents = Dict[str, AIAgentConfig] + +# Type alias for all AI Config variants +AIConfigKind = Union[AIAgentConfig, AICompletionConfig, AIJudgeConfig] + + +# ============================================================================ +# Deprecated Type Aliases for Backward Compatibility +# ============================================================================ + +# Note: AIConfig is now defined above as a base class (line 169). +# For backward compatibility, code should migrate to: +# - Use AICompletionConfigDefault for default/input values +# - Use AICompletionConfig for return values + +# Deprecated: Use AIAgentConfigDefault instead +LDAIAgentDefaults = AIAgentConfigDefault + +# Deprecated: Use AIAgentConfigRequest instead +LDAIAgentConfig = AIAgentConfigRequest + +# Deprecated: Use AIAgentConfig instead (note: this was the old return type) +LDAIAgent = AIAgentConfig diff --git a/ldai/providers/__init__.py b/ldai/providers/__init__.py new file mode 100644 index 0000000..48152cc --- /dev/null +++ b/ldai/providers/__init__.py @@ -0,0 +1,28 @@ +"""AI Provider interfaces and factory for LaunchDarkly AI SDK.""" + +from ldai.providers.ai_provider import AIProvider +from ldai.providers.ai_provider_factory import (AIProviderFactory, + SupportedAIProvider) + +# Export LangChain provider if available +# TODO: Uncomment when langchain provider package is introduced +# try: +# from ldai.providers.langchain import LangChainProvider +# __all__ = [ +# 'AIProvider', +# 'AIProviderFactory', +# 'LangChainProvider', +# 'SupportedAIProvider', +# ] +# except ImportError: +# __all__ = [ +# 'AIProvider', +# 'AIProviderFactory', +# 'SupportedAIProvider', +# ] + +__all__ = [ + 'AIProvider', + 'AIProviderFactory', + 'SupportedAIProvider', +] diff --git a/ldai/providers/ai_provider.py b/ldai/providers/ai_provider.py new file mode 100644 index 0000000..cc7b21e --- /dev/null +++ b/ldai/providers/ai_provider.py @@ -0,0 +1,95 @@ +"""Abstract base class for AI providers.""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Union + +from ldai.models import AIConfigKind, LDMessage +from ldai.providers.types import ChatResponse, StructuredResponse + + +class AIProvider(ABC): + """ + Abstract base class for AI providers that implement chat model functionality. + + This class provides the contract that all provider implementations must follow + to integrate with LaunchDarkly's tracking and configuration capabilities. + + Following the AICHAT spec recommendation to use base classes with non-abstract methods + for better extensibility and backwards compatibility. + """ + + def __init__(self, logger: Optional[Any] = None): + """ + Initialize the AI provider. + + :param logger: Optional logger for logging provider operations. + """ + self.logger = logger + + async def invoke_model(self, messages: List[LDMessage]) -> ChatResponse: + """ + Invoke the chat model with an array of messages. + + This method should convert messages to provider format, invoke the model, + and return a ChatResponse with the result and metrics. + + Default implementation takes no action and returns a placeholder response. + Provider implementations should override this method. 
+ + :param messages: Array of LDMessage objects representing the conversation + :return: ChatResponse containing the model's response + """ + if self.logger: + self.logger.warn('invokeModel not implemented by this provider') + + from ldai.models import LDMessage + from ldai.providers.types import LDAIMetrics + + return ChatResponse( + message=LDMessage(role='assistant', content=''), + metrics=LDAIMetrics(success=False, usage=None), + ) + + async def invoke_structured_model( + self, + messages: List[LDMessage], + response_structure: Dict[str, Any], + ) -> StructuredResponse: + """ + Invoke the chat model with structured output support. + + This method should convert messages to provider format, invoke the model with + structured output configuration, and return a structured response. + + Default implementation takes no action and returns a placeholder response. + Provider implementations should override this method. + + :param messages: Array of LDMessage objects representing the conversation + :param response_structure: Dictionary of output configurations keyed by output name + :return: StructuredResponse containing the structured data + """ + if self.logger: + self.logger.warn('invokeStructuredModel not implemented by this provider') + + from ldai.providers.types import LDAIMetrics + + return StructuredResponse( + data={}, + raw_response='', + metrics=LDAIMetrics(success=False, usage=None), + ) + + @staticmethod + @abstractmethod + async def create(ai_config: AIConfigKind, logger: Optional[Any] = None) -> 'AIProvider': + """ + Static method that constructs an instance of the provider. + + Each provider implementation must provide their own static create method + that accepts an AIConfigKind and returns a configured instance. + + :param ai_config: The LaunchDarkly AI configuration + :param logger: Optional logger for the provider + :return: Configured provider instance + """ + raise NotImplementedError('Provider implementations must override the static create method') diff --git a/ldai/providers/ai_provider_factory.py b/ldai/providers/ai_provider_factory.py new file mode 100644 index 0000000..3fd0f50 --- /dev/null +++ b/ldai/providers/ai_provider_factory.py @@ -0,0 +1,171 @@ +"""Factory for creating AIProvider instances based on the provider configuration.""" + +import importlib +from typing import Any, Dict, List, Literal, Optional, Tuple, Type + +from ldai.models import AIConfigKind +from ldai.providers.ai_provider import AIProvider + +# List of supported AI providers +SUPPORTED_AI_PROVIDERS: List[str] = [ + # Multi-provider packages should be last in the list + # 'langchain', # TODO: Uncomment when langchain provider package is introduced +] + +# Type representing the supported AI providers +# TODO: Update this type when provider packages are introduced +# SupportedAIProvider = Literal['langchain'] +SupportedAIProvider = Literal['none'] # Placeholder until providers are added + + +class AIProviderFactory: + """ + Factory for creating AIProvider instances based on the provider configuration. + """ + + @staticmethod + async def create( + ai_config: AIConfigKind, + logger: Optional[Any] = None, + default_ai_provider: Optional[SupportedAIProvider] = None, + ) -> Optional[AIProvider]: + """ + Create an AIProvider instance based on the AI configuration. + + This method attempts to load provider-specific implementations dynamically. + Returns None if the provider is not supported. 
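+
+        Example (illustrative)::
+
+            provider = await AIProviderFactory.create(ai_config, logger)
+            if provider is None:
+                # No supported provider package is installed or configured.
+                ...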
+ + :param ai_config: The AI configuration + :param logger: Optional logger for logging provider initialization + :param default_ai_provider: Optional default AI provider to use + :return: AIProvider instance or None if not supported + """ + provider_name = ai_config.provider.name.lower() if ai_config.provider else None + # Determine which providers to try based on default_ai_provider + providers_to_try = AIProviderFactory._get_providers_to_try(default_ai_provider, provider_name) + + # Try each provider in order + for provider_type in providers_to_try: + provider = await AIProviderFactory._try_create_provider(provider_type, ai_config, logger) + if provider: + return provider + + # If no provider was successfully created, log a warning + if logger: + logger.warn( + f"Provider is not supported or failed to initialize: {provider_name or 'unknown'}" + ) + return None + + @staticmethod + def _get_providers_to_try( + default_ai_provider: Optional[SupportedAIProvider], + provider_name: Optional[str], + ) -> List[SupportedAIProvider]: + """ + Determine which providers to try based on default_ai_provider and provider_name. + + :param default_ai_provider: Optional default provider to use + :param provider_name: Optional provider name from config + :return: List of providers to try in order + """ + # If default_ai_provider is set, only try that specific provider + if default_ai_provider: + return [default_ai_provider] + + # If no default_ai_provider is set, try all providers in order + provider_set = set() + + # First try the specific provider if it's supported + if provider_name and provider_name in SUPPORTED_AI_PROVIDERS: + provider_set.add(provider_name) # type: ignore + + # Then try multi-provider packages, but avoid duplicates + # TODO: Uncomment when langchain provider package is introduced + # multi_provider_packages: List[SupportedAIProvider] = ['langchain'] + # for provider in multi_provider_packages: + # provider_set.add(provider) + + # Return list of providers, converting from set + # The set contains strings that should be valid SupportedAIProvider values + return list(provider_set) # type: ignore[arg-type] + + @staticmethod + async def _try_create_provider( + provider_type: SupportedAIProvider, + ai_config: AIConfigKind, + logger: Optional[Any] = None, + ) -> Optional[AIProvider]: + """ + Try to create a provider of the specified type. + + :param provider_type: Type of provider to create + :param ai_config: AI configuration + :param logger: Optional logger + :return: AIProvider instance or None if creation failed + """ + # Handle built-in providers (part of this package) + # TODO: Uncomment when langchain provider package is introduced + # if provider_type == 'langchain': + # try: + # from ldai.providers.langchain import LangChainProvider + # return await LangChainProvider.create(ai_config, logger) + # except ImportError as error: + # if logger: + # logger.warn( + # f"Error creating LangChainProvider: {error}. " + # f"Make sure langchain and langchain-core packages are installed." 
+        #             )
+        #     return None
+
+        # For future external providers, use dynamic import
+        provider_mappings: Dict[str, Tuple[str, str]] = {
+            # 'openai': ('launchdarkly_server_sdk_ai_openai', 'OpenAIProvider'),
+            # 'vercel': ('launchdarkly_server_sdk_ai_vercel', 'VercelProvider'),
+        }
+
+        if provider_type not in provider_mappings:
+            return None
+
+        package_name, provider_class_name = provider_mappings[provider_type]
+        return await AIProviderFactory._create_provider(
+            package_name, provider_class_name, ai_config, logger
+        )
+
+    @staticmethod
+    async def _create_provider(
+        package_name: str,
+        provider_class_name: str,
+        ai_config: AIConfigKind,
+        logger: Optional[Any] = None,
+    ) -> Optional[AIProvider]:
+        """
+        Create a provider instance dynamically.
+
+        :param package_name: Name of the package containing the provider
+        :param provider_class_name: Name of the provider class
+        :param ai_config: AI configuration
+        :param logger: Optional logger
+        :return: AIProvider instance or None if creation failed
+        """
+        try:
+            # Try to dynamically import the provider
+            # This will work if the package is installed
+            module = importlib.import_module(package_name)
+            provider_class: Type[AIProvider] = getattr(module, provider_class_name)
+
+            provider = await provider_class.create(ai_config, logger)
+            if logger:
+                logger.debug(
+                    f"Successfully created AIProvider for: {ai_config.provider.name if ai_config.provider else 'unknown'} "
+                    f"with package {package_name}"
+                )
+            return provider
+        except Exception as error:
+            # If the provider is not available or creation fails, return None
+            if logger:
+                logger.warn(
+                    f"Error creating AIProvider for: {ai_config.provider.name if ai_config.provider else 'unknown'} "
+                    f"with package {package_name}: {error}"
+                )
+            return None
diff --git a/ldai/providers/types.py b/ldai/providers/types.py
new file mode 100644
index 0000000..de54698
--- /dev/null
+++ b/ldai/providers/types.py
@@ -0,0 +1,91 @@
+"""Types for AI provider responses."""
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from ldai.models import LDMessage
+from ldai.tracker import TokenUsage
+
+
+@dataclass
+class LDAIMetrics:
+    """
+    Metrics information for AI operations that includes success status and token usage.
+    """
+    success: bool
+    usage: Optional[TokenUsage] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Render the metrics as a dictionary object.
+        """
+        result: Dict[str, Any] = {
+            'success': self.success,
+        }
+        if self.usage is not None:
+            result['usage'] = {
+                'total': self.usage.total,
+                'input': self.usage.input,
+                'output': self.usage.output,
+            }
+        return result
+
+
+@dataclass
+class ChatResponse:
+    """
+    Chat response structure.
+    """
+    message: LDMessage
+    metrics: LDAIMetrics
+    evaluations: Optional[List[Any]] = None  # Pending judge evaluation tasks that resolve to JudgeResponse objects
+
+
+@dataclass
+class StructuredResponse:
+    """
+    Structured response from AI models.
+    """
+    data: Dict[str, Any]
+    raw_response: str
+    metrics: LDAIMetrics
+
+
+@dataclass
+class EvalScore:
+    """
+    Score and reasoning for a single evaluation metric.
+    """
+    score: float  # Score between 0.0 and 1.0
+    reasoning: str  # Reasoning behind the provided score
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Render the evaluation score as a dictionary object.
+        """
+        return {
+            'score': self.score,
+            'reasoning': self.reasoning,
+        }
+
+
+@dataclass
+class JudgeResponse:
+    """
+    Response from a judge evaluation containing scores and reasoning for multiple metrics.
+ """ + evals: Dict[str, EvalScore] # Dictionary where keys are metric names and values contain score and reasoning + success: bool # Whether the evaluation completed successfully + error: Optional[str] = None # Error message if evaluation failed + + def to_dict(self) -> Dict[str, Any]: + """ + Render the judge response as a dictionary object. + """ + result: Dict[str, Any] = { + 'evals': {key: eval_score.to_dict() for key, eval_score in self.evals.items()}, + 'success': self.success, + } + if self.error is not None: + result['error'] = self.error + return result diff --git a/ldai/testing/test_agents.py b/ldai/testing/test_agents.py index b2e80c0..755f2e5 100644 --- a/ldai/testing/test_agents.py +++ b/ldai/testing/test_agents.py @@ -2,8 +2,8 @@ from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData -from ldai.client import (LDAIAgentConfig, LDAIAgentDefaults, LDAIClient, - ModelConfig, ProviderConfig) +from ldai import (LDAIAgentConfig, LDAIAgentDefaults, LDAIClient, ModelConfig, + ProviderConfig) @pytest.fixture diff --git a/ldai/testing/test_model_config.py b/ldai/testing/test_model_config.py index 1ffc033..26a02c9 100644 --- a/ldai/testing/test_model_config.py +++ b/ldai/testing/test_model_config.py @@ -2,7 +2,8 @@ from ldclient import Config, Context, LDClient from ldclient.integrations.test_data import TestData -from ldai.client import AIConfig, LDAIClient, LDMessage, ModelConfig +from ldai import LDAIClient, LDMessage, ModelConfig +from ldai.models import AICompletionConfigDefault @pytest.fixture @@ -133,14 +134,14 @@ def test_model_config_handles_custom(): def test_uses_default_on_invalid_flag(ldai_client: LDAIClient): context = Context.create('user-key') - default_value = AIConfig( + default_value = AICompletionConfigDefault( enabled=True, model=ModelConfig('fakeModel', parameters={'temperature': 0.5, 'maxTokens': 4096}), messages=[LDMessage(role='system', content='Hello, {{name}}!')], ) variables = {'name': 'World'} - config, _ = ldai_client.config('missing-flag', context, default_value, variables) + config = ldai_client.config('missing-flag', context, default_value, variables) assert config.messages is not None assert len(config.messages) > 0 @@ -155,14 +156,14 @@ def test_uses_default_on_invalid_flag(ldai_client: LDAIClient): def test_model_config_interpolation(ldai_client: LDAIClient): context = Context.create('user-key') - default_value = AIConfig( + default_value = AICompletionConfigDefault( enabled=True, model=ModelConfig('fakeModel'), messages=[LDMessage(role='system', content='Hello, {{name}}!')], ) variables = {'name': 'World'} - config, _ = ldai_client.config('model-config', context, default_value, variables) + config = ldai_client.config('model-config', context, default_value, variables) assert config.messages is not None assert len(config.messages) > 0 @@ -177,9 +178,9 @@ def test_model_config_interpolation(ldai_client: LDAIClient): def test_model_config_no_variables(ldai_client: LDAIClient): context = Context.create('user-key') - default_value = AIConfig(enabled=True, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=True, model=ModelConfig('fake-model'), messages=[]) - config, _ = ldai_client.config('model-config', context, default_value, {}) + config = ldai_client.config('model-config', context, default_value, {}) assert config.messages is not None assert len(config.messages) > 0 @@ -194,10 +195,10 @@ def test_model_config_no_variables(ldai_client: LDAIClient): def 
test_provider_config_handling(ldai_client: LDAIClient): context = Context.builder('user-key').name("Sandy").build() - default_value = AIConfig(enabled=True, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=True, model=ModelConfig('fake-model'), messages=[]) variables = {'name': 'World'} - config, _ = ldai_client.config('model-config', context, default_value, variables) + config = ldai_client.config('model-config', context, default_value, variables) assert config.provider is not None assert config.provider.name == 'fakeProvider' @@ -205,10 +206,10 @@ def test_provider_config_handling(ldai_client: LDAIClient): def test_context_interpolation(ldai_client: LDAIClient): context = Context.builder('user-key').name("Sandy").set('last', 'Beaches').build() - default_value = AIConfig(enabled=True, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=True, model=ModelConfig('fake-model'), messages=[]) variables = {'name': 'World'} - config, _ = ldai_client.config( + config = ldai_client.config( 'ctx-interpolation', context, default_value, variables ) @@ -228,10 +229,10 @@ def test_multi_context_interpolation(ldai_client: LDAIClient): user_context = Context.builder('user-key').name("Sandy").build() org_context = Context.builder('org-key').kind('org').name("LaunchDarkly").set('shortname', 'LD').build() context = Context.multi_builder().add(user_context).add(org_context).build() - default_value = AIConfig(enabled=True, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=True, model=ModelConfig('fake-model'), messages=[]) variables = {'name': 'World'} - config, _ = ldai_client.config( + config = ldai_client.config( 'multi-ctx-interpolation', context, default_value, variables ) @@ -249,10 +250,10 @@ def test_multi_context_interpolation(ldai_client: LDAIClient): def test_model_config_multiple(ldai_client: LDAIClient): context = Context.create('user-key') - default_value = AIConfig(enabled=True, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=True, model=ModelConfig('fake-model'), messages=[]) variables = {'name': 'World', 'day': 'Monday'} - config, _ = ldai_client.config( + config = ldai_client.config( 'multiple-messages', context, default_value, variables ) @@ -270,9 +271,9 @@ def test_model_config_multiple(ldai_client: LDAIClient): def test_model_config_disabled(ldai_client: LDAIClient): context = Context.create('user-key') - default_value = AIConfig(enabled=False, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=False, model=ModelConfig('fake-model'), messages=[]) - config, _ = ldai_client.config('off-config', context, default_value, {}) + config = ldai_client.config('off-config', context, default_value, {}) assert config.model is not None assert config.enabled is False @@ -283,9 +284,9 @@ def test_model_config_disabled(ldai_client: LDAIClient): def test_model_initial_config_disabled(ldai_client: LDAIClient): context = Context.create('user-key') - default_value = AIConfig(enabled=False, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=False, model=ModelConfig('fake-model'), messages=[]) - config, _ = ldai_client.config('initial-config-disabled', context, default_value, {}) + config = ldai_client.config('initial-config-disabled', context, default_value, {}) assert config.enabled is False assert config.model is None @@ 
-295,9 +296,9 @@ def test_model_initial_config_disabled(ldai_client: LDAIClient): def test_model_initial_config_enabled(ldai_client: LDAIClient): context = Context.create('user-key') - default_value = AIConfig(enabled=False, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=False, model=ModelConfig('fake-model'), messages=[]) - config, _ = ldai_client.config('initial-config-enabled', context, default_value, {}) + config = ldai_client.config('initial-config-enabled', context, default_value, {}) assert config.enabled is True assert config.model is None @@ -318,9 +319,9 @@ def test_config_method_tracking(ldai_client: LDAIClient): client = LDAIClient(mock_client) context = Context.create('user-key') - default_value = AIConfig(enabled=False, model=ModelConfig('fake-model'), messages=[]) + default_value = AICompletionConfigDefault(enabled=False, model=ModelConfig('fake-model'), messages=[]) - config, tracker = client.config('test-config-key', context, default_value) + config = client.config('test-config-key', context, default_value) mock_client.track.assert_called_once_with( '$ld:ai:config:function:single', diff --git a/ldai/testing/test_tracker.py b/ldai/testing/test_tracker.py index 19c8161..2e39d98 100644 --- a/ldai/testing/test_tracker.py +++ b/ldai/testing/test_tracker.py @@ -276,7 +276,8 @@ def test_tracks_bedrock_metrics_with_error(client: LDClient): assert tracker.get_summary().usage == TokenUsage(330, 220, 110) -def test_tracks_openai_metrics(client: LDClient): +@pytest.mark.asyncio +async def test_tracks_openai_metrics(client: LDClient): context = Context.create("user-key") tracker = LDAIConfigTracker(client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context) @@ -292,7 +293,10 @@ def to_dict(self): "completion_tokens": 110, } - tracker.track_openai_metrics(lambda: Result()) + async def get_result(): + return Result() + + await tracker.track_openai_metrics(get_result) calls = [ call( @@ -326,15 +330,16 @@ def to_dict(self): assert tracker.get_summary().usage == TokenUsage(330, 220, 110) -def test_tracks_openai_metrics_with_exception(client: LDClient): +@pytest.mark.asyncio +async def test_tracks_openai_metrics_with_exception(client: LDClient): context = Context.create("user-key") tracker = LDAIConfigTracker(client, "variation-key", "config-key", 3, "fakeModel", "fakeProvider", context) - def raise_exception(): + async def raise_exception(): raise ValueError("Something went wrong") try: - tracker.track_openai_metrics(raise_exception) + await tracker.track_openai_metrics(raise_exception) assert False, "Should have thrown an exception" except ValueError: pass diff --git a/ldai/tracker.py b/ldai/tracker.py index a049952..11b846a 100644 --- a/ldai/tracker.py +++ b/ldai/tracker.py @@ -1,7 +1,7 @@ import time from dataclasses import dataclass from enum import Enum -from typing import Dict, Optional +from typing import Any, Dict, Optional from ldclient import Context, LDClient @@ -144,7 +144,7 @@ def track_duration_of(self, func): An exception occurring during the execution of the function will still track the duration. The exception will be re-thrown. - :param func: Function to track. + :param func: Function to track (synchronous only). :return: Result of the tracked function. """ start_time = time.time() @@ -157,6 +157,90 @@ def track_duration_of(self, func): return result + async def track_metrics_of(self, metrics_extractor, func): + """ + Track metrics for a generic AI operation. 
+
+        This function will track the duration of the operation, extract metrics using the provided
+        metrics extractor function, and track success or error status accordingly.
+
+        If the provided function throws, this method records the duration and an error,
+        then re-raises the exception. A failed operation will not have any token usage data.
+
+        :param metrics_extractor: Function that extracts LDAIMetrics from the operation result
+        :param func: Async function which executes the operation
+        :return: The result of the operation
+        """
+        start_time = time.time()
+        result = None
+        try:
+            result = await func()
+        except Exception as err:
+            end_time = time.time()
+            duration = int((end_time - start_time) * 1000)
+            self.track_duration(duration)
+            self.track_error()
+            raise err
+
+        # Track duration after successful call
+        end_time = time.time()
+        duration = int((end_time - start_time) * 1000)
+        self.track_duration(duration)
+
+        # Extract metrics from the result of the successful AI call
+        metrics = metrics_extractor(result)
+
+        # Track success/error based on metrics
+        if metrics.success:
+            self.track_success()
+        else:
+            self.track_error()
+
+        # Track token usage if available
+        if metrics.usage:
+            self.track_tokens(metrics.usage)
+
+        return result
+
+    def track_eval_scores(self, scores: Dict[str, Any]) -> None:
+        """
+        Track evaluation scores for multiple metrics.
+
+        :param scores: Dictionary mapping metric keys to their evaluation scores (EvalScore objects)
+        """
+        from ldai.providers.types import EvalScore
+
+        # Track each evaluation score individually
+        for metric_key, eval_score in scores.items():
+            if isinstance(eval_score, EvalScore):
+                self._ld_client.track(
+                    metric_key,
+                    self._context,
+                    self.__get_track_data(),
+                    eval_score.score
+                )
+
+    def track_judge_response(self, judge_response: Any) -> None:
+        """
+        Track a judge response, including evaluation scores and success status.
+
+        :param judge_response: JudgeResponse object containing evals and success status
+        """
+        from ldai.providers.types import JudgeResponse
+
+        if isinstance(judge_response, JudgeResponse):
+            # Track evaluation scores
+            if judge_response.evals:
+                self.track_eval_scores(judge_response.evals)
+
+            # Track success/error based on judge response
+            if judge_response.success:
+                self.track_success()
+            else:
+                self.track_error()
+
     def track_feedback(self, feedback: Dict[str, FeedbackKind]) -> None:
         """
         Track user feedback for an AI operation.
@@ -197,7 +281,7 @@ def track_error(self) -> None:
             "$ld:ai:generation:error", self._context, self.__get_track_data(), 1
         )
 
-    def track_openai_metrics(self, func):
+    async def track_openai_metrics(self, func):
         """
         Track OpenAI-specific operations.
 
@@ -211,15 +295,22 @@ def to_dict(self):
 
         A failed operation will not have any token usage data.
 
-        :param func: Function to track.
+        :param func: Async function to track.
         :return: Result of the tracked function.
""" + start_time = time.time() try: - result = self.track_duration_of(func) + result = await func() + end_time = time.time() + duration = int((end_time - start_time) * 1000) + self.track_duration(duration) self.track_success() if hasattr(result, "usage") and hasattr(result.usage, "to_dict"): self.track_tokens(_openai_to_token_usage(result.usage.to_dict())) except Exception: + end_time = time.time() + duration = int((end_time - start_time) * 1000) + self.track_duration(duration) self.track_error() raise diff --git a/pyproject.toml b/pyproject.toml index 200215c..9c1f44a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ chevron = "=0.14.0" pytest = ">=2.8" pytest-cov = ">=2.4.0" pytest-mypy = "==1.0.1" +pytest-asyncio = ">=0.21.0" mypy = "==1.18.2" pycodestyle = "^2.12.1" isort = ">=5.13.2,<7.0.0" diff --git a/setup.cfg b/setup.cfg index c178190..1fb1827 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,2 @@ [pycodestyle] -ignore = E501 +ignore = E501,W503