From 486295ab5a467d8dbc19251fdb1178e9ca736154 Mon Sep 17 00:00:00 2001 From: itzlambda Date: Sun, 5 Oct 2025 18:04:49 +0530 Subject: [PATCH] refactor(middleware): remove retry and circuit breaker logic - Remove RetryPolicy configuration and related retry logic - Remove circuit breaker functionality from middleware stack - Simplify middleware to focus on core features: rate limiting, timeouts, connection pooling, and metrics - Update Cargo.toml to remove retry Tower feature - Update CLAUDE.md to reflect simplified middleware architecture - Remove retry_policy_example.rs and simplify middleware_usage.rs example - Update all example documentation to remove references to retry/circuit breaker features BREAKING CHANGE: Retry and circuit breaker middleware has been removed. Applications relying on these features will need to implement them at the application level or use external libraries. --- CLAUDE.md | 5 +- Cargo.toml | 2 +- crates/rullm-core/examples/README.md | 10 +- .../rullm-core/examples/anthropic_stream.rs | 4 +- crates/rullm-core/examples/basic_usage.rs | 12 +- crates/rullm-core/examples/gemini_stream.rs | 2 +- .../rullm-core/examples/middleware_usage.rs | 89 +---- .../examples/retry_policy_example.rs | 50 --- crates/rullm-core/src/config.rs | 236 +---------- crates/rullm-core/src/error.rs | 28 -- crates/rullm-core/src/lib.rs | 52 +-- crates/rullm-core/src/middleware.rs | 133 +------ crates/rullm-core/src/providers/google.rs | 2 +- crates/rullm-core/src/simple.rs | 12 - crates/rullm-core/src/tests.rs | 372 +----------------- 15 files changed, 70 insertions(+), 939 deletions(-) delete mode 100644 crates/rullm-core/examples/retry_policy_example.rs diff --git a/CLAUDE.md b/CLAUDE.md index 32cd4aa9..763940cd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,12 +29,11 @@ The `openai_compatible` provider is a generic implementation that other provider ### Middleware Stack -The library uses Tower middleware for enterprise features (see `crates/rullm-core/src/middleware.rs`): -- Retry logic with exponential backoff +The library uses Tower middleware (see `crates/rullm-core/src/middleware.rs`): - Rate limiting -- Circuit breakers - Timeouts - Connection pooling +- Logging and metrics Configuration is done via `MiddlewareConfig` and `LlmServiceBuilder`. 
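Editor's note: the BREAKING CHANGE above moves retry (and circuit breaking) to the application layer, so a minimal sketch of caller-side retry around the simplified middleware stack is shown below. It uses only the `LlmServiceBuilder`, `ChatRequestBuilder`, and `ConfigBuilder` APIs that appear in this patch; it additionally assumes `ChatRequest` is `Clone` (the removed middleware retry relied on `request.clone()`) and that `OPENAI_API_KEY` is set. The attempt count and backoff values are illustrative, not library defaults.

```rust
use rullm_core::{ChatRequestBuilder, ConfigBuilder, LlmServiceBuilder, OpenAIProvider};
use std::time::Duration;

/// Sketch of application-level retry now that the middleware no longer retries.
async fn chat_with_retry() -> Result<(), Box<dyn std::error::Error>> {
    let config = ConfigBuilder::openai_from_env()?;
    let provider = OpenAIProvider::new(config)?;

    // Simplified stack: timeout + logging only (no retry, no circuit breaker).
    let mut stack = LlmServiceBuilder::new()
        .timeout(Duration::from_secs(30))
        .logging()
        .build(provider, "gpt-3.5-turbo".to_string());

    let request = ChatRequestBuilder::new()
        .user("Hello, world!")
        .max_tokens(50)
        .build();

    let max_attempts = 3;
    let mut delay = Duration::from_millis(200);
    let mut last_err = None;

    for attempt in 1..=max_attempts {
        match stack.call(request.clone()).await {
            Ok(response) => {
                println!("Response: {}", response.message.content);
                return Ok(());
            }
            Err(err) => {
                eprintln!("Attempt {attempt} failed: {err}");
                last_err = Some(err);
                if attempt < max_attempts {
                    // Plain exponential backoff; tune or swap in a retry crate.
                    tokio::time::sleep(delay).await;
                    delay *= 2;
                }
            }
        }
    }

    Err(last_err.expect("at least one attempt was made").into())
}
```

A caller-side circuit breaker can wrap `stack.call` in the same way, or both concerns can come from an external crate, as the commit message suggests.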
diff --git a/Cargo.toml b/Cargo.toml index b0336069..26ba3f6a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ rust-version = "1.85" [workspace.dependencies] # Core library dependencies tokio = { version = "1", features = ["full"] } -tower = { version = "0.4", features = ["timeout", "retry", "limit", "util"] } +tower = { version = "0.4", features = ["timeout", "limit", "util"] } rand = "0.8" reqwest = { version = "0.11", features = ["json", "stream"] } bytes = "1.0" diff --git a/crates/rullm-core/examples/README.md b/crates/rullm-core/examples/README.md index 56967238..3028f325 100644 --- a/crates/rullm-core/examples/README.md +++ b/crates/rullm-core/examples/README.md @@ -40,7 +40,7 @@ All streaming examples use the `chat_completion_stream` method which returns a ` **Environment:** Requires `OPENAI_API_KEY` -Demonstrates comprehensive OpenAI streaming with: +Demonstrates OpenAI streaming with: - **Simple streaming chat** with real-time token display - **Multi-turn conversations** with context preservation - **Creative writing** with high temperature settings @@ -87,7 +87,7 @@ while let Some(event) = stream.next().await { **Environment:** Requires `ANTHROPIC_API_KEY` -Showcases Claude's capabilities with: +Shows Claude streaming with: - **Philosophical conversations** demonstrating reasoning abilities - **Creative storytelling** with vivid imagery - **Code explanation** with technical accuracy @@ -124,7 +124,7 @@ let mut stream = provider **Environment:** Requires `GOOGLE_API_KEY` -Highlights Gemini's versatility: +Shows Gemini streaming with: - **Technical explanations** with precision - **Creative writing** using experimental models - **Code analysis** and review capabilities @@ -265,7 +265,7 @@ Demonstrates: Key features: - **Environment-based configuration** -- **Custom endpoints** for enterprise setups +- **Custom endpoints** for custom API URLs - **Validation and error handling** - **Health checks** and model availability - **Request builder patterns** from minimal to full-featured @@ -434,7 +434,7 @@ cargo run --example test_all_providers šŸŽ‰ All providers are working correctly! 
``` -This example is perfect for: +Use this example for: - Verifying your API keys work - Testing network connectivity - Validating provider implementations diff --git a/crates/rullm-core/examples/anthropic_stream.rs b/crates/rullm-core/examples/anthropic_stream.rs index 40cb5003..08d427af 100644 --- a/crates/rullm-core/examples/anthropic_stream.rs +++ b/crates/rullm-core/examples/anthropic_stream.rs @@ -183,8 +183,8 @@ async fn main() -> Result<(), Box> { println!("\n\nšŸŽÆ Tips for using Anthropic Claude streaming:"); println!("• Set ANTHROPIC_API_KEY environment variable"); println!("• Use .stream(true) in ChatRequestBuilder"); - println!("• Claude models: haiku (fast), sonnet (balanced), opus (powerful)"); - println!("• Claude excels at reasoning, analysis, and creative writing"); + println!("• Claude models: haiku (fast), sonnet (balanced), opus (largest)"); + println!("• Claude supports reasoning, analysis, and creative writing"); println!("• Lower temperature (0.1-0.4) for factual content"); println!("• Higher temperature (0.7-1.0) for creative content"); diff --git a/crates/rullm-core/examples/basic_usage.rs b/crates/rullm-core/examples/basic_usage.rs index deacdad0..4aab6d33 100644 --- a/crates/rullm-core/examples/basic_usage.rs +++ b/crates/rullm-core/examples/basic_usage.rs @@ -1,4 +1,4 @@ -use rullm_core::{ChatRequestBuilder, LlmError}; +use rullm_core::ChatRequestBuilder; // This example demonstrates the unified interface without actual provider implementations // It shows how the library would be used once provider modules are implemented @@ -33,15 +33,5 @@ async fn main() -> Result<(), Box> { println!("\nThis example shows the unified interface design."); println!("Actual provider implementations will be added in subsequent tasks."); - // Example of error handling - let error_example = LlmError::rate_limit( - "Too many requests", - Some(std::time::Duration::from_secs(60)), - ); - println!("\nError handling example:"); - println!(" Error: {error_example}"); - println!(" Is retryable: {}", error_example.is_retryable()); - println!(" Retry delay: {:?}", error_example.retry_delay()); - Ok(()) } diff --git a/crates/rullm-core/examples/gemini_stream.rs b/crates/rullm-core/examples/gemini_stream.rs index f465e8c4..b46a18d1 100644 --- a/crates/rullm-core/examples/gemini_stream.rs +++ b/crates/rullm-core/examples/gemini_stream.rs @@ -219,7 +219,7 @@ async fn main() -> Result<(), Box> { println!( "• Models: gemini-1.5-flash (fast), gemini-1.5-pro (balanced), gemini-2.0-flash-exp (experimental)" ); - println!("• Gemini excels at reasoning, code analysis, and creative tasks"); + println!("• Gemini supports reasoning, code analysis, and creative tasks"); println!("• Lower temperature (0.1-0.4) for factual/technical content"); println!("• Higher temperature (0.7-1.0) for creative content"); println!("• Use top_p for more controlled randomness"); diff --git a/crates/rullm-core/examples/middleware_usage.rs b/crates/rullm-core/examples/middleware_usage.rs index 2474434c..017d6c0f 100644 --- a/crates/rullm-core/examples/middleware_usage.rs +++ b/crates/rullm-core/examples/middleware_usage.rs @@ -1,6 +1,6 @@ use rullm_core::{ ChatRequestBuilder, ConfigBuilder, LlmServiceBuilder, MiddlewareConfig, OpenAIProvider, - RateLimit, config::RetryPolicy, + RateLimit, }; use std::time::Duration; @@ -14,16 +14,13 @@ async fn main() -> Result<(), Box> { // Example 1: Basic middleware stack with defaults basic_middleware_example().await?; - // Example 2: Custom retry policy with exponential backoff - 
custom_retry_example().await?; - - // Example 3: Production-ready configuration + // Example 2: Configuration with timeouts and rate limiting production_config_example().await?; - // Example 4: Rate-limited and monitored configuration + // Example 3: Rate-limited and monitored configuration rate_limited_example().await?; - // Example 5: Custom middleware configuration + // Example 4: Custom middleware configuration custom_middleware_config_example().await?; Ok(()) @@ -57,83 +54,40 @@ async fn basic_middleware_example() -> Result<(), Box> { Ok(()) } -/// Example 2: Custom retry policy with exponential backoff -async fn custom_retry_example() -> Result<(), Box> { - println!("šŸ”„ Example 2: Custom Retry Policy"); - - let config = ConfigBuilder::openai_from_env()?; - let provider = OpenAIProvider::new(config)?; - - // Create middleware with custom exponential backoff retry policy - let mut middleware_stack = LlmServiceBuilder::new() - .timeout(Duration::from_secs(60)) // 60 second timeout - .retry(RetryPolicy::ExponentialBackoff { - initial_delay_ms: 200, // Start with 200ms - max_delay_ms: 10000, // Cap at 10 seconds - multiplier: 2.5, // Aggressive backoff - jitter: true, // Add randomness - }) - .logging() - .build(provider, "gpt-3.5-turbo".to_string()); - - let request = ChatRequestBuilder::new() - .user("Explain quantum computing in simple terms") - .temperature(0.7) - .max_tokens(150) - .build(); - - let response = middleware_stack.call(request).await?; - - println!("āœ… Response: {}", response.message.content); - println!("šŸ”„ Retry policy: Exponential backoff with jitter\n"); - - Ok(()) -} - -/// Example 3: Production-ready configuration +/// Example 2: Configuration with timeouts and rate limiting async fn production_config_example() -> Result<(), Box> { - println!("šŸ­ Example 3: Production Configuration"); + println!("šŸ­ Example 2: Configuration with Timeouts and Rate Limiting"); let config = ConfigBuilder::openai_from_env()?; let provider = OpenAIProvider::new(config)?; - // Production-ready middleware configuration + // Middleware configuration with timeouts and rate limiting let mut middleware_stack = LlmServiceBuilder::new() .timeout(Duration::from_secs(30)) // Conservative timeout - .retry(RetryPolicy::ApiGuided { - fallback: Box::new(RetryPolicy::ExponentialBackoff { - initial_delay_ms: 100, - max_delay_ms: 5000, - multiplier: 2.0, - jitter: true, - }), - max_api_delay_ms: 30000, // Don't wait more than 30 seconds - retry_headers: vec!["retry-after".to_string(), "x-ratelimit-reset".to_string()], - }) .rate_limit(100, Duration::from_secs(60)) // 100 requests per minute - .logging() // Always log in production - .metrics() // Always collect metrics + .logging() + .metrics() .build(provider, "gpt-4".to_string()); let request = ChatRequestBuilder::new() .system("You are a helpful assistant for a production application.") .user("How can I optimize my database queries?") - .temperature(0.3) // More deterministic for production + .temperature(0.3) // Lower temperature for more deterministic output .max_tokens(300) .build(); let response = middleware_stack.call(request).await?; - println!("āœ… Production response received"); + println!("āœ… Response received"); println!("šŸ“Š Token usage: {}", response.usage.total_tokens); - println!("šŸ›”ļø Configuration: API-guided retry, rate limited, fully monitored\n"); + println!("šŸ›”ļø Configuration: Rate limited, logged and monitored\n"); Ok(()) } -/// Example 4: Rate-limited and monitored configuration +/// Example 3: Rate-limited and 
monitored configuration async fn rate_limited_example() -> Result<(), Box> { - println!("ā±ļø Example 4: Rate Limited Configuration"); + println!("ā±ļø Example 3: Rate Limited Configuration"); let config = ConfigBuilder::openai_from_env()?; let provider = OpenAIProvider::new(config)?; @@ -141,7 +95,6 @@ async fn rate_limited_example() -> Result<(), Box> { // Configuration optimized for rate limiting and monitoring let mut middleware_stack = LlmServiceBuilder::new() .timeout(Duration::from_secs(45)) - .retry(RetryPolicy::Fixed { delay_ms: 1000 }) // Simple fixed delay .rate_limit(50, Duration::from_secs(60)) // Conservative rate limit .logging() .metrics() @@ -184,9 +137,9 @@ async fn rate_limited_example() -> Result<(), Box> { Ok(()) } -/// Example 5: Custom middleware configuration from struct +/// Example 4: Custom middleware configuration from struct async fn custom_middleware_config_example() -> Result<(), Box> { - println!("āš™ļø Example 5: Custom Middleware Configuration"); + println!("āš™ļø Example 4: Custom Middleware Configuration"); let config = ConfigBuilder::openai_from_env()?; let provider = OpenAIProvider::new(config)?; @@ -194,12 +147,6 @@ async fn custom_middleware_config_example() -> Result<(), Box Result<(), Box Result<(), Box> { - println!("=== LLM Retry Policy Examples ===\n"); - - // Example 1: Simple fixed retry - let config = ConfigBuilder::openai_from_env()?.with_fixed_retry(3, 2000); // 3 retries, 2s delay - - println!("1. Fixed Retry Policy:"); - println!(" Max retries: {}", config.max_retries()); - println!(" Policy: {:?}\n", config.retry_policy()); - - // Example 2: Exponential backoff - let config = - ConfigBuilder::openai_from_env()?.with_exponential_backoff(5, 1000, 30000, 2.0, true); - - println!("2. Exponential Backoff:"); - println!(" Max retries: {}", config.max_retries()); - println!(" Policy: {:?}\n", config.retry_policy()); - - // Example 3: API-guided retry (respects rate limit headers) - let fallback = RetryPolicy::ExponentialBackoff { - initial_delay_ms: 1000, - max_delay_ms: 15000, - multiplier: 2.0, - jitter: true, - }; - - let config = ConfigBuilder::openai_from_env()?.with_api_guided_retry(3, fallback, 60000); // Max 60s from API - - println!("3. API-Guided Retry (Smart):"); - println!(" Max retries: {}", config.max_retries()); - println!(" Policy: {:?}\n", config.retry_policy()); - - // Example 4: Smart retry (default - API-guided with good fallback) - let config = ConfigBuilder::openai_from_env()?.with_smart_retry(5); - - println!("4. Smart Retry (Default):"); - println!(" Max retries: {}", config.max_retries()); - println!(" Policy: {:?}\n", config.retry_policy()); - - println!("=== How API-Guided Retry Works ==="); - println!("When a 429 (rate limit) response is received:"); - println!("1. Check for 'Retry-After' header -> use that delay"); - println!("2. Check for 'X-RateLimit-Reset' header -> calculate delay"); - println!("3. If no headers found -> use fallback policy"); - println!("4. 
Respect max_api_delay_ms limit (prevents infinite waits)"); - - Ok(()) -} diff --git a/crates/rullm-core/src/config.rs b/crates/rullm-core/src/config.rs index f2dcd6cd..be8a3488 100644 --- a/crates/rullm-core/src/config.rs +++ b/crates/rullm-core/src/config.rs @@ -1,49 +1,6 @@ use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; - -/// Retry policy configuration -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum RetryPolicy { - /// Fixed delay between retries - Fixed { delay_ms: u64 }, - /// Exponential backoff with optional jitter - ExponentialBackoff { - initial_delay_ms: u64, - max_delay_ms: u64, - multiplier: f64, - jitter: bool, - }, - /// Respect API-provided retry timing from response headers, with fallback policy - ApiGuided { - /// Fallback policy when no API guidance is available - fallback: Box, - /// Maximum time to wait based on API guidance (prevents indefinite waits) - max_api_delay_ms: u64, - /// Headers to check for retry timing (in order of preference) - retry_headers: Vec, - }, -} - -impl Default for RetryPolicy { - fn default() -> Self { - RetryPolicy::ApiGuided { - fallback: Box::new(RetryPolicy::ExponentialBackoff { - initial_delay_ms: 1000, - max_delay_ms: 30000, - multiplier: 2.0, - jitter: true, - }), - max_api_delay_ms: 60000, // Max 1 minute wait from API guidance - retry_headers: vec![ - "retry-after".to_string(), - "x-ratelimit-reset".to_string(), - "x-ratelimit-reset-after".to_string(), - "reset-time".to_string(), - ], - } - } -} +use std::time::Duration; /// Configuration trait for LLM providers pub trait ProviderConfig: Send + Sync { @@ -59,12 +16,6 @@ pub trait ProviderConfig: Send + Sync { /// Get any additional headers required by the provider fn headers(&self) -> HashMap; - /// Get maximum number of retry attempts - fn max_retries(&self) -> u32; - - /// Get retry policy configuration - fn retry_policy(&self) -> RetryPolicy; - /// Validate the configuration fn validate(&self) -> Result<(), crate::error::LlmError>; } @@ -76,8 +27,6 @@ pub struct HttpProviderConfig { pub base_url: String, pub timeout_seconds: u64, pub headers: HashMap, - pub max_retries: u32, - pub retry_delay_ms: u64, } impl HttpProviderConfig { @@ -87,8 +36,6 @@ impl HttpProviderConfig { base_url: base_url.into(), timeout_seconds: 30, headers: HashMap::new(), - max_retries: 3, - retry_delay_ms: 1000, } } @@ -101,12 +48,6 @@ impl HttpProviderConfig { self.headers.insert(key.into(), value.into()); self } - - pub fn with_retries(mut self, max_retries: u32, delay_ms: u64) -> Self { - self.max_retries = max_retries; - self.retry_delay_ms = delay_ms; - self - } } impl ProviderConfig for HttpProviderConfig { @@ -126,16 +67,6 @@ impl ProviderConfig for HttpProviderConfig { self.headers.clone() } - fn max_retries(&self) -> u32 { - self.max_retries - } - - fn retry_policy(&self) -> RetryPolicy { - RetryPolicy::Fixed { - delay_ms: self.retry_delay_ms, - } - } - fn validate(&self) -> Result<(), crate::error::LlmError> { if self.api_key.is_empty() { return Err(crate::error::LlmError::configuration("API key is required")); @@ -165,8 +96,6 @@ pub struct OpenAICompatibleConfig { pub project: Option, pub base_url: Option, pub timeout_seconds: u64, - pub max_retries: u32, - pub retry_policy: RetryPolicy, } /// Type alias for backwards compatibility @@ -180,8 +109,6 @@ impl OpenAICompatibleConfig { project: None, base_url: None, timeout_seconds: 30, - max_retries: 3, - retry_policy: RetryPolicy::default(), } } @@ -192,8 +119,6 @@ 
impl OpenAICompatibleConfig { project: None, base_url: Some("https://api.groq.com/openai/v1".to_string()), timeout_seconds: 30, - max_retries: 3, - retry_policy: RetryPolicy::default(), } } @@ -204,8 +129,6 @@ impl OpenAICompatibleConfig { project: None, base_url: Some("https://openrouter.ai/api/v1".to_string()), timeout_seconds: 30, - max_retries: 3, - retry_policy: RetryPolicy::default(), } } @@ -223,62 +146,6 @@ impl OpenAICompatibleConfig { self.base_url = Some(base_url.into()); self } - - pub fn with_retry_policy(mut self, retry_policy: RetryPolicy) -> Self { - self.retry_policy = retry_policy; - self - } - - pub fn with_fixed_retry(mut self, max_retries: u32, delay_ms: u64) -> Self { - self.max_retries = max_retries; - self.retry_policy = RetryPolicy::Fixed { delay_ms }; - self - } - - pub fn with_exponential_backoff( - mut self, - max_retries: u32, - initial_delay_ms: u64, - max_delay_ms: u64, - multiplier: f64, - jitter: bool, - ) -> Self { - self.max_retries = max_retries; - self.retry_policy = RetryPolicy::ExponentialBackoff { - initial_delay_ms, - max_delay_ms, - multiplier, - jitter, - }; - self - } - - pub fn with_api_guided_retry( - mut self, - max_retries: u32, - fallback_policy: RetryPolicy, - max_api_delay_ms: u64, - ) -> Self { - self.max_retries = max_retries; - self.retry_policy = RetryPolicy::ApiGuided { - fallback: Box::new(fallback_policy), - max_api_delay_ms, - retry_headers: vec![ - "retry-after".to_string(), - "x-ratelimit-reset".to_string(), - "x-ratelimit-reset-after".to_string(), - "reset-time".to_string(), - ], - }; - self - } - - pub fn with_smart_retry(mut self, max_retries: u32) -> Self { - // Smart retry: API-guided with exponential backoff fallback - self.max_retries = max_retries; - self.retry_policy = RetryPolicy::default(); - self - } } impl ProviderConfig for OpenAICompatibleConfig { @@ -315,14 +182,6 @@ impl ProviderConfig for OpenAICompatibleConfig { headers } - fn max_retries(&self) -> u32 { - self.max_retries - } - - fn retry_policy(&self) -> RetryPolicy { - self.retry_policy.clone() - } - fn validate(&self) -> Result<(), crate::error::LlmError> { if self.api_key.is_empty() { return Err(crate::error::LlmError::configuration("API key is required")); @@ -341,8 +200,6 @@ pub struct AnthropicConfig { pub api_key: String, pub base_url: Option, pub timeout_seconds: u64, - pub max_retries: u32, - pub retry_policy: RetryPolicy, } impl AnthropicConfig { @@ -351,8 +208,6 @@ impl AnthropicConfig { api_key: api_key.into(), base_url: None, timeout_seconds: 30, - max_retries: 3, - retry_policy: RetryPolicy::default(), } } @@ -385,14 +240,6 @@ impl ProviderConfig for AnthropicConfig { headers } - fn max_retries(&self) -> u32 { - self.max_retries - } - - fn retry_policy(&self) -> RetryPolicy { - self.retry_policy.clone() - } - fn validate(&self) -> Result<(), crate::error::LlmError> { if self.api_key.is_empty() { return Err(crate::error::LlmError::configuration( @@ -410,8 +257,6 @@ pub struct GoogleAiConfig { pub api_key: String, pub base_url: Option, pub timeout_seconds: u64, - pub max_retries: u32, - pub retry_policy: RetryPolicy, } impl GoogleAiConfig { @@ -420,8 +265,6 @@ impl GoogleAiConfig { api_key: api_key.into(), base_url: None, timeout_seconds: 30, - max_retries: 3, - retry_policy: RetryPolicy::default(), } } @@ -452,14 +295,6 @@ impl ProviderConfig for GoogleAiConfig { headers } - fn max_retries(&self) -> u32 { - self.max_retries - } - - fn retry_policy(&self) -> RetryPolicy { - self.retry_policy.clone() - } - fn validate(&self) -> Result<(), 
crate::error::LlmError> { if self.api_key.is_empty() { return Err(crate::error::LlmError::configuration( @@ -563,72 +398,3 @@ impl ConfigBuilder { Ok(config) } } - -/// Utility functions for parsing retry timing from HTTP headers -pub mod retry_parsing { - use super::*; - use std::str::FromStr; - - /// Parse retry delay from HTTP response headers - pub fn parse_retry_delay( - headers: &HashMap, - retry_headers: &[String], - max_delay_ms: u64, - ) -> Option { - for header_name in retry_headers { - if let Some(value) = headers.get(header_name) { - if let Some(delay) = parse_single_header(header_name, value, max_delay_ms) { - return Some(delay); - } - } - } - None - } - - fn parse_single_header(header_name: &str, value: &str, max_delay_ms: u64) -> Option { - match header_name.to_lowercase().as_str() { - "retry-after" => parse_retry_after(value, max_delay_ms), - "x-ratelimit-reset" => parse_reset_timestamp(value, max_delay_ms), - "x-ratelimit-reset-after" => parse_seconds(value, max_delay_ms), - "reset-time" => parse_reset_timestamp(value, max_delay_ms), - _ => None, - } - } - - fn parse_retry_after(value: &str, max_delay_ms: u64) -> Option { - // Retry-After can be either seconds or HTTP date - if let Ok(seconds) = u64::from_str(value.trim()) { - let delay_ms = seconds * 1000; - if delay_ms <= max_delay_ms { - return Some(Duration::from_millis(delay_ms)); - } - } - // TODO: Add HTTP date parsing if needed - None - } - - fn parse_reset_timestamp(value: &str, max_delay_ms: u64) -> Option { - if let Ok(timestamp) = u64::from_str(value.trim()) { - let now = SystemTime::now().duration_since(UNIX_EPOCH).ok()?.as_secs(); - - if timestamp > now { - let delay_seconds = timestamp - now; - let delay_ms = delay_seconds * 1000; - if delay_ms <= max_delay_ms { - return Some(Duration::from_millis(delay_ms)); - } - } - } - None - } - - fn parse_seconds(value: &str, max_delay_ms: u64) -> Option { - if let Ok(seconds) = u64::from_str(value.trim()) { - let delay_ms = seconds * 1000; - if delay_ms <= max_delay_ms { - return Some(Duration::from_millis(delay_ms)); - } - } - None - } -} diff --git a/crates/rullm-core/src/error.rs b/crates/rullm-core/src/error.rs index e58af07a..702299cd 100644 --- a/crates/rullm-core/src/error.rs +++ b/crates/rullm-core/src/error.rs @@ -195,34 +195,6 @@ impl LlmError { source: Some(source.into()), } } - - /// Check if the error is retryable - pub fn is_retryable(&self) -> bool { - match self { - LlmError::Network { .. } => true, - LlmError::RateLimit { .. } => true, - LlmError::Timeout { .. } => true, - LlmError::ServiceUnavailable { .. } => true, - LlmError::Api { code, .. } => { - // Some API errors are retryable (e.g., 500, 502, 503, 504) - code.as_ref() - .map(|c| matches!(c.as_str(), "500" | "502" | "503" | "504")) - .unwrap_or(false) - } - _ => false, - } - } - - /// Get retry delay for retryable errors - pub fn retry_delay(&self) -> Option { - match self { - LlmError::RateLimit { retry_after, .. } => *retry_after, - LlmError::Network { .. } => Some(std::time::Duration::from_secs(1)), - LlmError::Timeout { .. } => Some(std::time::Duration::from_millis(500)), - LlmError::ServiceUnavailable { .. } => Some(std::time::Duration::from_secs(5)), - _ => None, - } - } } /// Convert from reqwest errors diff --git a/crates/rullm-core/src/lib.rs b/crates/rullm-core/src/lib.rs index 4775f34a..fca569cf 100644 --- a/crates/rullm-core/src/lib.rs +++ b/crates/rullm-core/src/lib.rs @@ -1,18 +1,17 @@ //! # rullm-core - Rust LLM Library //! -//! 
A high-performance Rust library for interacting with Large Language Models (LLMs). -//! Built with Tower middleware for enterprise-grade reliability, featuring retry logic, -//! rate limiting, circuit breakers, and comprehensive error handling. +//! A Rust library for interacting with Large Language Models (LLMs). +//! Built with Tower middleware, featuring rate limiting and error handling. //! //! ## Features //! -//! - **Multiple LLM Providers** - OpenAI, Anthropic, Google AI -//! - **High Performance** - Built on Tower with connection pooling and async/await -//! - **Enterprise Ready** - Retry logic, rate limiting, circuit breakers, timeouts -//! - **Dual APIs** - Simple string-based API + advanced API with full control -//! - **Real-time Streaming** - Stream responses token by token as they're generated -//! - **Well Tested** - Comprehensive test suite with examples -//! - **Observability** - Built-in metrics, logging, and error handling +//! - Multiple LLM Providers (OpenAI, Anthropic, Google AI) +//! - Tower middleware with connection pooling and async/await +//! - Rate limiting, timeouts, and error handling +//! - Dual APIs: Simple string-based API and advanced API with full control +//! - Streaming support for token-by-token responses +//! - Test suite with examples +//! - Metrics, logging, and error handling //! //! ## Quick Start //! @@ -33,19 +32,19 @@ //! ### Advanced API (Full Control) //! //! ```rust,no_run -//! use rullm_core::{OpenAIConfig, OpenAIProvider, ChatProvider, ChatRequestBuilder, ChatRole}; +//! use rullm_core::{OpenAIConfig, OpenAIProvider, ChatCompletion, ChatRequestBuilder, ChatRole}; //! //! #[tokio::main] //! async fn main() -> Result<(), Box> { //! let config = OpenAIConfig::new("your-api-key"); //! let provider = OpenAIProvider::new(config)?; -//! +//! //! let request = ChatRequestBuilder::new() //! .user("Hello, world!") //! .temperature(0.7) //! .max_tokens(100) //! .build(); -//! +//! //! let response = provider.chat_completion(request, "gpt-3.5-turbo").await?; //! println!("Response: {}", response.message.content); //! Ok(()) @@ -54,7 +53,7 @@ //! //! ## Streaming API Overview //! -//! The streaming API enables real-time token-by-token responses, perfect for interactive +//! The streaming API enables real-time token-by-token responses for interactive //! chat applications and live user experiences. //! //! ### Core Streaming Types @@ -66,13 +65,14 @@ //! ### Basic Streaming Usage //! //! ```rust,no_run -//! use rullm_core::{OpenAIProvider, ChatProvider, ChatRequestBuilder, ChatStreamEvent}; +//! use rullm_core::{OpenAIProvider, OpenAIConfig, ChatCompletion, ChatRequestBuilder, ChatStreamEvent}; //! use futures::StreamExt; //! //! #[tokio::main] //! async fn main() -> Result<(), Box> { -//! let provider = OpenAIProvider::new(/* config */)?; -//! +//! let config = OpenAIConfig::new("your-api-key"); +//! let provider = OpenAIProvider::new(config)?; +//! //! let request = ChatRequestBuilder::new() //! .user("Tell me a story") //! .stream(true) // Enable streaming @@ -104,7 +104,7 @@ //! //! ### Streaming Examples //! -//! The library includes comprehensive streaming examples for each provider: +//! The library includes streaming examples for each provider: //! //! - `openai_stream.rs` - OpenAI GPT models streaming //! - `anthropic_stream.rs` - Anthropic Claude models streaming @@ -127,19 +127,25 @@ //! //! ## Error Handling //! -//! All operations return [`Result`](LlmError) for comprehensive error handling: +//! 
All operations return [`Result`](LlmError) for error handling: //! -//! ```rust,no_run -//! use rullm_core::{LlmError, OpenAIProvider, ChatProvider}; +//! ```rust,ignore +//! use rullm_core::{LlmError, OpenAIProvider, OpenAIConfig, ChatCompletion, ChatRequestBuilder}; //! +//! # async fn example() -> Result<(), Box> { +//! # let config = OpenAIConfig::new("your-api-key"); +//! # let provider = OpenAIProvider::new(config)?; +//! # let request = ChatRequestBuilder::new().user("test").build(); //! match provider.chat_completion(request, "gpt-4").await { //! Ok(response) => println!("Success: {}", response.message.content), -//! Err(LlmError::Authentication(_)) => println!("Invalid API key"), -//! Err(LlmError::RateLimit { retry_after }) => { +//! Err(LlmError::Authentication { .. }) => println!("Invalid API key"), +//! Err(LlmError::RateLimit { retry_after, .. }) => { //! println!("Rate limited, retry after: {:?}", retry_after); //! } //! Err(e) => println!("Other error: {}", e), //! } +//! # Ok(()) +//! # } //! ``` pub mod config; diff --git a/crates/rullm-core/src/middleware.rs b/crates/rullm-core/src/middleware.rs index 6e2dc58f..941174c8 100644 --- a/crates/rullm-core/src/middleware.rs +++ b/crates/rullm-core/src/middleware.rs @@ -1,4 +1,3 @@ -use crate::config::RetryPolicy; use crate::error::LlmError; use crate::types::{ ChatCompletion, ChatRequest, ChatResponse, ChatStreamEvent, StreamConfig, StreamResult, @@ -9,16 +8,6 @@ use metrics::{counter, histogram}; use std::pin::Pin; use std::time::{Duration, Instant}; -/// Retry attempt information for logging and diagnostics -#[derive(Debug, Clone)] -pub struct RetryInfo { - pub attempt: u32, - pub max_retries: u32, - pub delay: Duration, - pub reason: String, - pub response_status: Option, -} - /// Streaming metrics collector #[derive(Debug, Clone)] pub struct StreamingMetrics { @@ -169,33 +158,10 @@ where } } -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_retry_info_creation() { - let retry_info = RetryInfo { - attempt: 2, - max_retries: 3, - delay: Duration::from_millis(200), - reason: "HTTP 500".to_string(), - response_status: Some(500), - }; - - assert_eq!(retry_info.attempt, 2); - assert_eq!(retry_info.max_retries, 3); - assert_eq!(retry_info.delay, Duration::from_millis(200)); - assert_eq!(retry_info.reason, "HTTP 500"); - assert_eq!(retry_info.response_status, Some(500)); - } -} - /// Middleware configuration for LLM providers #[derive(Debug, Clone)] pub struct MiddlewareConfig { pub timeout: Option, - pub retry_policy: Option, pub rate_limit: Option, pub enable_logging: bool, pub enable_metrics: bool, @@ -205,12 +171,6 @@ impl Default for MiddlewareConfig { fn default() -> Self { Self { timeout: Some(Duration::from_secs(30)), - retry_policy: Some(RetryPolicy::ExponentialBackoff { - initial_delay_ms: 100, - max_delay_ms: 5000, - multiplier: 2.0, - jitter: false, - }), rate_limit: None, enable_logging: true, enable_metrics: false, @@ -251,11 +211,6 @@ impl LlmServiceBuilder { self } - pub fn retry(mut self, policy: RetryPolicy) -> Self { - self.middleware_config.retry_policy = Some(policy); - self - } - pub fn rate_limit(mut self, requests_per_period: u64, period: Duration) -> Self { let rate_limit = RateLimit { requests_per_period, @@ -309,12 +264,8 @@ where ); } - // Apply retry logic - let result = if let Some(ref policy) = self.config.retry_policy { - self.call_with_retry(request, policy).await - } else { - self.provider.chat_completion(request, &self.model).await - }; + // Make the request + let result = 
self.provider.chat_completion(request, &self.model).await; // Apply metrics and logging match &result { @@ -351,68 +302,7 @@ where result } - async fn call_with_retry( - &self, - request: ChatRequest, - policy: &RetryPolicy, - ) -> Result { - let max_retries = 3; // Could be configurable - let mut attempt = 0; - - loop { - match self - .provider - .chat_completion(request.clone(), &self.model) - .await - { - Ok(response) => return Ok(response), - Err(error) => { - if attempt >= max_retries || !error.is_retryable() { - return Err(error); - } - - let delay = match policy { - RetryPolicy::Fixed { delay_ms } => Duration::from_millis(*delay_ms), - RetryPolicy::ExponentialBackoff { - initial_delay_ms, - max_delay_ms, - multiplier, - jitter: _, - } => { - let delay_ms = - (*initial_delay_ms as f64 * multiplier.powi(attempt)) as u64; - let delay_ms = std::cmp::min(delay_ms, *max_delay_ms); - Duration::from_millis(delay_ms) - } - RetryPolicy::ApiGuided { fallback, .. } => { - // For simplicity, use fallback policy - match fallback.as_ref() { - RetryPolicy::Fixed { delay_ms } => Duration::from_millis(*delay_ms), - RetryPolicy::ExponentialBackoff { - initial_delay_ms, - max_delay_ms, - multiplier, - jitter: _, - } => { - let delay_ms = (*initial_delay_ms as f64 - * multiplier.powi(attempt)) - as u64; - let delay_ms = std::cmp::min(delay_ms, *max_delay_ms); - Duration::from_millis(delay_ms) - } - RetryPolicy::ApiGuided { .. } => Duration::from_millis(1000), // Fallback - } - } - }; - - tokio::time::sleep(delay).await; - attempt += 1; - } - } - } - } - - /// Streaming version with retry logic only for initial call and metrics tracking + /// Streaming version with metrics tracking pub async fn call_stream( &self, request: ChatRequest, @@ -427,18 +317,11 @@ where ); } - // Apply retry logic only for the initial call - let stream = if let Some(ref _policy) = self.config.retry_policy { - // For streaming, we can't really retry on a Stream, so just call normally - // Retry logic would be complex for streams, so just use direct call for now - self.provider - .chat_completion_stream(request, &self.model, config) - .await - } else { - self.provider - .chat_completion_stream(request, &self.model, config) - .await - }; + // Make the streaming request + let stream = self + .provider + .chat_completion_stream(request, &self.model, config) + .await; let metrics_stream = MetricsStream::new(stream, provider_name); diff --git a/crates/rullm-core/src/providers/google.rs b/crates/rullm-core/src/providers/google.rs index 9c05db82..76c0e598 100644 --- a/crates/rullm-core/src/providers/google.rs +++ b/crates/rullm-core/src/providers/google.rs @@ -479,7 +479,7 @@ impl ChatCompletion for GoogleProvider { }; // Google now returns Server-Sent Events (SSE) for streaming responses. - // Leverage the shared `sse_lines` util to properly parse the stream. + // Use the shared `sse_lines` util to parse the stream. 
let byte_stream = response.bytes_stream(); let mut sse_stream = sse_lines(byte_stream); diff --git a/crates/rullm-core/src/simple.rs b/crates/rullm-core/src/simple.rs index 6eb84007..3f3d667d 100644 --- a/crates/rullm-core/src/simple.rs +++ b/crates/rullm-core/src/simple.rs @@ -29,8 +29,6 @@ pub struct SimpleLlmConfig { pub default_top_p: Option, /// Request timeout pub timeout: Duration, - /// Maximum number of retry attempts - pub max_retries: u32, /// Whether to validate inputs before sending requests pub validate_inputs: bool, } @@ -70,7 +68,6 @@ impl Default for SimpleLlmConfig { default_max_tokens: None, default_top_p: None, timeout: Duration::from_secs(30), - max_retries: 3, validate_inputs: true, } } @@ -136,12 +133,6 @@ impl SimpleLlmConfig { self } - /// Set the maximum retry attempts - pub fn with_max_retries(mut self, max_retries: u32) -> Self { - self.max_retries = max_retries; - self - } - /// Enable or disable input validation pub fn with_validation(mut self, validate: bool) -> Self { self.validate_inputs = validate; @@ -995,7 +986,6 @@ mod tests { fn test_simple_llm_config_default() { let config = SimpleLlmConfig::default(); assert_eq!(config.timeout, Duration::from_secs(30)); - assert_eq!(config.max_retries, 3); assert!(config.validate_inputs); assert_eq!(config.default_models.openai, "gpt-3.5-turbo"); } @@ -1007,7 +997,6 @@ mod tests { .with_max_tokens(2000) .with_top_p(0.9) .with_timeout(Duration::from_secs(60)) - .with_max_retries(5) .with_validation(false) .with_openai_model("gpt-4") .with_anthropic_model("claude-3-sonnet-20240229") @@ -1017,7 +1006,6 @@ mod tests { assert_eq!(config.default_max_tokens, Some(2000)); assert_eq!(config.default_top_p, Some(0.9)); assert_eq!(config.timeout, Duration::from_secs(60)); - assert_eq!(config.max_retries, 5); assert!(!config.validate_inputs); assert_eq!(config.default_models.openai, "gpt-4"); assert_eq!(config.default_models.anthropic, "claude-3-sonnet-20240229"); diff --git a/crates/rullm-core/src/tests.rs b/crates/rullm-core/src/tests.rs index 882b39a6..7bebc085 100644 --- a/crates/rullm-core/src/tests.rs +++ b/crates/rullm-core/src/tests.rs @@ -226,8 +226,8 @@ fn test_openai_config() { config.validate().unwrap(); - // Test invalid config - let invalid_config = OpenAIConfig::new("invalid-key"); + // Test invalid config (empty API key) + let invalid_config = OpenAIConfig::new(""); assert!(invalid_config.validate().is_err()); } @@ -264,37 +264,6 @@ fn test_google_ai_config() { config.validate().unwrap(); } -#[test] -fn test_error_retry_logic() { - let network_error = LlmError::network("Connection failed"); - assert!(network_error.is_retryable()); - assert_eq!( - network_error.retry_delay(), - Some(std::time::Duration::from_secs(1)) - ); - - let rate_limit_error = LlmError::rate_limit( - "Too many requests", - Some(std::time::Duration::from_secs(60)), - ); - assert!(rate_limit_error.is_retryable()); - assert_eq!( - rate_limit_error.retry_delay(), - Some(std::time::Duration::from_secs(60)) - ); - - let auth_error = LlmError::authentication("Invalid API key"); - assert!(!auth_error.is_retryable()); - assert_eq!(auth_error.retry_delay(), None); - - let api_error_retryable = LlmError::api("test", "Server error", Some("503".to_string()), None); - assert!(api_error_retryable.is_retryable()); - - let api_error_not_retryable = - LlmError::api("test", "Bad request", Some("400".to_string()), None); - assert!(!api_error_not_retryable.is_retryable()); -} - #[test] fn test_all_llm_error_variants() { use std::collections::HashMap; @@ -425,66 
+394,6 @@ fn test_error_conversions() { // so we test the error conversion logic indirectly through the provider implementations } -#[test] -fn test_retry_logic_comprehensive() { - // Test all retryable error types - let network_error = LlmError::network("Connection failed"); - assert!(network_error.is_retryable()); - assert_eq!( - network_error.retry_delay(), - Some(std::time::Duration::from_secs(1)) - ); - - let rate_limit_error = - LlmError::rate_limit("Rate limited", Some(std::time::Duration::from_secs(30))); - assert!(rate_limit_error.is_retryable()); - assert_eq!( - rate_limit_error.retry_delay(), - Some(std::time::Duration::from_secs(30)) - ); - - let timeout_error = LlmError::timeout(std::time::Duration::from_secs(10)); - assert!(timeout_error.is_retryable()); - assert_eq!( - timeout_error.retry_delay(), - Some(std::time::Duration::from_millis(500)) - ); - - let service_error = LlmError::service_unavailable("openai"); - assert!(service_error.is_retryable()); - assert_eq!( - service_error.retry_delay(), - Some(std::time::Duration::from_secs(5)) - ); - - // Test retryable API errors (5xx) - let server_error = LlmError::api("test", "Server error", Some("502".to_string()), None); - assert!(server_error.is_retryable()); - - let gateway_timeout = LlmError::api("test", "Gateway timeout", Some("504".to_string()), None); - assert!(gateway_timeout.is_retryable()); - - // Test non-retryable error types - let auth_error = LlmError::authentication("Invalid key"); - assert!(!auth_error.is_retryable()); - assert_eq!(auth_error.retry_delay(), None); - - let config_error = LlmError::configuration("Bad config"); - assert!(!config_error.is_retryable()); - assert_eq!(config_error.retry_delay(), None); - - let validation_error = LlmError::validation("Invalid input"); - assert!(!validation_error.is_retryable()); - assert_eq!(validation_error.retry_delay(), None); - - // Test non-retryable API errors (4xx) - let bad_request = LlmError::api("test", "Bad request", Some("400".to_string()), None); - assert!(!bad_request.is_retryable()); - - let unauthorized = LlmError::api("test", "Unauthorized", Some("401".to_string()), None); - assert!(!unauthorized.is_retryable()); -} - #[test] fn test_provider_specific_error_mapping() { use std::collections::HashMap; @@ -592,34 +501,6 @@ fn test_provider_specific_error_mapping() { &serde_json::Value::String("RATE_LIMIT_EXCEEDED".to_string()) ); } - - // Test retryable server errors (500, 502, 503, 504) - let internal_error = LlmError::api( - "any_provider", - "Internal server error", - Some("500".to_string()), - None, - ); - assert!(internal_error.is_retryable()); - - let bad_gateway = LlmError::api("any_provider", "Bad gateway", Some("502".to_string()), None); - assert!(bad_gateway.is_retryable()); - - let service_unavailable = LlmError::api( - "any_provider", - "Service unavailable", - Some("503".to_string()), - None, - ); - assert!(service_unavailable.is_retryable()); - - // Test non-retryable client errors (400, 401, etc.) 
- let bad_request = LlmError::api("any_provider", "Bad request", Some("400".to_string()), None); - assert!(!bad_request.is_retryable()); - - // Test client error (should not be retryable) - let client_error = LlmError::api("any_provider", "Bad request", Some("400".to_string()), None); - assert!(!client_error.is_retryable()); } #[test] @@ -953,110 +834,11 @@ fn test_anthropic_response_parsing_errors() { // Middleware Tests // ============================================================================= -#[derive(Clone)] -struct MockFailingProvider { - name: &'static str, - fail_count: std::sync::Arc>, - max_failures: u32, -} - -impl MockFailingProvider { - fn new(name: &'static str, max_failures: u32) -> Self { - Self { - name, - fail_count: std::sync::Arc::new(std::sync::Mutex::new(0)), - max_failures, - } - } - - fn failure_count(&self) -> u32 { - *self.fail_count.lock().unwrap() - } -} - -#[async_trait::async_trait] -impl LlmProvider for MockFailingProvider { - fn name(&self) -> &'static str { - self.name - } - - fn aliases(&self) -> &'static [&'static str] { - &[] - } - - fn default_base_url(&self) -> Option<&'static str> { - Some("") - } - - fn env_key(&self) -> &'static str { - "" - } - - async fn available_models(&self) -> Result, LlmError> { - Ok(vec!["test-model".to_string()]) - } - - async fn health_check(&self) -> Result<(), LlmError> { - Ok(()) - } -} - -#[async_trait::async_trait] -impl ChatCompletion for MockFailingProvider { - async fn chat_completion( - &self, - _request: ChatRequest, - model: &str, - ) -> Result { - let mut count = self.fail_count.lock().unwrap(); - if *count < self.max_failures { - *count += 1; - return Err(LlmError::api( - self.name, - "Simulated failure", - Some("500".to_string()), - None, - )); - } - - Ok(ChatResponse { - message: ChatMessage { - role: ChatRole::Assistant, - content: "Success after retries".to_string(), - }, - model: model.to_string(), - usage: TokenUsage { - prompt_tokens: 10, - completion_tokens: 5, - total_tokens: 15, - }, - finish_reason: Some("stop".to_string()), - provider_metadata: None, - }) - } - - async fn chat_completion_stream( - &self, - _request: ChatRequest, - _model: &str, - _config: Option, - ) -> crate::types::StreamResult { - Box::pin(futures::stream::once(async { - Err(LlmError::model("Streaming not implemented")) - })) - } - - async fn estimate_tokens(&self, text: &str, _model: &str) -> Result { - Ok(text.len() as u32 / 4) - } -} - #[test] fn test_middleware_config_default() { let config = MiddlewareConfig::default(); assert_eq!(config.timeout, Some(Duration::from_secs(30))); - assert!(config.retry_policy.is_some()); assert!(config.rate_limit.is_none()); assert!(config.enable_logging); assert!(!config.enable_metrics); @@ -1071,17 +853,12 @@ fn test_middleware_config_custom() { let config = MiddlewareConfig { timeout: Some(Duration::from_secs(45)), - retry_policy: Some(RetryPolicy::Fixed { delay_ms: 1000 }), rate_limit: Some(rate_limit.clone()), enable_logging: false, enable_metrics: true, }; assert_eq!(config.timeout, Some(Duration::from_secs(45))); - assert!(matches!( - config.retry_policy, - Some(RetryPolicy::Fixed { delay_ms: 1000 }) - )); assert!(config.rate_limit.is_some()); assert!(!config.enable_logging); assert!(config.enable_metrics); @@ -1108,7 +885,6 @@ fn test_llm_service_builder_fluent_api() { let middleware_stack = LlmServiceBuilder::new() .timeout(Duration::from_secs(60)) - .retry(RetryPolicy::Fixed { delay_ms: 500 }) .rate_limit(50, Duration::from_secs(30)) .logging() .metrics() @@ -1116,10 +892,6 @@ 
fn test_llm_service_builder_fluent_api() { let config = middleware_stack.config(); assert_eq!(config.timeout, Some(Duration::from_secs(60))); - assert!(matches!( - config.retry_policy, - Some(RetryPolicy::Fixed { delay_ms: 500 }) - )); assert!(config.rate_limit.is_some()); assert!(config.enable_logging); assert!(config.enable_metrics); @@ -1133,12 +905,6 @@ fn test_llm_service_builder_fluent_api() { fn test_llm_service_builder_with_config() { let custom_config = MiddlewareConfig { timeout: Some(Duration::from_secs(20)), - retry_policy: Some(RetryPolicy::ExponentialBackoff { - initial_delay_ms: 200, - max_delay_ms: 4000, - multiplier: 1.5, - jitter: true, - }), rate_limit: None, enable_logging: false, enable_metrics: true, @@ -1170,118 +936,6 @@ async fn test_middleware_stack_basic_call() { assert_eq!(response.usage.total_tokens, 15); } -#[tokio::test] -async fn test_middleware_retry_logic_success() { - // Provider that fails 2 times then succeeds - let provider = MockFailingProvider::new("test", 2); - let mut middleware_stack = LlmServiceBuilder::new() - .retry(RetryPolicy::Fixed { delay_ms: 10 }) // Fast retry for testing - .build(provider.clone(), "test-model".to_string()); - - let request = ChatRequestBuilder::new().user("Test retry logic").build(); - - let start = std::time::Instant::now(); - let response = middleware_stack.call(request).await.unwrap(); - let duration = start.elapsed(); - - assert_eq!(response.message.content, "Success after retries"); - assert_eq!(provider.failure_count(), 2); - // Should have taken at least 20ms (2 retries * 10ms delay) - assert!(duration >= Duration::from_millis(20)); -} - -#[tokio::test] -async fn test_middleware_retry_logic_failure() { - // Provider that always fails - let provider = MockFailingProvider::new("test", 10); // More failures than max retries - let mut middleware_stack = LlmServiceBuilder::new() - .retry(RetryPolicy::Fixed { delay_ms: 10 }) - .build(provider.clone(), "test-model".to_string()); - - let request = ChatRequestBuilder::new() - .user("Test retry exhaustion") - .build(); - - let result = middleware_stack.call(request).await; - - assert!(result.is_err()); - // Should have made initial attempt + 3 retries = 4 total attempts - assert_eq!(provider.failure_count(), 4); - - let error = result.unwrap_err(); - assert!(matches!(error, LlmError::Api { .. 
})); -} - -#[tokio::test] -async fn test_middleware_exponential_backoff() { - let provider = MockFailingProvider::new("test", 3); - let mut middleware_stack = LlmServiceBuilder::new() - .retry(RetryPolicy::ExponentialBackoff { - initial_delay_ms: 10, - max_delay_ms: 100, - multiplier: 2.0, - jitter: false, - }) - .build(provider.clone(), "test-model".to_string()); - - let request = ChatRequestBuilder::new() - .user("Test exponential backoff") - .build(); - - let start = std::time::Instant::now(); - let response = middleware_stack.call(request).await.unwrap(); - let duration = start.elapsed(); - - assert_eq!(response.message.content, "Success after retries"); - assert_eq!(provider.failure_count(), 3); - - // Should have delays of 10ms, 20ms, 40ms = 70ms minimum - assert!(duration >= Duration::from_millis(70)); -} - -#[tokio::test] -async fn test_middleware_api_guided_retry() { - let provider = MockFailingProvider::new("test", 2); - let mut middleware_stack = LlmServiceBuilder::new() - .retry(RetryPolicy::ApiGuided { - fallback: Box::new(RetryPolicy::Fixed { delay_ms: 20 }), - max_api_delay_ms: 1000, - retry_headers: vec!["retry-after".to_string()], - }) - .build(provider.clone(), "test-model".to_string()); - - let request = ChatRequestBuilder::new() - .user("Test API guided retry") - .build(); - - let response = middleware_stack.call(request).await.unwrap(); - - assert_eq!(response.message.content, "Success after retries"); - assert_eq!(provider.failure_count(), 2); -} - -#[tokio::test] -async fn test_middleware_no_retry_policy() { - let provider = MockFailingProvider::new("test", 1); - let middleware_stack = LlmServiceBuilder::new() - // No retry policy - .build(provider.clone(), "test-model".to_string()); - - // Remove retry policy - let mut config = middleware_stack.config().clone(); - config.retry_policy = None; - let mut middleware_stack = - LlmServiceBuilder::with_config(config).build(provider.clone(), "test-model".to_string()); - - let request = ChatRequestBuilder::new().user("Test no retry").build(); - - let result = middleware_stack.call(request).await; - - assert!(result.is_err()); - // Should fail immediately, no retries - assert_eq!(provider.failure_count(), 1); -} - #[tokio::test] async fn test_middleware_logging_and_metrics() { let provider = MockProvider::new("test"); @@ -1329,7 +983,6 @@ fn test_rate_limit_configuration() { async fn test_middleware_error_propagation() { let provider = MockProvider::new("test").with_failure(); let mut middleware_stack = LlmServiceBuilder::new() - .retry(RetryPolicy::Fixed { delay_ms: 1 }) // Fast retry .logging() .build(provider, "test-model".to_string()); @@ -1346,12 +999,6 @@ async fn test_middleware_error_propagation() { fn test_middleware_config_inspection() { let custom_config = MiddlewareConfig { timeout: Some(Duration::from_secs(25)), - retry_policy: Some(RetryPolicy::ExponentialBackoff { - initial_delay_ms: 150, - max_delay_ms: 6000, - multiplier: 2.5, - jitter: true, - }), rate_limit: Some(RateLimit { requests_per_period: 75, period: Duration::from_secs(45), @@ -1371,21 +1018,6 @@ fn test_middleware_config_inspection() { assert!(config.enable_logging); assert!(!config.enable_metrics); - if let Some(RetryPolicy::ExponentialBackoff { - initial_delay_ms, - max_delay_ms, - multiplier, - jitter, - }) = &config.retry_policy - { - assert_eq!(*initial_delay_ms, 150); - assert_eq!(*max_delay_ms, 6000); - assert_eq!(*multiplier, 2.5); - assert!(*jitter); - } else { - panic!("Expected ExponentialBackoff retry policy"); - } - if let 
Some(rate_limit) = &config.rate_limit { assert_eq!(rate_limit.requests_per_period, 75); assert_eq!(rate_limit.period, Duration::from_secs(45));