# Module 1: LLM Client

A robust, reusable interface to Ollama that all other modules will use.

**Features:**
- Clean API for chat and completion
- Retry logic with exponential backoff
- Timeout handling
- Multiple model support
- Streaming for long responses
- Usage tracking

**Why build this?** Every other module needs LLM access. A solid foundation here prevents bugs everywhere else.

## Implementation

In [1]:
import ollama
import time
from dataclasses import dataclass, field
from typing import Optional, Generator
from enum import Enum


class Model(Enum):
    """Ollama models this client knows how to address.

    Each member's value is the model name string handed to Ollama.
    """

    # General-purpose default.
    LLAMA3 = "llama3"
    # Smaller and faster alternative.
    GEMMA3 = "gemma3"

    # Extend this list as further models are installed, for example:
    #   CODELLAMA = "codellama"   (code-focused)
    #   MISTRAL = "mistral"       (fast, good quality)


@dataclass
class LLMResponse:
    """A single model reply together with the metadata recorded for it.

    ``str(response)`` yields only the reply text, so an instance can be
    printed or interpolated directly.
    """

    # Text produced by the model.
    content: str
    # Name of the model that produced the reply.
    model: str
    # Duration of the request in seconds, as measured by the caller.
    elapsed_time: float
    # Token counts; None when the caller did not supply them.
    prompt_tokens: Optional[int] = None
    completion_tokens: Optional[int] = None

    def __str__(self) -> str:
        """Return the reply text."""
        return self.content


@dataclass
class UsageStats:
    """Running totals accumulated over a series of LLM calls."""

    # Number of responses passed to record().
    total_calls: int = 0
    # Sum of the recorded responses' elapsed_time values.
    total_time: float = 0.0
    # Token totals; responses without a count contribute nothing.
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0
    # Number of times record_error() was called.
    errors: int = 0

    def record(self, response: LLMResponse):
        """Fold one successful response into the totals."""
        self.total_calls += 1
        self.total_time += response.elapsed_time
        # A missing (None) or zero count adds nothing to the total.
        self.total_prompt_tokens += response.prompt_tokens or 0
        self.total_completion_tokens += response.completion_tokens or 0

    def record_error(self):
        """Count one failed attempt."""
        self.errors += 1

    def summary(self) -> str:
        """Return a one-line, human-readable digest of the totals."""
        # Guard the division so a fresh instance reports an average of 0.
        if self.total_calls > 0:
            avg_time = self.total_time / self.total_calls
        else:
            avg_time = 0

        fields = [
            f"Calls: {self.total_calls}",
            f"Errors: {self.errors}",
            f"Total time: {self.total_time:.1f}s",
            f"Avg time: {avg_time:.2f}s",
        ]
        return " | ".join(fields)


class LLMClient:
    """Robust client for Ollama with retry logic and usage tracking.

    Every request goes through one ``ollama.Client`` created with the
    configured timeout. Successful calls and failed attempts are tallied in
    ``self.stats``.

    Args:
        default_model: Model used when a call does not name one.
        max_retries: Total number of attempts ``chat``/``complete`` make
            before giving up.
        timeout: Request timeout in seconds, handed to the underlying
            Ollama client when it is created.
        default_temperature: Sampling temperature used when a call does not
            supply one.
    """

    def __init__(
        self,
        default_model: Model = Model.LLAMA3,
        max_retries: int = 3,
        timeout: float = 120.0,
        default_temperature: float = 0.7,
    ):
        self.default_model = default_model
        self.max_retries = max_retries
        self.timeout = timeout
        self.default_temperature = default_temperature
        self.stats = UsageStats()
        # The module-level ollama.chat()/ollama.list() helpers go through the
        # library's default client, so `timeout` was previously never applied.
        # A dedicated client makes the setting take effect. The timeout is
        # fixed at construction; changing self.timeout later has no effect.
        self._client = ollama.Client(timeout=timeout)

    @staticmethod
    def _build_messages(prompt: str, system: Optional[str]) -> list:
        """Build the message list: optional system message, then the user prompt."""
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        return messages

    def chat(
        self,
        prompt: str,
        system: Optional[str] = None,
        model: Optional[Model] = None,
        temperature: Optional[float] = None,
    ) -> LLMResponse:
        """Send a chat message and return the complete response.

        Args:
            prompt: The user message.
            system: Optional system message placed before the prompt.
            model: Model to use; falls back to ``default_model``.
            temperature: Sampling temperature; falls back to
                ``default_temperature`` when None (0.0 is honoured).

        Returns:
            The reply wrapped in an ``LLMResponse``.

        Raises:
            Exception: When every attempt failed.
        """
        model = model or self.default_model
        if temperature is None:
            temperature = self.default_temperature

        messages = self._build_messages(prompt, system)
        return self._call_with_retry(model, messages, temperature)

    def complete(
        self,
        prompt: str,
        model: Optional[Model] = None,
        temperature: Optional[float] = None,
    ) -> LLMResponse:
        """Simple completion: ``chat`` without a system message."""
        return self.chat(prompt, model=model, temperature=temperature)

    def stream(
        self,
        prompt: str,
        system: Optional[str] = None,
        model: Optional[Model] = None,
        temperature: Optional[float] = None,
    ) -> Generator[str, None, None]:
        """Stream response chunks as they arrive.

        Streaming is not retried. A failure is counted in ``self.stats`` and
        re-raised. The call is recorded in ``self.stats`` only once the
        generator has been consumed to the end.

        Args:
            prompt: The user message.
            system: Optional system message placed before the prompt.
            model: Model to use; falls back to ``default_model``.
            temperature: Sampling temperature; falls back to
                ``default_temperature`` when None, matching ``chat``.

        Yields:
            Each piece of response text as it is received.
        """
        model = model or self.default_model
        if temperature is None:
            temperature = self.default_temperature

        messages = self._build_messages(prompt, system)

        start = time.perf_counter()
        pieces = []
        prompt_tokens = None
        completion_tokens = None
        try:
            chunks = self._client.chat(
                model=model.value,
                messages=messages,
                options={"temperature": temperature},
                stream=True,
            )
            for chunk in chunks:
                content = chunk["message"]["content"]
                pieces.append(content)
                # Token counts are expected only on the closing chunk; keep
                # the last non-empty value seen so they reach the stats.
                prompt_tokens = chunk.get("prompt_eval_count") or prompt_tokens
                completion_tokens = chunk.get("eval_count") or completion_tokens
                yield content
        except Exception:
            self.stats.record_error()
            raise

        # Reached only when the stream was consumed to completion.
        self.stats.record(
            LLMResponse(
                content="".join(pieces),
                model=model.value,
                elapsed_time=time.perf_counter() - start,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
            )
        )

    def _call_with_retry(
        self,
        model: Model,
        messages: list,
        temperature: float,
    ) -> LLMResponse:
        """Run a non-streaming chat call, retrying with exponential backoff.

        Every failed attempt is counted in ``self.stats``. After the last
        attempt fails, an ``Exception`` is raised with the final error
        attached as its cause.
        """
        last_error = None

        for attempt in range(self.max_retries):
            try:
                # Monotonic clock: elapsed time is immune to system clock changes.
                start = time.perf_counter()

                response = self._client.chat(
                    model=model.value,
                    messages=messages,
                    options={"temperature": temperature},
                )

                elapsed = time.perf_counter() - start

                result = LLMResponse(
                    content=response["message"]["content"],
                    model=model.value,
                    elapsed_time=elapsed,
                    prompt_tokens=response.get("prompt_eval_count"),
                    completion_tokens=response.get("eval_count"),
                )

                self.stats.record(result)
                return result

            except Exception as e:
                last_error = e
                self.stats.record_error()

                # No sleep after the final attempt; fall through to the raise.
                if attempt < self.max_retries - 1:
                    # Exponential backoff: 1s, 2s, 4s...
                    wait_time = 2 ** attempt
                    print(f"⚠️ Attempt {attempt + 1} failed: {e}. Retrying in {wait_time}s...")
                    time.sleep(wait_time)

        # Chain the original error so its traceback stays available.
        raise Exception(
            f"All {self.max_retries} attempts failed. Last error: {last_error}"
        ) from last_error

    def list_models(self) -> list[str]:
        """Return the names of the models Ollama reports as available."""
        response = self._client.list()
        return [m.model for m in response.models]

    def get_stats(self) -> str:
        """Return the one-line usage summary from ``self.stats``."""
        return self.stats.summary()


# Confirmation message: shows that this cell ran and the definitions above exist.
print("✅ LLMClient class defined")

✅ LLMClient class defined


## Test: Basic Usage

In [2]:
# Build the client used by the rest of the notebook.
llm = LLMClient(default_model=Model.LLAMA3)

# Ask Ollama which models are available and show them.
available_models = llm.list_models()
print("Available models:", available_models)

Available models: ['llama3:latest', 'gemma3:latest']


In [4]:
# One-shot question with no system prompt.
response = llm.chat("What is the capital of France? Answer in one word.")

# Show the reply with the model name and elapsed time, one item per line.
print(
    f"Response: {response.content}",
    f"Model: {response.model}",
    f"Time: {response.elapsed_time:.2f}s",
    sep="\n",
)

Response: Paris.
Model: llama3
Time: 0.37s


In [5]:
# Same call as before, but steered by a system prompt.
teacher_persona = "You are a patient teacher. Explain concepts simply using analogies. Keep responses under 100 words."
response = llm.chat(prompt="Explain recursion", system=teacher_persona)

print(response.content)

Recursion is like following a treasure map that leads you to the next clue, which looks exactly like the original map! You keep following the map until you reach the final treasure.

Think of it this way: you call a function (the "map") that does something, and then within that function, you call another instance of the same function (another "map"). This process repeats itself until you find the solution.


## Test: Temperature Control

In [6]:
prompt = "Invent a name for a coffee shop."

# Each entry pairs the heading to print with the temperature to sample at.
temperature_runs = [
    ("Low temperature (0.2) - More deterministic:", 0.2),
    ("\nHigh temperature (1.2) - More creative:", 1.2),
]

for heading, temp in temperature_runs:
    print(heading)
    # Three samples per setting, numbered from 1.
    for number in range(1, 4):
        reply = llm.chat(prompt, temperature=temp)
        print(f"  {number}. {reply.content.strip()}")

Low temperature (0.2) - More deterministic:
  1. What a delightful task!

After some creative brainstorming, I'd like to introduce... 

**Brewed Awakening**

I hope you like it! The name plays on the idea of coffee as a morning pick-me-up (awakening) and incorporates "brewed" to highlight the shop's focus on expertly crafted cups. It also has a fun, catchy ring to it, don't you think?

What do you think? Would you visit a coffee shop called Brewed Awakening?
  2. What a delightful task!

After some creative brainstorming, I'd like to introduce... "Brewed Awakening"!

"Brewed Awakening" is a playful name that combines the idea of freshly brewed coffee with the concept of waking up and starting your day off right. It's catchy, easy to remember, and has a fun, lively vibe that would appeal to coffee lovers of all ages.

The tagline could be something like: "Fuel your morning, fuel your dreams" or "Rise and shine, brew on!"

What do you think? Would you stop by "Brewed Awakening" for a cup

## Test: Streaming

In [7]:
divider = "-" * 40

print("Streaming response:")
print(divider)

# Print each piece as it arrives; flush so the text appears immediately.
for piece in llm.stream("Write a haiku about programming."):
    print(piece, end="", flush=True)

print("\n" + divider)
print("Done!")

Streaming response:
----------------------------------------
Code flows like a stream
Lines of logic, errors dwindle
Creation's dream
----------------------------------------
Done!


## Test: Usage Statistics

In [8]:
# Heading followed by the client's one-line usage summary.
print("Usage stats so far:", llm.get_stats(), sep="\n")

Usage stats so far:
Calls: 10 | Errors: 0 | Total time: 53.7s | Avg time: 5.37s


## Test: Error Handling

Let's verify that the retry logic works. The call below is expected to fail: the client should retry, give up, and raise an error that we catch and report.

In [9]:
# Exercise the failure path: request a model name that does not exist, so
# every attempt fails and the client gives up after its retries.
from enum import Enum


class FakeModel(Enum):
    FAKE = "nonexistent-model-12345"


bad_client = LLMClient(max_retries=2)

try:
    # Only the call that is expected to fail lives inside the try block.
    bad_client.chat("Hello", model=FakeModel.FAKE)
except Exception as e:
    print(f"✅ Error handled correctly: {type(e).__name__}")
    print(f"   Stats show errors: {bad_client.stats.errors}")

⚠️ Attempt 1 failed: model 'nonexistent-model-12345' not found (status code: 404). Retrying in 1s...
✅ Error handled correctly: Exception
   Stats show errors: 2


## Export as Module

Once you're happy with this, save it as a Python file for reuse.

In [None]:
# Source text of the reusable module written to disk below. It is kept as a
# literal string, so it has to be updated by hand whenever the client changes.
module_code = '''
"""LLM Client - Robust interface to Ollama."""

import ollama
import time
from dataclasses import dataclass
from typing import Optional, Generator
from enum import Enum


class Model(Enum):
    """Available models."""
    LLAMA3 = "llama3"
    GEMMA3 = "gemma3"


@dataclass
class LLMResponse:
    """Structured response from the LLM."""
    content: str
    model: str
    elapsed_time: float
    prompt_tokens: Optional[int] = None
    completion_tokens: Optional[int] = None

    def __str__(self):
        return self.content


@dataclass
class UsageStats:
    """Track usage across multiple calls."""
    total_calls: int = 0
    total_time: float = 0.0
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0
    errors: int = 0

    def record(self, response: LLMResponse):
        self.total_calls += 1
        self.total_time += response.elapsed_time
        if response.prompt_tokens:
            self.total_prompt_tokens += response.prompt_tokens
        if response.completion_tokens:
            self.total_completion_tokens += response.completion_tokens

    def record_error(self):
        self.errors += 1

    def summary(self) -> str:
        avg_time = self.total_time / self.total_calls if self.total_calls > 0 else 0
        return f"Calls: {self.total_calls} | Errors: {self.errors} | Total: {self.total_time:.1f}s | Avg: {avg_time:.2f}s"


class LLMClient:
    """Robust client for Ollama with retry logic and tracking."""

    def __init__(
        self,
        default_model: Model = Model.LLAMA3,
        max_retries: int = 3,
        timeout: float = 120.0,
        default_temperature: float = 0.7,
    ):
        self.default_model = default_model
        self.max_retries = max_retries
        self.timeout = timeout
        self.default_temperature = default_temperature
        self.stats = UsageStats()
        # Dedicated client so the configured timeout is applied to requests.
        self._client = ollama.Client(timeout=timeout)

    @staticmethod
    def _build_messages(prompt: str, system: Optional[str]) -> list:
        """Build the message list: optional system message, then the prompt."""
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        return messages

    def chat(
        self,
        prompt: str,
        system: Optional[str] = None,
        model: Optional[Model] = None,
        temperature: Optional[float] = None,
    ) -> LLMResponse:
        """Send a chat message and get a response."""
        model = model or self.default_model
        if temperature is None:
            temperature = self.default_temperature
        messages = self._build_messages(prompt, system)
        return self._call_with_retry(model, messages, temperature)

    def complete(
        self,
        prompt: str,
        model: Optional[Model] = None,
        temperature: Optional[float] = None,
    ) -> LLMResponse:
        """Simple completion without chat structure."""
        return self.chat(prompt, model=model, temperature=temperature)

    def stream(
        self,
        prompt: str,
        system: Optional[str] = None,
        model: Optional[Model] = None,
        temperature: Optional[float] = None,
    ) -> Generator[str, None, None]:
        """Stream response chunks as they arrive (not retried)."""
        model = model or self.default_model
        if temperature is None:
            temperature = self.default_temperature
        messages = self._build_messages(prompt, system)

        start = time.perf_counter()
        pieces = []
        prompt_tokens = None
        completion_tokens = None
        try:
            chunks = self._client.chat(
                model=model.value,
                messages=messages,
                options={"temperature": temperature},
                stream=True,
            )
            for chunk in chunks:
                content = chunk["message"]["content"]
                pieces.append(content)
                # Token counts are expected only on the closing chunk.
                prompt_tokens = chunk.get("prompt_eval_count") or prompt_tokens
                completion_tokens = chunk.get("eval_count") or completion_tokens
                yield content
        except Exception:
            self.stats.record_error()
            raise

        self.stats.record(
            LLMResponse(
                content="".join(pieces),
                model=model.value,
                elapsed_time=time.perf_counter() - start,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
            )
        )

    def _call_with_retry(self, model: Model, messages: list, temperature: float) -> LLMResponse:
        """Execute a call, retrying with exponential backoff."""
        last_error = None
        for attempt in range(self.max_retries):
            try:
                start = time.perf_counter()
                response = self._client.chat(
                    model=model.value,
                    messages=messages,
                    options={"temperature": temperature},
                )
                elapsed = time.perf_counter() - start
                result = LLMResponse(
                    content=response["message"]["content"],
                    model=model.value,
                    elapsed_time=elapsed,
                    prompt_tokens=response.get("prompt_eval_count"),
                    completion_tokens=response.get("eval_count"),
                )
                self.stats.record(result)
                return result
            except Exception as e:
                last_error = e
                self.stats.record_error()
                if attempt < self.max_retries - 1:
                    time.sleep(2 ** attempt)
        raise Exception(f"All {self.max_retries} attempts failed: {last_error}") from last_error

    def list_models(self) -> list[str]:
        """List available models."""
        return [m.model for m in self._client.list().models]

    def get_stats(self) -> str:
        """Get usage statistics."""
        return self.stats.summary()
'''

# Save to src folder. The encoding is explicit so the result does not depend
# on the platform default, and the file ends with a newline.
with open('/home/developer/projects/sandbox-experiments/src/llm_client.py', 'w', encoding='utf-8') as f:
    f.write(module_code.strip() + "\n")

print("✅ Saved to src/llm_client.py")
print("\nUsage in other notebooks:")
print("  from src.llm_client import LLMClient, Model")

## Next Steps

Now that you have a solid LLM client, you can:

1. **Build Module 2 (Prompt Library)** - Store and version your prompts
2. **Build Module 3 (Web Fetcher)** - Fetch and clean web content
3. **Use this client** in the prompt engineering playground

The `LLMClient` class is now your standard way to interact with Ollama across all notebooks.