### Cell 1 - Install Ollama and verify environment

Installs the `ollama` Python client and `requests`, sets the Ollama and MLflow endpoint configuration, and checks whether the Ollama server is reachable.

In [None]:
!pip -q install ollama requests --disable-pip-version-check

import os, subprocess, time, json, requests
from pathlib import Path

# Endpoint configuration: a value already present in the environment wins, otherwise
# the default service URL is used. OLLAMA_HOST is written back to os.environ so that
# code reading the environment later sees the resolved value.
os.environ['OLLAMA_HOST'] = os.getenv('OLLAMA_HOST', 'http://ai-starter-kit-ollama:11434')
MODEL_NAME = "qwen2.5:1.5b"
MLFLOW_URI = os.getenv("MLFLOW_TRACKING_URI", "http://ai-starter-kit-mlflow:5000")

OLLAMA_HOST = os.environ['OLLAMA_HOST']

print("Environment Configuration:")
print("Ollama Host:", OLLAMA_HOST)
print("Model:      ", MODEL_NAME)
print("MLflow:     ", MLFLOW_URI)
print("-" * 60)

# Probe the server. Only network/HTTP/JSON failures are treated as "not running";
# anything else (e.g. KeyboardInterrupt, programming errors) is allowed to surface.
try:
    r = requests.get(f"{OLLAMA_HOST}/api/version", timeout=5)
    r.raise_for_status()  # do not print an HTTP error body as if it were a version
    print("Ollama version:", r.json())
except (requests.RequestException, ValueError) as e:
    print("Note: Ollama service not running. Starting it in next cell...")
    print(f"  ({type(e).__name__}: {e})")

### Cell 2 - Start Ollama service and pull model

Starts the Ollama service if not running, pulls the Qwen 2.5 1.5B model, and verifies it's ready.

In [None]:
import subprocess, time, requests, os

# Resolve the endpoint and model tag again in this cell (environment value first,
# then the default) — presumably so the cell does not depend on Cell 1's variables.
OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'http://ai-starter-kit-ollama:11434')
MODEL_NAME = "qwen2.5:1.5b"

def check_ollama() -> bool:
    """Return True when the Ollama server answers ``GET /api/tags`` with HTTP 200.

    Connection errors and timeouts (2 s) are reported as ``False`` instead of
    being raised.
    """
    try:
        r = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=2)
    except requests.RequestException:
        # Deliberately narrow: a bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and hide programming errors.
        return False
    return r.status_code == 200

import json  # decodes the pull progress stream; this cell's import line does not include it

# Try to launch a local server only when the probe fails and the configured host
# still carries the default service-name prefix.
if not check_ollama() and OLLAMA_HOST.startswith("http://ai-starter-kit-ollama"):
    print("Starting Ollama service...")
    try:
        subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        time.sleep(3)  # give the server a moment to bind before re-probing
    except Exception as e:
        print(f"Could not start Ollama automatically: {e}")
        print("Please start Ollama manually with: ollama serve")

if check_ollama():
    print("Ollama service is running")

    print(f"\nPulling model {MODEL_NAME}...")
    try:
        # Skip the download when the model tag is already listed by the server.
        r = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=10)
        r.raise_for_status()
        models = r.json().get('models', [])
        model_exists = any(m.get('name') == MODEL_NAME for m in models)

        if not model_exists:
            pull_data = {"name": MODEL_NAME}
            pull_error = None
            # Connect timeout only: a download can legitimately pause between chunks.
            with requests.post(
                f"{OLLAMA_HOST}/api/pull", json=pull_data, stream=True, timeout=(10, None)
            ) as r:
                r.raise_for_status()
                for line in r.iter_lines():
                    if not line:
                        continue
                    try:
                        status = json.loads(line)
                    except ValueError:
                        continue  # ignore a malformed progress line, keep streaming
                    if 'error' in status:
                        # Failures arrive inside the stream; do not report success.
                        pull_error = status['error']
                        break
                    if 'status' in status:
                        print(f"  {status['status']}", end='\r')
            if pull_error:
                print(f"\nError pulling model: {pull_error}")
            else:
                print(f"\nModel {MODEL_NAME} pulled successfully")
        else:
            print(f"Model {MODEL_NAME} already available")
    except Exception as e:
        print(f"Error pulling model: {e}")
else:
    print("Warning: Ollama service is not running")
    print("Please ensure Ollama is installed and running")

### Cell 3 - Create OpenAI-compatible API wrapper

Sets up a simple FastAPI server that wraps Ollama with an OpenAI-compatible API, including MLflow tracking.

In [None]:
!pip -q install fastapi uvicorn mlflow --disable-pip-version-check

import os, subprocess, time, json, requests, threading
from pathlib import Path

# Source of the standalone wrapper server. It is held in a string because the cell
# writes it to /tmp/ollama_wrapper.py and runs it as a separate process.
api_wrapper_code = '''
import os, time, uuid, requests, json
from fastapi import FastAPI, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
import uvicorn

# MLflow tracking is optional: enabled only when mlflow imports and
# MLFLOW_TRACKING_URI is set. Setup failures are logged and tracking stays off.
USE_MLFLOW = False
try:
    import mlflow
    mlflow_uri = os.getenv("MLFLOW_TRACKING_URI")
    if mlflow_uri:
        mlflow.set_tracking_uri(mlflow_uri)
        mlflow.set_experiment("ollama-llm")
        USE_MLFLOW = True
except Exception as exc:
    print(f"MLflow tracking disabled: {exc}", flush=True)

app = FastAPI()
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://ai-starter-kit-ollama:11434")
MODEL_NAME = os.getenv("MODEL_NAME", "qwen2.5:1.5b")

@app.get("/v1/healthz")
async def health():
    """Liveness probe: reports the model this wrapper is configured to serve."""
    return {"status": "ok", "model": MODEL_NAME}

@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Forward an OpenAI-style chat request to Ollama and reshape the reply.

    Reads "messages", "temperature" and "max_tokens" from the JSON body; the
    configured MODEL_NAME is always used. Upstream failures become HTTP 500
    with an {"error": ...} body.
    """
    t0 = time.time()
    body = await request.json()

    messages = body.get("messages", [])
    temperature = body.get("temperature", 0.7)
    max_tokens = body.get("max_tokens", 256)

    # Call Ollama API
    ollama_payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "stream": False,
        "options": {
            "temperature": temperature,
            "num_predict": max_tokens
        }
    }

    try:
        # requests is blocking; run it in a worker thread so the event loop
        # (and /v1/healthz) stays responsive while the model generates.
        r = await run_in_threadpool(
            requests.post, f"{OLLAMA_HOST}/api/chat", json=ollama_payload, timeout=120
        )
        r.raise_for_status()
        ollama_response = r.json()

        content = ollama_response.get("message", {}).get("content", "")

        # Prefer the token counts Ollama reports; fall back to a whitespace
        # word count when a field is missing from the response.
        prompt_tokens = ollama_response.get("prompt_eval_count")
        if prompt_tokens is None:
            prompt_tokens = len(" ".join(m.get("content", "") for m in messages).split())
        completion_tokens = ollama_response.get("eval_count")
        if completion_tokens is None:
            completion_tokens = len(content.split())

        # Report truncation by num_predict as "length"; everything else as "stop".
        finish_reason = "length" if ollama_response.get("done_reason") == "length" else "stop"

        if USE_MLFLOW:
            try:
                with mlflow.start_run():
                    mlflow.log_params({
                        "temperature": temperature,
                        "max_tokens": max_tokens,
                        "model": MODEL_NAME
                    })
                    # Metric names are unchanged so existing runs stay comparable.
                    mlflow.log_metrics({
                        "duration_ms": int((time.time() - t0) * 1000),
                        "prompt_tokens_approx": prompt_tokens,
                        "completion_tokens_approx": completion_tokens,
                        "total_tokens_approx": prompt_tokens + completion_tokens
                    })
            except Exception as exc:
                # Tracking is best-effort: never fail the request, but leave a trace.
                print(f"MLflow logging failed: {exc}", flush=True)

        return {
            "id": "chatcmpl-" + uuid.uuid4().hex[:8],
            "object": "chat.completion",
            "created": int(time.time()),
            "model": MODEL_NAME,
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": content},
                "finish_reason": finish_reason
            }],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens
            }
        }
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
'''

# Write the wrapper source to disk so it can be run as a standalone script.
with open('/tmp/ollama_wrapper.py', 'w') as f:
    f.write(api_wrapper_code)

# Stop any wrapper process left from an earlier run of this cell; a "no process
# matched" result is ignored.
!pkill -f ollama_wrapper.py 2>/dev/null || true

# Shell snippet exporting the wrapper's configuration. Host and MLflow URI come from
# the current environment (with defaults); MODEL_NAME is fixed here.
# NOTE(review): MLFLOW_TRACKING_URI is always exported, so the wrapper's
# "is the URI set?" check always passes and tracking is attempted on every launch.
env_vars = f"""
export OLLAMA_HOST="{os.getenv('OLLAMA_HOST', 'http://ai-starter-kit-ollama:11434')}"
export MODEL_NAME="qwen2.5:1.5b"
export MLFLOW_TRACKING_URI="{os.getenv('MLFLOW_TRACKING_URI', 'http://ai-starter-kit-mlflow:5000')}"
"""

# IPython substitutes {env_vars} before the shell runs the line; the single-quoted
# echo assumes the values contain no single quotes.
!echo '{env_vars}' > /tmp/env_vars.sh
# Source the exports and start the wrapper in the background; its stdout and stderr
# are redirected to /tmp/wrapper.log.
!bash -c 'source /tmp/env_vars.sh && nohup python /tmp/ollama_wrapper.py > /tmp/wrapper.log 2>&1 &'

print("Starting API wrapper...")
for i in range(30):
    time.sleep(1)
    try:
        r = requests.get("http://localhost:8000/v1/healthz", timeout=1)
        if r.status_code == 200:
            print("API Status:", r.json())
            print(f"\nOpenAI-compatible API running at: http://localhost:8000/v1")
            print(f"Health: http://localhost:8000/v1/healthz")
            print(f"Chat:   http://localhost:8000/v1/chat/completions")
            break
    except:
        if i % 5 == 0:
            print(f"  Waiting for API to start... ({i}s)")
        continue
else:
    print("\nAPI wrapper failed to start. Checking logs:")
    !tail -20 /tmp/wrapper.log
    print("\nYou can still use direct Ollama API in the next cells.")

### Cell 4 - Basic client + latency test

Tests the OpenAI-compatible API with a simple chat request and measures latency.

In [None]:
import os, time, requests, json

# Backend switch: True targets the local wrapper's /v1 routes, False targets the
# Ollama server given by OLLAMA_HOST (or its default).
USE_WRAPPER = True
BASE_URL = "http://localhost:8000/v1" if USE_WRAPPER else os.getenv("OLLAMA_HOST", "http://ai-starter-kit-ollama:11434")

def health():
    """Print a one-line status report for the selected backend.

    With the wrapper enabled, queries ``/healthz`` and prints the status code
    and JSON body. Otherwise queries Ollama's ``/api/tags`` and prints the
    status code and the number of models listed.
    """
    if not USE_WRAPPER:
        reply = requests.get(f"{BASE_URL}/api/tags", timeout=10)
        model_count = len(reply.json().get('models', []))
        print("Health:", reply.status_code, "Models available:", model_count)
        return

    reply = requests.get(f"{BASE_URL}/healthz", timeout=10)
    print("Health:", reply.status_code, reply.json())

def chat(prompt, temperature=0.4, max_tokens=220, model="qwen2.5:1.5b"):
    """Send one user prompt to the selected backend and return the reply text.

    Prints the request latency, the usage information and the reply.

    Args:
        prompt: User message text.
        temperature: Sampling temperature forwarded to the backend.
        max_tokens: Generation limit (``max_tokens`` for the wrapper,
            ``num_predict`` for Ollama).
        model: Model tag placed in the request body. Defaults to the tag used
            throughout this notebook.

    Returns:
        The assistant's reply text.

    Raises:
        requests.HTTPError: If the backend answers with a non-2xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # Both backends take the same message list; build it once.
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Be concise."},
        {"role": "user", "content": prompt},
    ]

    if USE_WRAPPER:
        body = {
            "model": model,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "messages": messages,
        }
        endpoint = f"{BASE_URL}/chat/completions"
    else:
        body = {
            "model": model,
            "messages": messages,
            "stream": False,
            "options": {
                "temperature": temperature,
                "num_predict": max_tokens
            }
        }
        endpoint = f"{BASE_URL}/api/chat"

    # perf_counter is monotonic, so the latency cannot be skewed by clock changes.
    t0 = time.perf_counter()
    r = requests.post(endpoint, json=body, timeout=120)
    dt = time.perf_counter() - t0
    r.raise_for_status()

    response = r.json()
    if USE_WRAPPER:
        content = response["choices"][0]["message"]["content"]
        usage = response.get("usage", {})
    else:
        content = response.get("message", {}).get("content", "")
        # Direct mode has no usage block here; report a whitespace word count.
        usage = {"total_tokens": "estimated: " + str(len(content.split()) + len(prompt.split()))}

    print(f"\nLatency: {dt:.2f}s  | usage: {usage}")
    print("\n---\n", content)
    return content

# Smoke test: report backend status, then send one prompt. chat() prints the reply
# itself, so its return value is discarded.
health()
_ = chat("Say 'test ok' then give me one short fun fact about llamas.")

### Cell 5 - Multi-agent pipeline

Implements a simple three-agent workflow (Researcher -> Writer -> Critic) using the local LLM.

In [None]:
import os, requests, json, time

# Wrapper base URL (OpenAI-style routes) and the Ollama server URL used when the
# wrapper is bypassed.
BASE_URL = "http://localhost:8000/v1"  
OLLAMA_DIRECT = os.getenv("OLLAMA_HOST", "http://ai-starter-kit-ollama:11434")

def call_llm(role_prompt, user_message, temperature=0.4, max_tokens=150, use_wrapper=True):
    """Run one system+user exchange against the model and return the reply text.

    When ``use_wrapper`` is true the request goes to the wrapper's
    ``/chat/completions`` route under ``BASE_URL``; otherwise it goes to
    ``/api/chat`` on ``OLLAMA_DIRECT``. Any exception raised while sending the
    request or reading the reply is returned as the string ``"Error: <message>"``
    instead of being raised.
    """
    conversation = [
        {"role": "system", "content": role_prompt},
        {"role": "user", "content": user_message}
    ]

    if use_wrapper:
        payload = {
            "model": "qwen2.5:1.5b",
            "temperature": temperature,
            "max_tokens": max_tokens,
            "messages": conversation
        }
    else:
        payload = {
            "model": "qwen2.5:1.5b",
            "messages": conversation,
            "stream": False,
            "options": {
                "temperature": temperature,
                "num_predict": max_tokens
            }
        }

    try:
        if use_wrapper:
            reply = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=120)
            reply.raise_for_status()
            return reply.json()["choices"][0]["message"]["content"]

        reply = requests.post(f"{OLLAMA_DIRECT}/api/chat", json=payload, timeout=120)
        reply.raise_for_status()
        return reply.json().get("message", {}).get("content", "")
    except Exception as err:
        return f"Error: {err}"

print("=" * 60)
print("Running Multi-Agent Workflow with Ollama")
print("=" * 60)

task = "Research the latest advancements in quantum computing as of 2025."

# Use the wrapper only when its health endpoint answers 200; otherwise talk to
# Ollama directly. The banner is chosen from the final decision, so a non-200
# health reply no longer announces the wrapper while the direct API is used.
try:
    r = requests.get(f"{BASE_URL}/healthz", timeout=2)
    use_wrapper = r.status_code == 200
except requests.RequestException:
    use_wrapper = False

if use_wrapper:
    print("Using: OpenAI-compatible wrapper\n")
else:
    print("Using: Direct Ollama API\n")

# Stage 1: gather facts about the task.
print("1. RESEARCHER:")
print("-" * 40)
research_prompt = "You are a researcher. Provide 3-4 key facts about the topic. Be concise and factual."
research_notes = call_llm(research_prompt, task, temperature=0.35, max_tokens=140, use_wrapper=use_wrapper)
print(research_notes)
time.sleep(1)

# Stage 2: turn the researcher's notes into a report.
print("\n2. WRITER:")
print("-" * 40)
writer_prompt = "You are a technical writer. Based on the following notes, write a brief report."
writer_task = f"Write a report based on these notes:\n{research_notes}"
report = call_llm(writer_prompt, writer_task, temperature=0.55, max_tokens=220, use_wrapper=use_wrapper)
print(report)
time.sleep(1)

# Stage 3: have an editor polish the writer's report.
print("\n3. CRITIC/EDITOR:")
print("-" * 40)
critic_prompt = "You are an editor. Review the report and provide a final polished version."
critic_task = f"Review and improve this report:\n{report}"
final_output = call_llm(critic_prompt, critic_task, temperature=0.45, max_tokens=160, use_wrapper=use_wrapper)
print(final_output)

print("\n" + "=" * 60)
print("Multi-agent workflow complete")
print("=" * 60)

### Cell 6 - MLflow: connect to tracking server and list recent runs

Connects to MLflow tracking server and displays recent model inference runs with metrics.

In [None]:
!pip -q install mlflow==2.14.3 --disable-pip-version-check

import os, mlflow
from datetime import datetime

# Point the MLflow client at the tracking server (environment value first, then default).
tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "http://ai-starter-kit-mlflow:5000")
mlflow.set_tracking_uri(tracking_uri)
print(f"MLflow Tracking URI: {tracking_uri}")

# Select the experiment the wrapper logs to and show its identity.
exp_name = "ollama-llm"
exp = mlflow.set_experiment(exp_name)
print(f"Experiment: {exp.name} (ID: {exp.experiment_id})")
print("-" * 60)

# Fetch the ten most recently started runs of that experiment.
client = mlflow.tracking.MlflowClient()
runs = client.search_runs(
    exp.experiment_id,
    order_by=["attributes.start_time DESC"],
    max_results=10
)

if runs:
    print(f"\nFound {len(runs)} recent runs:")
    print("-" * 60)

    # One detail section per run; missing params/metrics are shown as N/A.
    for position, entry in enumerate(runs, 1):
        logged_metrics = entry.data.metrics
        logged_params = entry.data.params
        # start_time is in milliseconds since the epoch.
        started = datetime.fromtimestamp(entry.info.start_time/1000).strftime('%Y-%m-%d %H:%M:%S')
        elapsed_ms = logged_metrics.get('duration_ms', 'N/A')
        temperature_used = logged_params.get('temperature', 'N/A')
        token_limit = logged_params.get('max_tokens', 'N/A')
        token_total = logged_metrics.get('total_tokens_approx', 'N/A')

        print(f"\nRun {position}:")
        print(f"  ID:          {entry.info.run_id[:12]}...")
        print(f"  Time:        {started}")
        print(f"  Status:      {entry.info.status}")
        print(f"  Temperature: {temperature_used}")
        print(f"  Max Tokens:  {token_limit}")
        print(f"  Duration:    {elapsed_ms} ms")
        print(f"  Total Tokens: {token_total}")

    print("\n" + "=" * 60)
    print("SUMMARY:")

    # Count finished runs and collect the non-zero durations for the average.
    successful = 0
    durations = []
    for entry in runs:
        if entry.info.status == 'FINISHED':
            successful += 1
        recorded_ms = entry.data.metrics.get('duration_ms')
        if recorded_ms:
            durations.append(recorded_ms)

    if durations:
        avg_duration = sum(durations) / len(durations)
    else:
        avg_duration = 0

    print(f"  Total Runs: {len(runs)}")
    print(f"  Successful: {successful}")
    print(f"  Failed: {len(runs) - successful}")
    if avg_duration:
        print(f"  Avg Duration: {avg_duration:.1f} ms")
    else:
        print("  Avg Duration: N/A")
else:
    print("No runs found. Run cells 4 or 5 first to generate inference requests.")

print("\n" + "=" * 60)
print("MLflow verification complete")