# Model Caching Strategies for GPT-OSS

This notebook demonstrates different ways to keep the GPT-OSS model loaded and responsive.


In [None]:
import requests
import json
import time
from datetime import datetime

# Configuration
# Base URL the requests below are sent to; API paths such as /api/ps and
# /api/chat are appended to it.
OLLAMA_BASE_URL = "http://localhost:11434"
# Model identifier sent in the "model" field of the chat request payload.
MODEL_NAME = "gpt-oss:20b"


In [None]:
# Method 1: Check current model status
def check_model_status(timeout=10):
    """Print the models the server currently reports as loaded.

    Sends a GET to ``{OLLAMA_BASE_URL}/api/ps`` and prints one entry per
    model (name, ``size_vram`` and ``expires_at``), or a notice when the
    list is empty. Every failure is reported with ``print``; nothing is
    raised for network or parsing problems and nothing is returned.

    Args:
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits forever on an unresponsive server.
    """
    try:
        response = requests.get(f"{OLLAMA_BASE_URL}/api/ps", timeout=timeout)
    except requests.RequestException as e:
        # Connection refused, timeout, DNS failure, etc.
        print(f"Error checking model status: {e}")
        return

    if response.status_code != 200:
        print("Could not check model status")
        return

    try:
        loaded = response.json().get('models')
    except (ValueError, AttributeError) as e:
        # Body was not JSON, or the JSON was not an object.
        print(f"Error checking model status: {e}")
        return

    print("Currently loaded models:")
    if not loaded:
        print("  No models currently loaded")
        return

    for model in loaded:
        # .get() throughout so one incomplete entry cannot abort the listing.
        print(f"  - {model.get('name', 'Unknown')} (Size: {model.get('size_vram', 'Unknown')})")
        print(f"    Until: {model.get('expires_at', 'Unknown')}")

# Run the status check right away so this cell prints the loaded models.
check_model_status()


In [None]:
# Method 2: Preload model with keep-alive setting
def preload_model(keep_alive="30m", timeout=120):
    """Load the model by sending one short chat request and report the timing.

    Posts a single non-streaming message to ``{OLLAMA_BASE_URL}/api/chat``
    with the given ``keep_alive`` value, then prints how long the request
    took and the first 100 characters of the reply. Every failure is
    reported with ``print``; nothing is raised for network or parsing
    problems and nothing is returned.

    Args:
        keep_alive: Value sent unchanged as the request's ``keep_alive``
            field, i.e. how long the server is asked to keep the model
            loaded. Defaults to ``"30m"``.
        timeout: Seconds to wait for the HTTP response. Defaults to 120,
            since the first request may have to load the model.
    """
    print(f"Preloading {MODEL_NAME} with keep-alive: {keep_alive}")

    payload = {
        "model": MODEL_NAME,
        "keep_alive": keep_alive,  # Requested time to keep the model loaded
        "messages": [
            {"role": "user", "content": "Hello, please confirm you're ready."}
        ],
        "stream": False
    }

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    try:
        response = requests.post(f"{OLLAMA_BASE_URL}/api/chat", json=payload, timeout=timeout)
    except requests.RequestException as e:
        # Connection refused, timeout, DNS failure, etc.
        print(f"Error preloading model: {e}")
        return
    load_time = time.perf_counter() - start_time

    if response.status_code != 200:
        print(f"❌ Failed to load model: {response.status_code}")
        return

    try:
        preview = response.json()['message']['content'][:100]
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON or did not have the expected message/content shape.
        print(f"Error preloading model: {e}")
        return

    print(f"✅ Model loaded successfully in {load_time:.2f}s")
    print(f"Response: {preview}...")
    print(f"Model will stay loaded until: {datetime.now().strftime('%H:%M')} + {keep_alive}")

# Preload now, asking the server to keep the model loaded for 30 minutes
preload_model("30m")
