<a href="https://colab.research.google.com/github/gitleon8301/MY-AI-Gizmo-working/blob/main/Colab-TextGen-GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# oobabooga/text-generation-webui

After running both cells, a public Gradio URL will appear at the bottom in around 10 minutes. You can optionally generate an API link.

* Project page: https://github.com/oobabooga/text-generation-webui
* Gradio server status: https://status.gradio.app/

In [None]:
#@title 1. Keep this tab alive to prevent Colab from disconnecting you { display-mode: "form" }

#@markdown Press play on the music player that will appear below:
%%html
<audio src="https://oobabooga.github.io/silence.m4a" controls></audio>

In [None]:
# SIMPLE LLAMA CHAT - NO ERRORS - CPU ONLY
import os
import subprocess
import sys
from pathlib import Path

# ---------- MODEL CHOICE ----------
# Leave exactly one MODEL_REPO / SPECIFIC_FILE pair uncommented.

# 🦙 OPTION 1: Llama 3.2 3B (DEFAULT - FAST)
MODEL_REPO = "bartowski/Llama-3.2-3B-Instruct-GGUF"
SPECIFIC_FILE = "Llama-3.2-3B-Instruct-Q5_K_M.gguf"

# 🦙 OPTION 2: Llama 3.1 8B (BETTER QUALITY)
# MODEL_REPO = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
# SPECIFIC_FILE = "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"

# 💻 OPTION 3: Qwen2.5-Coder 7B (GOOD AT CODING)
# MODEL_REPO = "bartowski/Qwen2.5-Coder-7B-Instruct-GGUF"
# SPECIFIC_FILE = "Qwen2.5-Coder-7B-Instruct-Q5_K_M.gguf"

# 🚀 OPTION 4: Phi-3 Mini (VERY FAST)
# MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
# SPECIFIC_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
# ------------------------------------

# Force CPU: an empty CUDA_VISIBLE_DEVICES is set for this process and is
# inherited by every subprocess started later in the cell.
os.environ.update(CUDA_VISIBLE_DEVICES="")

# Setup: one folder per repo under /content/models, with the "/" of the
# repo id flattened to "_"; the model file lives directly inside it.
models_dir = Path("/content/models", MODEL_REPO.replace("/", "_"))
models_dir.mkdir(exist_ok=True, parents=True)
model_path = models_dir.joinpath(SPECIFIC_FILE)

def run_cmd(cmd):
    """Echo a shell command, execute it, and return its exit status.

    The command string is handed to the shell; its stdout/stderr are not
    captured, so they go wherever this process's own output goes.
    """
    print(f"\n>>> {cmd}\n")
    exit_status = subprocess.call(cmd, shell=True)
    return exit_status

# Startup banner: show which model repo and file this run will use.
# (The emoji were mis-encoded in the original strings and are restored here.)
print("=" * 70)
print("🤖 SIMPLE LLAMA CHAT - CPU MODE")
print("=" * 70)
print(f"📦 Model: {MODEL_REPO}")
print(f"📄 File: {SPECIFIC_FILE}")
print("=" * 70)

# 1) Download model if needed
# A file of 1 MB or less is treated as missing and re-downloaded -- presumably
# to reject truncated or placeholder files.
if model_path.exists() and model_path.stat().st_size > 1_000_000:
    print(f"\n✓ Model already exists: {model_path}")
else:
    print("\n📥 Downloading model...")
    run_cmd("pip install -q huggingface_hub")

    try:
        # Imported here because the package may only just have been installed.
        from huggingface_hub import hf_hub_download
        print(f"Downloading {SPECIFIC_FILE}...")
        # `resume_download` is intentionally not passed: it is deprecated in
        # huggingface_hub (downloads resume automatically when possible) and
        # is scheduled for removal.
        downloaded = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=SPECIFIC_FILE,
            local_dir=str(models_dir),
        )
        print(f"✓ Downloaded to: {downloaded}")
    except Exception as e:
        # Top-level boundary: report the failure, explain the manual
        # workaround, then stop the cell with the original error chained.
        print(f"❌ Download failed: {e}")
        print("\nManual download:")
        print(f"1. Go to: https://huggingface.co/{MODEL_REPO}/tree/main")
        print(f"2. Download: {SPECIFIC_FILE}")
        print(f"3. Upload to: {models_dir}")
        raise SystemExit(1) from e

# 2) Install llama-cpp-python (CPU version) and 3) Gradio for the web interface.
# The pip exit status is checked so a failed install is reported here instead
# of surfacing later as an ImportError inside the chat app. It is only a
# warning because the package may already be available.
for label, package in (
    ("llama-cpp-python (CPU)", "llama-cpp-python"),
    ("Gradio", "gradio"),
):
    print(f"\n📦 Installing {label}...")
    if run_cmd(f"pip install -q {package}") != 0:
        print(
            f"WARNING: pip could not install {package}; "
            "the chat app will fail to start unless it is already installed."
        )

# 4) Create and run the chat interface
print("\n🚀 Starting chat interface...")
print("=" * 70)
# Source code of the stand-alone Gradio chat app, built as an f-string:
# model_path, MODEL_REPO and the CPU count are substituted right now, while
# doubled braces become literal braces in the generated file.
#
# Changes from the previous version of this template:
#   * retry_btn / undo_btn / clear_btn are no longer passed to ChatInterface;
#     Gradio 5 removed them and an unpinned `pip install gradio` would
#     otherwise make the app crash with a TypeError on start-up.
#   * chat() accepts history either as (user, assistant) pairs or as
#     role/content dicts, since the format depends on the Gradio version.
#   * The model path is embedded with repr() so it is always a valid literal.
chat_script = f'''
import gradio as gr
from llama_cpp import Llama

print("Loading model... (this may take 1-2 minutes)")
llm = Llama(
    model_path={str(model_path)!r},
    n_ctx=4096,          # Context window
    n_threads={os.cpu_count() or 2},  # Use all CPU cores
    n_gpu_layers=0,      # CPU only
    verbose=False
)
print("✓ Model loaded!")


def _as_text(content):
    """Flatten a message content value (str, dict or list of parts) to text."""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        return _as_text(content.get("text"))
    if isinstance(content, (list, tuple)):
        return "".join(_as_text(part) for part in content)
    return str(content)


def chat(message, history):
    """Generate response"""
    # Build conversation
    conversation = []
    for turn in history:
        if isinstance(turn, dict):
            # History given as role/content dicts
            conversation.append({{
                "role": turn.get("role", "user"),
                "content": _as_text(turn.get("content")),
            }})
            continue
        # History given as (user, assistant) pairs; either side may be None
        user_text, bot_text = turn[0], turn[1]
        if user_text is not None:
            conversation.append({{"role": "user", "content": _as_text(user_text)}})
        if bot_text is not None:
            conversation.append({{"role": "assistant", "content": _as_text(bot_text)}})
    conversation.append({{"role": "user", "content": message}})

    # Generate
    response = llm.create_chat_completion(
        messages=conversation,
        max_tokens=1024,
        temperature=0.7,
        top_p=0.9,
        stream=True
    )

    # Stream output: yield the text accumulated so far after every new piece
    partial = ""
    for chunk in response:
        choices = chunk.get("choices") or []
        if not choices:
            continue
        piece = choices[0].get("delta", {{}}).get("content")
        if piece:
            partial += piece
            yield partial

# Create interface
demo = gr.ChatInterface(
    fn=chat,
    title="🦙 Llama Chat (CPU Mode)",
    description="Model: {MODEL_REPO} | Running on CPU",
    examples=[
        "What is the capital of France?",
        "Write a Python function to calculate fibonacci numbers",
        "Explain quantum computing in simple terms"
    ],
)

# Launch with public URL
demo.launch(share=True, server_name="0.0.0.0")
'''

# Save the script
# UTF-8 is given explicitly: the script contains non-ASCII characters, and
# the platform default encoding is not guaranteed to be able to encode them.
script_path = Path("/content/chat_app.py")
script_path.write_text(chat_script, encoding="utf-8")

# Run it
print("\n🌐 Starting server... Look for the public URL below!\n")
print("=" * 70)
# Launch with the interpreter that is running this cell (rather than whatever
# "python" resolves to on PATH) and without going through a shell. The call
# blocks until the server process exits.
server = subprocess.run([sys.executable, str(script_path)])
if server.returncode != 0:
    print(f"\nChat server exited with status {server.returncode}")