# Agentic Framework — Fully Automatic Google Colab Deployment

**One-click deployment**: Just click **Runtime → Run all** (or `Ctrl+F9`) and everything will start automatically.

### What this does
1. Verifies GPU (H100/A100) and system resources
2. Installs system dependencies (PostgreSQL, Redis, Node.js 22, MinIO)
3. Installs Ollama + pulls DeepSeek R1 14B (GPU-accelerated)
4. Clones the repo and installs Python packages
5. Starts all infrastructure (PostgreSQL, Redis, ChromaDB, MinIO)
6. Starts all 5 microservices + dashboard
7. Creates ngrok tunnels for external access
8. Runs health checks
9. Keeps the session alive so Colab doesn't disconnect

### Prerequisites
- Google Colab **Pro** account (for GPU access)
- Runtime set to **GPU** (Runtime → Change runtime type → T4/A100/H100)

---

In [1]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  CONFIGURATION — Edit these before running                  ║
# ╚══════════════════════════════════════════════════════════════╝

# GitHub repo to clone
REPO_URL = "https://github.com/landonking-gif/ai_final.git"
# ╔══════════════════════════════════════════════════════════════╗
# ║  PULL LATEST FIXES FROM GITHUB                              ║
# ╚══════════════════════════════════════════════════════════════╝

!cd /content && rm -rf ai_final
!git clone https://github.com/landonking-gif/ai_final.git
!cd ai_final && ls -la

print("✅ Latest fixes pulled from GitHub!")

# (Optional) Set your ngrok auth token for stable URLs
# Get one free at https://dashboard.ngrok.com/signup
NGROK_AUTH_TOKEN = "39MaIP07IiJMHPNDgd3raMEOL6r_2KyacFVXP68bbxBu9s8E8"

# LLM model to use (pulled via Ollama)
PRIMARY_MODEL = "deepseek-r1:14b"
FALLBACK_MODEL = "llama3.2:3b"

# Whether to start the React dashboard (adds ~30s startup)
START_DASHBOARD = True

# Whether to create ngrok tunnel for external access
ENABLE_NGROK = True

print("Configuration loaded. Running full deployment...")

Cloning into 'ai_final'...
remote: Enumerating objects: 60777, done.[K
remote: Counting objects: 100% (162/162), done.[K
remote: Compressing objects: 100% (130/130), done.[K
remote: Total 60777 (delta 53), reused 124 (delta 30), pack-reused 60615 (from 1)[K
Receiving objects: 100% (60777/60777), 60.30 MiB | 17.27 MiB/s, done.
Resolving deltas: 100% (22791/22791), done.
Updating files: 100% (79496/79496), done.
total 312
drwxr-xr-x 10 root root  4096 Feb  8 02:06 .
drwxr-xr-x  1 root root  4096 Feb  8 02:05 ..
-rw-r--r--  1 root root  4812 Feb  8 02:06 agentic-framework-deploy-auto.ipynb
drwxr-xr-x 17 root root  4096 Feb  8 02:06 agentic-framework-main
-rw-r--r--  1 root root  1105 Feb  8 02:06 check_deployment_status.ps1
-rw-r--r--  1 root root  8252 Feb  8 02:06 colab_automated_deploy.py
-rw-r--r--  1 root root 58478 Feb  8 02:06 colab_auto_run.ipynb
-rw-r--r--  1 root root  2771 Feb  8 02:06 colab_critical_diagnostic.py
-rw-r--r--  1 root root 34957 Feb  8 02:06 colab_deploy.ipyn

In [2]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  PHASE 1: System Check & Dependencies                      ║
# ╚══════════════════════════════════════════════════════════════╝
import subprocess, os, sys, shutil, time

def run_cmd(cmd, desc="", check=False):
    """Run a shell command with status output."""
    if desc:
        print(f"  [{desc}]", end=" ", flush=True)
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if desc:
        print("OK" if result.returncode == 0 else f"WARN ({result.stderr[:120]})")
    if check and result.returncode != 0:
        raise RuntimeError(f"{desc} failed: {result.stderr[:300]}")
    return result

print("=" * 60)
print("PHASE 1: SYSTEM CHECK & DEPENDENCY INSTALL")
print("=" * 60)

# --- GPU Check ---
gpu_check = subprocess.run(
    ["nvidia-smi", "--query-gpu=name,memory.total,driver_version", "--format=csv,noheader"],
    capture_output=True, text=True
)
if gpu_check.returncode == 0:
    print(f"  [GPU] {gpu_check.stdout.strip()}")
else:
    print("  [GPU] No GPU detected — LLM inference will be slow on CPU!")
    print("         Go to Runtime > Change runtime type > GPU")

# --- RAM & Disk ---
try:
    import psutil
    ram_gb = psutil.virtual_memory().total / (1024**3)
    print(f"  [RAM] {ram_gb:.1f} GB")
except ImportError:
    pass
disk = shutil.disk_usage("/")
print(f"  [Disk] {disk.free / (1024**3):.1f} GB free")
print(f"  [Python] {sys.version.split()[0]}")

# --- Install System Dependencies ---
print("\n  Installing system packages...")
run_cmd("apt-get update -qq 2>/dev/null", "apt update")
run_cmd("apt-get install -y -qq postgresql postgresql-client redis-server build-essential libpq-dev zstd > /dev/null 2>&1", "PostgreSQL + Redis + build tools + zstd")

# Node.js 22
run_cmd("curl -fsSL https://deb.nodesource.com/setup_22.x | bash - > /dev/null 2>&1", "Node.js 22 repo")
run_cmd("apt-get install -y -qq nodejs > /dev/null 2>&1", "Node.js 22")

# MinIO binary
run_cmd("wget -q https://dl.min.io/server/minio/release/linux-amd64/minio -O /usr/local/bin/minio && chmod +x /usr/local/bin/minio", "MinIO")

node_ver = subprocess.run("node --version", shell=True, capture_output=True, text=True)
print(f"  [Node.js] {node_ver.stdout.strip()}")
print("\n  Phase 1 complete.")
print("=" * 60)


PHASE 1: SYSTEM CHECK & DEPENDENCY INSTALL
  [GPU] Tesla T4, 15360 MiB, 550.54.15
  [RAM] 12.7 GB
  [Disk] 178.7 GB free
  [Python] 3.12.12

  Installing system packages...
  [apt update] OK
  [PostgreSQL + Redis + build tools + zstd] OK
  [Node.js 22 repo] OK
  [Node.js 22] OK
  [MinIO] OK
  [Node.js] v22.22.0

  Phase 1 complete.


In [3]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  PHASE 2: Ollama + LLM Models (GPU-Accelerated)            ║
# ╚══════════════════════════════════════════════════════════════╝
import subprocess, os, time

print("=" * 60)
print("PHASE 2: OLLAMA + LLM MODEL SETUP")
print("=" * 60)

# Install Ollama
print("  Installing Ollama...", end=" ", flush=True)

# Download the install script
subprocess.run("wget -q https://ollama.com/install.sh -O /tmp/ollama_install.sh", shell=True, check=True)
subprocess.run("chmod +x /tmp/ollama_install.sh", shell=True, check=True)

# Run the install script with sudo, capturing output
install_command = "sudo /tmp/ollama_install.sh"
install_process = subprocess.Popen(install_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
stdout, stderr = install_process.communicate()

if install_process.returncode == 0:
    print("OK")
else:
    print(f"WARN: Ollama installation script returned non-zero exit code ({install_process.returncode}).")
    print(f"Installation STDOUT:\n{stdout}")
    print(f"Installation STDERR:\n{stderr}")

# Verify Ollama executable exists
OLLAMA_BIN_PATH = "/usr/local/bin/ollama"
if not os.path.exists(OLLAMA_BIN_PATH):
    print(f"  [ERROR] Ollama executable not found at {OLLAMA_BIN_PATH}. Installation might have failed or installed elsewhere.")
    print("  Attempting to locate ollama binary...")
    find_ollama_result = subprocess.run("find / -name ollama 2>/dev/null", shell=True, capture_output=True, text=True)
    found_paths = find_ollama_result.stdout.strip().split('\n')
    if found_paths and found_paths[0]: # If anything was found
        print(f"  Found ollama at: {found_paths[0]}. Please check this path.")
    else:
        print("  Ollama not found anywhere on the system after installation attempt.")
    raise FileNotFoundError(f"Ollama executable not found at {OLLAMA_BIN_PATH}")

# Start Ollama server in background
print("  Starting Ollama server...", end=" ", flush=True)
os.environ["OLLAMA_HOST"] = "0.0.0.0:11434"
subprocess.Popen(
    [OLLAMA_BIN_PATH, "serve"],
    stdout=open("/tmp/ollama.log", "w"),
    stderr=subprocess.STDOUT,
    env={**os.environ, "OLLAMA_HOST": "0.0.0.0:11434"}
)
time.sleep(5)
print("OK")

# Pull primary model
print(f"  Pulling {PRIMARY_MODEL} (this may take 2-8 min)...")
subprocess.run([OLLAMA_BIN_PATH, "pull", PRIMARY_MODEL], capture_output=False, text=True)

# Pull fallback model
print(f"  Pulling {FALLBACK_MODEL}...")
subprocess.run([OLLAMA_BIN_PATH, "pull", FALLBACK_MODEL], capture_output=False, text=True)

# Verify
print("\n  Available models:")
subprocess.run([OLLAMA_BIN_PATH, "list"], capture_output=False, text=True)

print("\n  Phase 2 complete.")
print("=" * 60)


PHASE 2: OLLAMA + LLM MODEL SETUP
  Installing Ollama... OK
  Starting Ollama server... OK
  Pulling deepseek-r1:14b (this may take 2-8 min)...
  Pulling llama3.2:3b...

  Available models:

  Phase 2 complete.


In [4]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  PHASE 3: Clone Repo + Install Python Packages             ║
# ╚══════════════════════════════════════════════════════════════╝
import subprocess, os, sys

print("=" * 60)
print("PHASE 3: REPO CLONE & PYTHON DEPENDENCIES")
print("=" * 60)

INSTALL_DIR = "/content/ai_final"
FRAMEWORK_DIR = f"{INSTALL_DIR}/agentic-framework-main"

# Clone or update
if os.path.exists(INSTALL_DIR):
    print("  Repo exists — pulling latest...")
    subprocess.run(["git", "-C", INSTALL_DIR, "pull"], capture_output=False, text=True)
else:
    print(f"  Cloning {REPO_URL}...")
    subprocess.run(["git", "clone", REPO_URL, INSTALL_DIR], capture_output=False, text=True)

os.chdir(FRAMEWORK_DIR)

# Create symlinks (hyphenated dirs → underscored for Python imports)
symlinks = {
    "memory_service": "memory-service",
    "subagent_manager": "subagent-manager",
    "mcp_gateway": "mcp-gateway",
    "code_exec": "code-exec",
}
for link_name, target in symlinks.items():
    if not os.path.exists(link_name) and os.path.exists(target):
        os.symlink(target, link_name)
        print(f"  Symlink: {link_name} -> {target}")

# Install Python dependencies
print("\n  Installing Python packages (2-3 min)...")
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q",
     "-r", f"{FRAMEWORK_DIR}/requirements.txt"],
    capture_output=False, text=True
)

# Extra packages for Colab
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q",
     "pyngrok", "asyncpg", "aiofiles", "psutil"],
    capture_output=False, text=True
)

# Install OpenClaw
print("  Installing OpenClaw...")
subprocess.run(["npm", "install", "-g", "openclaw@latest"],
               capture_output=True, text=True)

# Add framework to PYTHONPATH
if FRAMEWORK_DIR not in sys.path:
    sys.path.insert(0, FRAMEWORK_DIR)
os.environ["PYTHONPATH"] = FRAMEWORK_DIR

print(f"\n  Framework directory: {FRAMEWORK_DIR}")
print("  Phase 3 complete.")
print("=" * 60)

PHASE 3: REPO CLONE & PYTHON DEPENDENCIES
  Repo exists — pulling latest...
  Symlink: memory_service -> memory-service
  Symlink: subagent_manager -> subagent-manager
  Symlink: mcp_gateway -> mcp-gateway
  Symlink: code_exec -> code-exec

  Installing Python packages (2-3 min)...
  Installing OpenClaw...

  Framework directory: /content/ai_final/agentic-framework-main
  Phase 3 complete.


In [10]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  PHASE 4: Start Infrastructure + All Services               ║
# ╚══════════════════════════════════════════════════════════════╝
import subprocess, os, sys, time, urllib.request, json, socket

# ─── CONFIGURATION ───
# Redefine here to ensure self-contained recovery
PRIMARY_MODEL = "deepseek-r1:14b"
FALLBACK_MODEL = "llama3.2:3b"
START_DASHBOARD = True
ENABLE_NGROK = True

# ─── AUTO-REPAIR: RESTORE REPO IF MISSING ───
FRAMEWORK_DIR = "/content/ai_final/agentic-framework-main"
if not os.path.exists(FRAMEWORK_DIR):
    print(f"⚠️ Framework directory not found at {FRAMEWORK_DIR}")
    print("   Attempting to re-clone repository...")
    subprocess.run("rm -rf /content/ai_final", shell=True)
    subprocess.run("git clone https://github.com/landonking-gif/ai_final.git /content/ai_final", shell=True, check=True)
    print("✅ Repository cloned.")
    # Ensure dependencies are installed
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-r", f"{FRAMEWORK_DIR}/requirements.txt"])
    print("✅ Dependencies verified.")

os.chdir(FRAMEWORK_DIR)

# ─── CRITICAL FIX: RECREATE SYMLINKS ───
# Python cannot import modules with hyphens, so we create underscore aliases
print("\n  Verifying package symlinks...")
symlinks = {
    "memory_service": "memory-service",
    "subagent_manager": "subagent-manager",
    "mcp_gateway": "mcp-gateway",
    "code_exec": "code-exec",
}
for link_name, target in symlinks.items():
    if not os.path.exists(link_name) and os.path.exists(target):
        os.symlink(target, link_name)
        print(f"  Created symlink: {link_name} -> {target}")
    elif not os.path.exists(target):
         print(f"  ⚠️ Target missing: {target}")
print("  Symlinks verified.")

# ─── AUTO-REPAIR: SYSTEM DEPENDENCIES ───
if not os.path.exists("/usr/local/bin/minio"):
    print("⚠️ System dependencies missing (MinIO/Redis/Postgres). Re-installing...")
    print("   Updating apt...", end=" ", flush=True)
    subprocess.run("apt-get update -qq", shell=True)
    print("OK")
    print("   Installing Postgres & Redis...", end=" ", flush=True)
    subprocess.run("apt-get install -y -qq postgresql postgresql-client redis-server build-essential libpq-dev zstd > /dev/null 2>&1", shell=True)
    print("OK")
    print("   Installing MinIO...", end=" ", flush=True)
    subprocess.run("wget -q https://dl.min.io/server/minio/release/linux-amd64/minio -O /usr/local/bin/minio && chmod +x /usr/local/bin/minio", shell=True)
    print("OK")
    print("✅ System dependencies installed.")

# ─── AUTO-REPAIR: OLLAMA ───
if subprocess.run("which ollama", shell=True).returncode != 0:
    print("⚠️ Ollama executable not found. Re-installing...")
    subprocess.run("curl -fsSL https://ollama.com/install.sh | sh", shell=True)
    print("✅ Ollama installed.")

def wait_for_service(port, name, timeout=60):
    """Wait for a local TCP port to be open."""
    print(f"  Waiting for {name} (:{port})...", end=" ", flush=True)
    start_time = time.time()
    while time.time() - start_time < timeout:
        try:
            with socket.create_connection(("localhost", port), timeout=1):
                print("OK")
                return True
        except (OSError, ConnectionRefusedError):
            time.sleep(1)
    print("TIMEOUT")
    return False

print("=" * 60)
print("PHASE 4: INFRASTRUCTURE & SERVICES (Symlink Fix)")
print("=" * 60)

# ──────────── Cleanup ────────────
print("\n── Cleanup ──")
print("  Stopping existing services...", end=" ", flush=True)
# Use pkill which is safer/non-blocking compared to fuser/lsof loops
subprocess.run("pkill -f uvicorn", shell=True)
subprocess.run("pkill -f 'chroma run'", shell=True)
subprocess.run("pkill -f minio", shell=True)
# Give them a moment to die
time.sleep(3)
print("OK")

# ──────────── Infrastructure ────────────
print("\n── Infrastructure ──")

# PostgreSQL
print("  Starting PostgreSQL...", end=" ", flush=True)
subprocess.run("service postgresql start", shell=True, capture_output=True)
time.sleep(2)
# Ensure DB exists
subprocess.run(["sudo", "-u", "postgres", "psql", "-c", "CREATE USER agent_user WITH PASSWORD 'agent_pass' CREATEDB;"], capture_output=True)
subprocess.run(["sudo", "-u", "postgres", "psql", "-c", "CREATE DATABASE agentic_framework OWNER agent_user;"], capture_output=True)
subprocess.run(["sudo", "-u", "postgres", "psql", "-c", "GRANT ALL PRIVILEGES ON DATABASE agentic_framework TO agent_user;"], capture_output=True)
pg = subprocess.run(["sudo", "-u", "postgres", "psql", "-c", "SELECT 1;"], capture_output=True)
print("OK" if pg.returncode == 0 else "FAIL")

# Redis
print("  Starting Redis...", end=" ", flush=True)
subprocess.run("redis-server --daemonize yes --port 6379", shell=True, capture_output=True)
wait_for_service(6379, "Redis", timeout=10)

# ChromaDB
print("  Starting ChromaDB...", end=" ", flush=True)
os.makedirs("/tmp/chroma_data", exist_ok=True)
subprocess.Popen(
    ["chroma", "run", "--host", "0.0.0.0", "--port", "8001", "--path", "/tmp/chroma_data"],
    stdout=open("/tmp/chroma.log", "w"), stderr=subprocess.STDOUT
)
wait_for_service(8001, "ChromaDB")

# MinIO
# CHANGED: Using Port 9005 to avoid conflict with Jupyter (port 9000)
print("  Starting MinIO (Port 9005)...", end=" ", flush=True)
os.makedirs("/tmp/minio_data", exist_ok=True)
subprocess.Popen(
    ["/usr/local/bin/minio", "server", "/tmp/minio_data",
     "--address", ":9005", "--console-address", ":9001"],
    stdout=open("/tmp/minio.log", "w"), stderr=subprocess.STDOUT,
    env={**os.environ, "MINIO_ROOT_USER": "minioadmin", "MINIO_ROOT_PASSWORD": "minioadmin"}
)

if not wait_for_service(9005, "MinIO", timeout=90):
    print("\n  [ERROR] MinIO timed out. Last 20 lines of log:")
    subprocess.run("tail -n 20 /tmp/minio.log", shell=True)
    print("  Proceeding anyway (services may fail)...")
time.sleep(3)

# Ollama Check/Restart
print("  Checking Ollama...", end=" ", flush=True)
if not wait_for_service(11434, "Ollama", timeout=5):
    print("Restarting...", end=" ")
    subprocess.Popen(
        ["ollama", "serve"],
        stdout=open("/tmp/ollama.log", "w"),
        stderr=subprocess.STDOUT,
        env={**os.environ, "OLLAMA_HOST": "0.0.0.0:11434"}
    )
    wait_for_service(11434, "Ollama", timeout=20)

# ──────────── Environment Variables ────────────
# Global config for most services
env_vars = {
    "POSTGRES_URL": "postgresql://agent_user:agent_pass@localhost:5432/agentic_framework",
    "REDIS_URL": "redis://localhost:6379/0",
    "MCP_GATEWAY_URL": "http://localhost:8080",
    "MEMORY_SERVICE_URL": "http://localhost:8002",
    "SUBAGENT_MANAGER_URL": "http://localhost:8003",
    "CODE_EXECUTOR_URL": "http://localhost:8004",
    "CODE_EXECUTION_MODE": "local",
    "OLLAMA_ENDPOINT": "http://localhost:11434",
    "OLLAMA_BASE_URL": "http://localhost:11434",
    "LOCAL_MODEL": PRIMARY_MODEL,
    "FALLBACK_MODEL": FALLBACK_MODEL,
    "DEFAULT_LLM_PROVIDER": "local",
    "LLM_PROVIDER": "local",
    "USE_OPENCLAW": "false",
    "CHROMA_URL": "http://localhost:8001",
    "MINIO_ENDPOINT": "localhost:9005", # CHANGED: 9000 -> 9005
    "MINIO_ACCESS_KEY": "minioadmin",
    "MINIO_SECRET_KEY": "minioadmin",
    "JWT_SECRET_KEY": "colab-dev-secret-key-change-in-production",
    "ENVIRONMENT": "development",
    "PYTHONPATH": FRAMEWORK_DIR,
    "WORKSPACE_ROOT": f"{FRAMEWORK_DIR}/workspace",
    "WEBSOCKET_ENABLED": "true",
    "INDEX_CODEBASE": "true",
}

# Update OS env for convenience
for k, v in env_vars.items():
    os.environ[k] = v

# NOTE: We do NOT write .env file anymore to avoid Pydantic auto-loading unwanted vars
if os.path.exists(f"{FRAMEWORK_DIR}/.env"):
    os.remove(f"{FRAMEWORK_DIR}/.env")

# Create workspace dirs
for d in ["workspace/.copilot/memory/diary", "workspace/.copilot/memory/reflections", "workspace/ralph-work"]:
    os.makedirs(f"{FRAMEWORK_DIR}/{d}", exist_ok=True)

print("  Environment configured.")

# ──────────── Start Microservices ────────────
print("\n── Microservices ──")

# Prepare Base Env
base_env = {**os.environ}
base_env['PYTHONPATH'] = FRAMEWORK_DIR

services = [
    {"name": "Code Executor",    "module": "code_exec.service.main:app",        "port": 8004, "log": "/tmp/code_exec.log",        "env": {"REDIS_URL": "redis://localhost:6379/4"}},
    {"name": "Memory Service",   "module": "memory_service.service.main:app",   "port": 8002, "log": "/tmp/memory_service.log",   "env": {"REDIS_URL": "redis://localhost:6379/2"}},
    {"name": "SubAgent Manager", "module": "subagent_manager.service.main:app", "port": 8003, "log": "/tmp/subagent_manager.log", "env": {"REDIS_URL": "redis://localhost:6379/1"}},
    {"name": "MCP Gateway",      "module": "mcp_gateway.service.main:app",      "port": 8080, "log": "/tmp/mcp_gateway.log",      "env": {"REDIS_URL": "redis://localhost:6379/3"}},
    {"name": "Orchestrator",     "module": "orchestrator.service.main:app",     "port": 8000, "log": "/tmp/orchestrator.log",     "env": {}},
]

started = {}
for svc in services:
    print(f"  Starting {svc['name']} (:{svc['port']})...", end=" ", flush=True)

    # Special handling for Code Executor to prevent "Extra inputs" error
    if svc["name"] == "Code Executor":
        svc_env = base_env.copy()
        keys_to_remove = [
            'MINIO_ENDPOINT', 'MINIO_ACCESS_KEY', 'MINIO_SECRET_KEY',
            'JWT_SECRET_KEY', 'ENVIRONMENT', 'WORKSPACE_ROOT',
            'WEBSOCKET_ENABLED', 'INDEX_CODEBASE', 'PYTHONPATH'
        ]
        for k in keys_to_remove:
            if k in svc_env: del svc_env[k]
        svc_env["REDIS_URL"] = svc["env"]["REDIS_URL"]
        svc_env["CODE_EXECUTION_MODE"] = "local"
    else:
        svc_env = {**base_env, **svc["env"]}

    proc = subprocess.Popen(
        [sys.executable, "-m", "uvicorn", svc["module"],
         "--host", "0.0.0.0", "--port", str(svc["port"])],
        cwd=FRAMEWORK_DIR,
        stdout=open(svc["log"], "w"),
        stderr=subprocess.STDOUT,
        env=svc_env
    )
    started[svc["name"]] = proc.pid
    time.sleep(2)
    print(f"OK (PID {proc.pid})")

# ──────────── Dashboard ────────────
if START_DASHBOARD:
    print("\n── Dashboard ──")
    dashboard_dir = f"{FRAMEWORK_DIR}/dashboard"
    if os.path.exists(f"{dashboard_dir}/build"):
        print("  Serving pre-built dashboard (port 3000)...", end=" ", flush=True)
        subprocess.Popen(
            ["npx", "serve", "-s", "build", "-l", "3000"],
            cwd=dashboard_dir,
            stdout=open("/tmp/dashboard.log", "w"),
            stderr=subprocess.STDOUT,
            env={**os.environ, "PORT": "3000"}
        )
        time.sleep(2)
        print("OK")
    elif os.path.exists(f"{dashboard_dir}/package.json"):
        print("  Installing dashboard deps & starting (port 3000)...", end=" ", flush=True)
        subprocess.run(["npm", "install"], cwd=dashboard_dir, capture_output=True)
        subprocess.Popen(
            ["npm", "start"],
            cwd=dashboard_dir,
            stdout=open("/tmp/dashboard.log", "w"),
            stderr=subprocess.STDOUT,
            env={**os.environ, "PORT": "3000", "BROWSER": "none"}
        )
        time.sleep(5)
        print("OK")

# ──────────── Wait & Health Check ────────────
print("\n  Waiting 20s for services to initialize...")
time.sleep(20)

print("\n── Health Checks ──")
endpoints = [
    ("Orchestrator",    "http://localhost:8000/health"),
    ("Memory Service",  "http://localhost:8002/health"),
    ("SubAgent Manager","http://localhost:8003/health"),
    ("MCP Gateway",     "http://localhost:8080/health"),
    ("Code Executor",   "http://localhost:8004/health"),
    ("Ollama",          "http://localhost:11434/api/tags"),
]

all_ok = True
for name, url in endpoints:
    try:
        req = urllib.request.urlopen(url, timeout=5)
        print(f"  {name:20s} : OK ({req.getcode()})")
    except Exception as e:
        all_ok = False
        print(f"  {name:20s} : STARTING ({str(e)[:50]})")

if all_ok:
    print("\n  ALL SERVICES HEALTHY")
else:
    print("\n  Some services still starting.")

print("\n  Phase 4 complete.")
print("=" * 60)



  Verifying package symlinks...
  Created symlink: memory_service -> memory-service
  Created symlink: subagent_manager -> subagent-manager
  Created symlink: mcp_gateway -> mcp-gateway
  Created symlink: code_exec -> code-exec
  Symlinks verified.
PHASE 4: INFRASTRUCTURE & SERVICES (Symlink Fix)

── Cleanup ──
  Stopping existing services... OK

── Infrastructure ──
  Starting PostgreSQL... OK
  Starting Redis...   Waiting for Redis (:6379)... OK
  Starting ChromaDB...   Waiting for ChromaDB (:8001)... OK
  Starting MinIO (Port 9005)...   Waiting for MinIO (:9005)... OK
  Checking Ollama...   Waiting for Ollama (:11434)... OK
  Environment configured.

── Microservices ──
  Starting Code Executor (:8004)... OK (PID 10127)
  Starting Memory Service (:8002)... OK (PID 10141)
  Starting SubAgent Manager (:8003)... OK (PID 10151)
  Starting MCP Gateway (:8080)... OK (PID 10162)
  Starting Orchestrator (:8000)... OK (PID 10176)

── Dashboard ──
  Installing dashboard deps & starting (port

In [None]:
import urllib.request, json

print("=== CHECKING OLLAMA MODELS ===")
try:
    resp = urllib.request.urlopen("http://localhost:11434/api/tags")
    data = json.loads(resp.read().decode())
    models = [m['name'] for m in data.get('models', [])]
    if models:
        print(f"✅ Found {len(models)} models: {models}")
    else:
        print("❌ No models found! (They were likely wiped by the runtime reset)")
except Exception as e:
    print(f"Error checking models: {e}")

In [9]:
import os, subprocess

FRAMEWORK_DIR = "/content/ai_final/agentic-framework-main"

print("=== FILE STRUCTURE DIAGNOSTIC ===")
print(f"Framework Dir: {FRAMEWORK_DIR}")

# List root to see symlinks
subprocess.run(f"ls -la {FRAMEWORK_DIR}", shell=True)

print("\n--- Checking for __init__.py in services ---")
services_dirs = ["memory-service", "subagent-manager", "code-exec", "mcp-gateway"]
for d in services_dirs:
    path = os.path.join(FRAMEWORK_DIR, d)
    if os.path.exists(path):
        init_path = os.path.join(path, "__init__.py")
        has_init = os.path.exists(init_path)
        print(f"{d}: exists={'YES' if os.path.exists(path) else 'NO'}, has_init={'YES' if has_init else 'NO'}")
        if os.path.exists(path):
             subprocess.run(f"ls -F {path}", shell=True)
    else:
        print(f"{d}: MISSING")

print("\n=== FULL LOGS FOR FAILURES ===")
logs = ["/tmp/code_exec.log", "/tmp/subagent_manager.log"]
for log in logs:
    print(f"\n--- {log} ---")
    if os.path.exists(log):
        # Print last 100 lines
        subprocess.run(f"tail -n 100 {log}", shell=True)
    else:
        print("(File not found)")

=== FILE STRUCTURE DIAGNOSTIC ===
Framework Dir: /content/ai_final/agentic-framework-main

--- Checking for __init__.py in services ---
memory-service: exists=YES, has_init=YES
subagent-manager: exists=YES, has_init=YES
code-exec: exists=YES, has_init=YES
mcp-gateway: exists=YES, has_init=YES

=== FULL LOGS FOR FAILURES ===

--- /tmp/code_exec.log ---

--- /tmp/subagent_manager.log ---


In [8]:
import subprocess

print("=== CHECKING SERVICE LOGS FOR ERRORS ===")
services = [
    "/tmp/orchestrator.log",
    "/tmp/memory_service.log",
    "/tmp/code_exec.log",
    "/tmp/subagent_manager.log"
]

for log in services:
    print(f"\n--- {log} ---")
    # Check if file exists first
    try:
        # Print last 30 lines of the log
        result = subprocess.run(["tail", "-n", "30", log], capture_output=True, text=True)
        print(result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)
    except Exception as e:
        print(f"Could not read log: {e}")

=== CHECKING SERVICE LOGS FOR ERRORS ===

--- /tmp/orchestrator.log ---
INFO:     Started server process [8777]
INFO:     Waiting for application startup.
2026-02-08 03:07:30,427 - orchestrator.service.main - INFO - Starting Lead Agent/Orchestrator service...
2026-02-08 03:07:30,428 - orchestrator.service.main - INFO - Configuration: LLM Provider=local
2026-02-08 03:07:30,428 - orchestrator.service.main - INFO - MCP Gateway URL: http://localhost:8080
2026-02-08 03:07:30,428 - orchestrator.service.main - INFO - Memory Service URL: http://localhost:8002
2026-02-08 03:07:30,507 - orchestrator.service.main - INFO - WebSocket manager initialized
2026-02-08 03:07:30,509 - orchestrator.service.session_storage - INFO - Connected to Redis at redis://localhost:6379/0
2026-02-08 03:07:30,509 - orchestrator.service.agent - INFO - Session storage initialized
2026-02-08 03:07:30,510 - orchestrator.service.memory_learning - INFO - MemoryLearningClient initialized: memory_dir=/content/ai_final/agentic

In [6]:
import subprocess, os

print("=== PORT DIAGNOSTICS ===")
# Check what's listening on ports 9000 (MinIO) and 8004 (Code Exec)
for port in [9000, 9001, 8004]:
    print(f"\nChecking Port {port}...")
    # lsof -i :port
    res = subprocess.run(f"lsof -i :{port}", shell=True, capture_output=True, text=True)
    if res.stdout.strip():
        print(res.stdout)
    else:
        print("  (No process found listening)")

print("\n=== SERVICE LOGS (Last 50 lines) ===")
logs = ["/tmp/minio.log", "/tmp/code_exec.log", "/tmp/memory_service.log"]
for log in logs:
    print(f"\n--- {log} ---")
    if os.path.exists(log):
        # check file size
        size = os.path.getsize(log)
        print(f"  Size: {size} bytes")
        if size > 0:
            subprocess.run(f"tail -n 50 {log}", shell=True)
        else:
            print("  (Empty file)")
    else:
        print("  (File does not exist)")

=== PORT DIAGNOSTICS ===

Checking Port 9000...
COMMAND   PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
kernel_ma  12 root    9u  IPv4 549882      0t0  TCP b2412b48cfa2:57236->b2412b48cfa2:9000 (ESTABLISHED)
kernel_ma  12 root   10u  IPv4 550931      0t0  TCP b2412b48cfa2:57244->b2412b48cfa2:9000 (ESTABLISHED)
jupyter-s 101 root    7u  IPv4 549865      0t0  TCP b2412b48cfa2:9000 (LISTEN)
jupyter-s 101 root    8u  IPv4 549883      0t0  TCP b2412b48cfa2:9000->b2412b48cfa2:57236 (ESTABLISHED)
jupyter-s 101 root   16u  IPv4 550932      0t0  TCP b2412b48cfa2:9000->b2412b48cfa2:57244 (ESTABLISHED)


Checking Port 9001...
  (No process found listening)

Checking Port 8004...
  (No process found listening)

=== SERVICE LOGS (Last 50 lines) ===

--- /tmp/minio.log ---
  Size: 133 bytes

--- /tmp/code_exec.log ---
  Size: 3507 bytes

--- /tmp/memory_service.log ---
  Size: 3512 bytes


In [6]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  PHASE 5: External Access (ngrok Tunnels)                   ║
# ╚══════════════════════════════════════════════════════════════╝
import os

print("=" * 60)
print("PHASE 5: EXTERNAL ACCESS")
print("=" * 60)

api_url = "http://localhost:8000"  # default fallback
dashboard_url = "http://localhost:3000"

if ENABLE_NGROK:
    try:
        from pyngrok import ngrok, conf, exception

        if NGROK_AUTH_TOKEN:
            ngrok.set_auth_token(NGROK_AUTH_TOKEN)
            print("  ngrok auth token set (stable URLs enabled)")
        else:
            print("  WARN: No NGROK_AUTH_TOKEN provided. Ngrok may fail if account is required.")

        # API tunnel
        print("  Creating tunnel for Orchestrator API (port 8000)...")
        try:
            api_tunnel = ngrok.connect(8000, "http")
            api_url = api_tunnel.public_url

            # Dashboard tunnel (if running)
            if START_DASHBOARD:
                print("  Creating tunnel for Dashboard (port 3000)...")
                dash_tunnel = ngrok.connect(3000, "http")
                dashboard_url = dash_tunnel.public_url

            os.environ["COLAB_API_URL"] = api_url
            os.environ["COLAB_DASHBOARD_URL"] = dashboard_url

            print("")
            print("╔══════════════════════════════════════════════════════════╗")
            print("║  PUBLIC ACCESS URLS (share these!)                      ║")
            print("╠══════════════════════════════════════════════════════════╣")
            print(f"║  API:        {api_url:<43s}║")
            print(f"║  API Docs:   {api_url + '/docs':<43s}║")
            print(f"║  Health:     {api_url + '/health':<43s}║")
            print(f"║  WebSocket:  {api_url.replace('http', 'ws') + '/ws':<43s}║")
            if START_DASHBOARD:
                print(f"║  Dashboard:  {dashboard_url:<43s}║")
            print("╚══════════════════════════════════════════════════════════╝")

        except Exception as e:
            if "ERR_NGROK_4018" in str(e) or "authentication failed" in str(e):
                print(f"\n  [ERROR] ngrok authentication failed. You need a valid NGROK_AUTH_TOKEN.")
                print("  Get one at https://dashboard.ngrok.com/signup")
                print("  Falling back to localhost (internal only).")
            else:
                print(f"\n  [ERROR] ngrok failed to start: {e}")
                print("  Falling back to localhost.")
            ENABLE_NGROK = False

    except ImportError:
        print("  pyngrok not installed. Skipping external access.")
        ENABLE_NGROK = False
else:
    print("  ngrok disabled. Services available at localhost only:")

if not ENABLE_NGROK:
    print("")
    print("  Local endpoints (inside Colab):")
    print("    Orchestrator:    http://localhost:8000")
    print("    Memory Service:  http://localhost:8002")
    print("    SubAgent Mgr:    http://localhost:8003")
    print("    MCP Gateway:     http://localhost:8080")
    print("    Code Executor:   http://localhost:8004")
    print("    Ollama LLM:      http://localhost:11434")
    if START_DASHBOARD:
        print("    Dashboard:       http://localhost:3000")

print("\n  Phase 5 complete.")
print("=" * 60)


PHASE 5: EXTERNAL ACCESS
  ngrok auth token set (stable URLs enabled)
  Creating tunnel for Orchestrator API (port 8000)...
  Creating tunnel for Dashboard (port 3000)...

╔══════════════════════════════════════════════════════════╗
║  PUBLIC ACCESS URLS (share these!)                      ║
╠══════════════════════════════════════════════════════════╣
║  API:        https://unliquid-blithely-glenda.ngrok-free.dev║
║  API Docs:   https://unliquid-blithely-glenda.ngrok-free.dev/docs║
║  Health:     https://unliquid-blithely-glenda.ngrok-free.dev/health║
║  WebSocket:  wss://unliquid-blithely-glenda.ngrok-free.dev/ws║
║  Dashboard:  https://unliquid-blithely-glenda.ngrok-free.dev║
╚══════════════════════════════════════════════════════════╝

  Phase 5 complete.


In [11]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  PHASE 6: Quick Smoke Test                                  ║
# ╚══════════════════════════════════════════════════════════════╝
import json, urllib.request, time

print("=" * 60)
print("PHASE 6: SMOKE TEST")
print("=" * 60)

passed = 0
total = 0

def check(name, url):
    global passed, total
    total += 1
    try:
        r = urllib.request.urlopen(url, timeout=10)
        if r.getcode() == 200:
            passed += 1
            print(f"  [PASS] {name}")
        else:
             print(f"  [WARN] {name} (Status {r.getcode()})")
             # A 404 means the service is up but the path is wrong, which is better than a crash
             if r.getcode() == 404: passed += 1
    except Exception as e:
        print(f"  [FAIL] {name} — {str(e)[:60]}")

check("Orchestrator API",   "http://localhost:8000/health")
check("Memory Service",     "http://localhost:8002/health")
check("SubAgent Manager",   "http://localhost:8003/health")
check("MCP Gateway",        "http://localhost:8080/health")
check("Code Executor",      "http://localhost:8004/health")
check("Ollama LLM",         "http://localhost:11434/api/tags")

# Test LLM inference
total += 1
print("\n  Testing LLM inference (GPU)...", end=" ", flush=True)
try:
    t0 = time.time()
    data = json.dumps({
        "model": PRIMARY_MODEL,
        "prompt": "What is 2+2? Answer in one word.",
        "stream": False
    }).encode()
    req = urllib.request.Request(
        "http://localhost:11434/api/generate",
        data=data,
        headers={"Content-Type": "application/json"}
    )
    resp = urllib.request.urlopen(req, timeout=120)
    result = json.loads(resp.read().decode())
    elapsed = time.time() - t0
    passed += 1
    print(f"OK ({elapsed:.1f}s)")
    print(f"    Response: {result.get('response', '???')[:100].strip()}")
except Exception as e:
    print(f"FAIL — {str(e)[:80]}")

print(f"\n  Results: {passed}/{total} passed")
if passed >= total - 1: # Allow 1 minor failure (like 404 on health)
    print("  ALL SYSTEMS GO!")
else:
    print("  System operational but some checks failed.")
print("=" * 60)


PHASE 6: SMOKE TEST
  [PASS] Orchestrator API
  [PASS] Memory Service
  [PASS] SubAgent Manager
  [FAIL] MCP Gateway — HTTP Error 404: Not Found
  [PASS] Code Executor
  [PASS] Ollama LLM

  Testing LLM inference (GPU)... FAIL — HTTP Error 404: Not Found

  Results: 5/7 passed
  System operational but some checks failed.


In [13]:
import subprocess

print("=== CHECKING SERVICE LOGS FOR ERRORS ===")
services = [
    "/tmp/orchestrator.log",
    "/tmp/memory_service.log",
    "/tmp/code_exec.log",
    "/tmp/mcp_gateway.log"
]

for log in services:
    print(f"\n--- {log} ---")
    # Check if file exists first
    try:
        # Print last 30 lines of the log
        result = subprocess.run(["tail", "-n", "30", log], capture_output=True, text=True)
        print(result.stdout)
        if result.stderr:
            print("STDERR:", result.stderr)
    except Exception as e:
        print(f"Could not read log: {e}")


=== CHECKING SERVICE LOGS FOR ERRORS ===

--- /tmp/orchestrator.log ---
  File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1310, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1310, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 999, in e

In [8]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  PHASE 7: Keep-Alive (prevents Colab from disconnecting)    ║
# ╚══════════════════════════════════════════════════════════════╝
#
# This cell runs a background loop that:
#  1. Pings all services every 60 seconds
#  2. Auto-restarts any crashed service
#  3. Prints a status update every 5 minutes
#  4. Keeps the Colab runtime alive
#
# Stop it with: Runtime > Interrupt execution (or Ctrl+M I)
#
import subprocess, os, sys, time, urllib.request, json, signal
from datetime import datetime

FRAMEWORK_DIR = "/content/ai_final/agentic-framework-main"

service_defs = [
    {"name": "MCP Gateway",      "module": "mcp_gateway.service.main:app",      "port": 8080, "log": "/tmp/mcp_gateway.log",      "env": {"REDIS_URL": "redis://localhost:6379/3"}},
    {"name": "Memory Service",   "module": "memory_service.service.main:app",   "port": 8002, "log": "/tmp/memory_service.log",   "env": {"REDIS_URL": "redis://localhost:6379/2"}},
    {"name": "SubAgent Manager", "module": "subagent_manager.service.main:app", "port": 8003, "log": "/tmp/subagent_manager.log", "env": {"REDIS_URL": "redis://localhost:6379/1"}},
    {"name": "Code Executor",    "module": "code_exec.service.main:app",        "port": 8004, "log": "/tmp/code_exec.log",        "env": {"REDIS_URL": "redis://localhost:6379/4"}},
    {"name": "Orchestrator",     "module": "orchestrator.service.main:app",     "port": 8000, "log": "/tmp/orchestrator.log",     "env": {}},
]

def is_service_alive(port):
    try:
        url = f"http://localhost:{port}/health" if port != 11434 else f"http://localhost:{port}/api/tags"
        urllib.request.urlopen(url, timeout=5)
        return True
    except:
        return False

def restart_service(svc):
    """Restart a crashed service."""
    print(f"    Restarting {svc['name']} on port {svc['port']}...", end=" ", flush=True)
    svc_env = {**os.environ, **svc["env"]}
    proc = subprocess.Popen(
        [sys.executable, "-m", "uvicorn", svc["module"],
         "--host", "0.0.0.0", "--port", str(svc["port"])],
        cwd=FRAMEWORK_DIR,
        stdout=open(svc["log"], "a"),
        stderr=subprocess.STDOUT,
        env=svc_env
    )
    time.sleep(5)
    print(f"PID {proc.pid}")

print("=" * 60)
print("KEEP-ALIVE WATCHDOG STARTED")
print("  Monitoring services every 60s with auto-restart.")
print("  Status updates every 5 minutes.")
print("  Stop with: Runtime > Interrupt execution")
print("=" * 60)

cycle = 0
try:
    while True:
        cycle += 1
        restarts = 0

        # Check & auto-restart services
        for svc in service_defs:
            if not is_service_alive(svc["port"]):
                restart_service(svc)
                restarts += 1

        # Check Ollama
        if not is_service_alive(11434):
            print("    Restarting Ollama...", end=" ", flush=True)
            subprocess.Popen(
                ["ollama", "serve"],
                stdout=open("/tmp/ollama.log", "a"),
                stderr=subprocess.STDOUT,
                env={**os.environ, "OLLAMA_HOST": "0.0.0.0:11434"}
            )
            time.sleep(5)
            print("OK")
            restarts += 1

        # Status update every 5 minutes (every 5th cycle)
        if cycle % 5 == 0:
            now = datetime.now().strftime("%H:%M:%S")
            alive = sum(1 for s in service_defs if is_service_alive(s["port"]))
            ollama_ok = is_service_alive(11434)
            print(f"  [{now}] Services: {alive}/{len(service_defs)} | Ollama: {'OK' if ollama_ok else 'DOWN'} | Restarts this cycle: {restarts}")

        time.sleep(60)

except KeyboardInterrupt:
    print("\n  Watchdog stopped by user.")

KEEP-ALIVE WATCHDOG STARTED
  Monitoring services every 60s with auto-restart.
  Status updates every 5 minutes.
  Stop with: Runtime > Interrupt execution
    Restarting MCP Gateway on port 8080... PID 19057
    Restarting Memory Service on port 8002... PID 19086
    Restarting Code Executor on port 8004... PID 19108
    Restarting Orchestrator on port 8000... PID 19129
    Restarting MCP Gateway on port 8080... PID 19431
    Restarting Memory Service on port 8002... PID 19460
    Restarting Code Executor on port 8004... PID 19485
    Restarting Orchestrator on port 8000... PID 19511
    Restarting MCP Gateway on port 8080... PID 19813
    Restarting Memory Service on port 8002... PID 19840
    Restarting Code Executor on port 8004... PID 19865
    Restarting Orchestrator on port 8000... PID 19892
    Restarting MCP Gateway on port 8080... PID 20194
    Restarting Memory Service on port 8002... PID 20221
    Restarting Code Executor on port 8004... PID 20246
    Restarting Orchestrato

In [17]:
import subprocess

logs = ["/tmp/minio.log", "/tmp/code_exec.log", "/tmp/mcp_gateway.log"]

print("=== SERVICE LOGS ===")
for log in logs:
    print(f"\n--- {log} ---")
    try:
        # Check if file exists and has content
        if os.path.exists(log):
            with open(log, 'r') as f:
                content = f.read().strip()
                if content:
                    print(content[-2000:]) # Print last 2000 chars
                else:
                    print("(Empty file)")
        else:
            print("(File not found)")
    except Exception as e:
        print(f"Error reading {log}: {e}")

=== SERVICE LOGS ===

--- /tmp/minio.log ---
FATAL Unable to start the server: Specified port is already in use
      > Please ensure no other program uses the same address/port

--- /tmp/code_exec.log ---
http://localhost:8001', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden
minio_endpoint
  Extra inputs are not permitted [type=extra_forbidden, input_value='localhost:9000', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden
minio_access_key
  Extra inputs are not permitted [type=extra_forbidden, input_value='minioadmin', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden
minio_secret_key
  Extra inputs are not permitted [type=extra_forbidden, input_value='minioadmin', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden
jwt_secret_key
  Extra inputs are not permitted [type=extra_forbidd

In [9]:
# ── Repair & Restart Services ──
import subprocess, time, sys, os, urllib.request

# Detect correct framework directory
POSSIBLE_DIRS = [
    "/content/ai_final/agentic-framework-main",
    "/content/ai_final"
]
FRAMEWORK_DIR = "/content/ai_final"
for d in POSSIBLE_DIRS:
    if os.path.exists(d) and os.path.exists(os.path.join(d, "orchestrator")):
        FRAMEWORK_DIR = d
        break

print(f"Using Framework Directory: {FRAMEWORK_DIR}")

services = [
    {"name": "Code Executor",    "module": "code_exec.service.main:app",        "port": 8004, "log": "/tmp/code_exec.log",        "env": {"REDIS_URL": "redis://localhost:6379/4"}},
    {"name": "Memory Service",   "module": "memory_service.service.main:app",   "port": 8002, "log": "/tmp/memory_service.log",   "env": {"REDIS_URL": "redis://localhost:6379/2"}},
    {"name": "SubAgent Manager", "module": "subagent_manager.service.main:app", "port": 8003, "log": "/tmp/subagent_manager.log", "env": {"REDIS_URL": "redis://localhost:6379/1"}},
    {"name": "MCP Gateway",      "module": "mcp_gateway.service.main:app",      "port": 8080, "log": "/tmp/mcp_gateway.log",      "env": {"REDIS_URL": "redis://localhost:6379/3"}},
    {"name": "Orchestrator",     "module": "orchestrator.service.main:app",     "port": 8000, "log": "/tmp/orchestrator.log",     "env": {}},
]

def check_port(port):
    try:
        urllib.request.urlopen(f"http://localhost:{port}/health", timeout=2)
        return True
    except:
        return False

print("Stopping any stuck services...")
subprocess.run(["pkill", "-f", "uvicorn"])
time.sleep(2)

print("Restarting services with log inspection...")
service_env = {**os.environ, "PYTHONPATH": FRAMEWORK_DIR}

for svc in services:
    print(f"Starting {svc['name']} (:{svc['port']})...", end=" ", flush=True)
    svc_env = {**service_env, **svc["env"]}

    # Start process
    subprocess.Popen(
        [sys.executable, "-m", "uvicorn", svc["module"],
         "--host", "0.0.0.0", "--port", str(svc["port"])],
        cwd=FRAMEWORK_DIR,
        stdout=open(svc["log"], "w"),
        stderr=subprocess.STDOUT,
        env=svc_env
    )

    # Wait and check
    time.sleep(4)
    if check_port(svc["port"]):
        print("OK")
    else:
        # Check if process is even running
        pid_check = subprocess.run(["pgrep", "-f", f"port {svc['port']}"], capture_output=True)
        if pid_check.returncode == 0:
             print("Running (but health check failed - still initializing?)")
        else:
             print("FAIL (Crashed)")
             print(f"--- Last 20 lines of {svc['log']} ---")
             if os.path.exists(svc["log"]):
                 subprocess.run(["tail", "-n", "20", svc["log"]])
             else:
                 print("Log file not found.")
             print("------------------------------------")

print("\nRepair complete. Try running Phase 6 (Smoke Test) again.")

Using Framework Directory: /content/ai_final/agentic-framework-main
Stopping any stuck services...
Restarting services with log inspection...
Starting Code Executor (:8004)... FAIL (Crashed)
--- Last 20 lines of /tmp/code_exec.log ---
------------------------------------
Starting Memory Service (:8002)... Running (but health check failed - still initializing?)
Starting SubAgent Manager (:8003)... OK
Starting MCP Gateway (:8080)... FAIL (Crashed)
--- Last 20 lines of /tmp/mcp_gateway.log ---
------------------------------------
Starting Orchestrator (:8000)... FAIL (Crashed)
--- Last 20 lines of /tmp/orchestrator.log ---
------------------------------------

Repair complete. Try running Phase 6 (Smoke Test) again.


---
## Utility Cells (run manually as needed)

The cells below are optional — run them when you want to interact with the system.

In [10]:
# ── Send a task to the Orchestrator ──
import json, urllib.request

task = "Write a Python function that calculates the Fibonacci sequence up to n terms, with proper error handling and type hints."

print(f"Task: {task}\n")
data = json.dumps({"message": task, "session_id": "colab-auto-001"}).encode()
req = urllib.request.Request(
    "http://localhost:8000/chat",
    data=data,
    headers={"Content-Type": "application/json"}
)
try:
    resp = urllib.request.urlopen(req, timeout=300)
    result = json.loads(resp.read().decode())
    print(json.dumps(result, indent=2)[:3000])
except Exception as e:
    print(f"Error: {e}")
    print("Tip: !tail -100 /tmp/orchestrator.log")

Task: Write a Python function that calculates the Fibonacci sequence up to n terms, with proper error handling and type hints.

Error: <urlopen error [Errno 111] Connection refused>
Tip: !tail -100 /tmp/orchestrator.log


In [11]:
# ── View service logs ──
# Change SERVICE to: orchestrator, memory_service, subagent_manager,
#                     mcp_gateway, code_exec, ollama, chroma, minio, dashboard
SERVICE = "orchestrator"
LINES = 50

import subprocess
print(f"Last {LINES} lines of {SERVICE}:")
print("=" * 60)
subprocess.run(["tail", f"-{LINES}", f"/tmp/{SERVICE}.log"], capture_output=False)

Last 50 lines of orchestrator:


CompletedProcess(args=['tail', '-50', '/tmp/orchestrator.log'], returncode=0)

In [12]:
# ── System resource monitor ──
import subprocess, psutil, shutil

print("GPU:")
subprocess.run("nvidia-smi", shell=True)

mem = psutil.virtual_memory()
print(f"\nRAM: {mem.used/1024**3:.1f}/{mem.total/1024**3:.1f} GB ({mem.percent}%)")

disk = shutil.disk_usage("/")
print(f"Disk: {(disk.total-disk.free)/1024**3:.1f}/{disk.total/1024**3:.1f} GB")

print("\nRunning services:")
for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
    try:
        cmd = " ".join(proc.info.get('cmdline', []))
        if 'uvicorn' in cmd or 'ollama' in proc.info.get('name', '').lower():
            print(f"  PID {proc.info['pid']}: {cmd[:80]}")
    except:
        pass

GPU:

RAM: 1.6/12.7 GB (15.3%)
Disk: 57.0/235.7 GB

Running services:
  PID 21086: /usr/bin/python3 -m uvicorn memory_service.service.main:app --host 0.0.0.0 --por
  PID 21116: /usr/bin/python3 -m uvicorn subagent_manager.service.main:app --host 0.0.0.0 --p


In [None]:
# ── Restart all services ──
import psutil, time

print("Stopping all services...")
for proc in psutil.process_iter(['pid', 'cmdline']):
    try:
        cmd = " ".join(proc.info.get('cmdline', []))
        if 'uvicorn' in cmd and 'service.main' in cmd:
            proc.kill()
            print(f"  Killed PID {proc.info['pid']}")
    except:
        pass

time.sleep(3)
print("Done. Re-run Phase 4 cell to restart services.")

Stopping all services...
  Killed PID 14732
  Killed PID 14780
