# Embedding Service PoC - Colab Setup

This notebook sets up and runs the embedding service comparison (HuggingFace vs. vLLM backends) on Colab with a T4 GPU.

**Runtime: GPU (T4)**

## 1. Check GPU

In [None]:
# Show the GPU attached to this runtime (this notebook expects a T4).
# If the command fails, switch the Colab runtime type to GPU before continuing.
!nvidia-smi

## 2. Clone Repository

In [None]:
!https://github.com/karan-mudaliar/embedding_service_poc.git
%cd embedding_service_poc

## 3. Install Dependencies

In [None]:
!pip install -q fastapi uvicorn pydantic tyro
!pip install -q langchain langchain-community
!pip install -q sentence-transformers transformers
!pip install -q torch torchvision
!pip install -q httpx numpy datasets psutil
!pip install -q structlog colorama
!pip install -q vllm

## 4. Prepare Test Dataset

In [None]:
# Run the repository's data_loader.py script to prepare the test dataset.
# NOTE(review): presumably it downloads/writes the dataset used by
# stress_test.py - confirm the output location in the script itself.
!python data_loader.py

## 5. Test HuggingFace Backend

In [None]:
# Start the HuggingFace service in the background and block until it answers
# its health endpoint, instead of sleeping for a fixed time and hoping.
import subprocess
import time
import urllib.request

HF_HEALTH_URL = "http://localhost:8000/health"
HF_STARTUP_TIMEOUT_S = 300  # upper bound; the loop returns as soon as the service answers

# Start HuggingFace service
hf_process = subprocess.Popen(['python', 'service_huggingface.py'])
print("Waiting for service to start...")

hf_deadline = time.monotonic() + HF_STARTUP_TIMEOUT_S
while True:
    # A non-None poll() means the child already exited, so waiting is pointless.
    if hf_process.poll() is not None:
        raise RuntimeError(
            f"HuggingFace service exited during startup (exit code {hf_process.returncode})"
        )
    try:
        # urlopen raises for connection errors and HTTP error statuses, so
        # reaching the break means the endpoint answered successfully.
        with urllib.request.urlopen(HF_HEALTH_URL, timeout=5):
            break
    except OSError:
        # URLError, HTTPError and socket timeouts are all OSError subclasses:
        # the service is simply not ready yet.
        pass
    if time.monotonic() > hf_deadline:
        hf_process.terminate()
        raise TimeoutError(
            f"HuggingFace service did not become healthy within {HF_STARTUP_TIMEOUT_S}s"
        )
    time.sleep(1)

print("Service is ready!")

In [None]:
# Health check for the HuggingFace service (port 8000).
import requests

# A timeout keeps the cell from hanging forever if the service is wedged.
response = requests.get("http://localhost:8000/health", timeout=10)
# Fail loudly on a non-2xx status instead of printing an error body as if healthy.
response.raise_for_status()
print(response.json())

In [None]:
# Run the stress test against the HuggingFace service on port 8000.
# Flags passed: 10-minute duration, batch size 32, at most 10 concurrent requests.
# This cell blocks until stress_test.py exits.
# NOTE(review): the comparison cell later reads stress_test_results_*.json,
# presumably written by this script - confirm.
!python stress_test.py --service-url http://localhost:8000 --duration-minutes 10 --batch-size 32 --max-concurrent-requests 10

In [None]:
# Stop HuggingFace service
import subprocess

hf_process.terminate()  # ask politely first
try:
    # Bound the wait so the notebook cannot hang on a process that ignores the signal.
    hf_process.wait(timeout=30)
except subprocess.TimeoutExpired:
    # Force-kill so the service is not left running alongside the next backend.
    hf_process.kill()
    hf_process.wait()
print("HuggingFace service stopped")

## 6. Test vLLM Backend

In [None]:
# Start the vLLM service in the background and block until it answers its
# health endpoint, instead of sleeping for a fixed time and hoping.
import subprocess
import time
import urllib.request

VLLM_HEALTH_URL = "http://localhost:8001/health"
VLLM_STARTUP_TIMEOUT_S = 600  # vLLM takes longer to initialize; loop exits as soon as it answers

# Start vLLM service
vllm_process = subprocess.Popen(['python', 'service_vllm.py'])
print("Waiting for vLLM service to start...")

vllm_deadline = time.monotonic() + VLLM_STARTUP_TIMEOUT_S
while True:
    # A non-None poll() means the child already exited, so waiting is pointless.
    if vllm_process.poll() is not None:
        raise RuntimeError(
            f"vLLM service exited during startup (exit code {vllm_process.returncode})"
        )
    try:
        # urlopen raises for connection errors and HTTP error statuses, so
        # reaching the break means the endpoint answered successfully.
        with urllib.request.urlopen(VLLM_HEALTH_URL, timeout=5):
            break
    except OSError:
        # URLError, HTTPError and socket timeouts are all OSError subclasses:
        # the service is simply not ready yet.
        pass
    if time.monotonic() > vllm_deadline:
        vllm_process.terminate()
        raise TimeoutError(
            f"vLLM service did not become healthy within {VLLM_STARTUP_TIMEOUT_S}s"
        )
    time.sleep(1)

print("Service is ready!")

In [None]:
# Health check for the vLLM service (port 8001).
import requests  # re-imported so this cell also runs on its own

# A timeout keeps the cell from hanging forever if the service is wedged.
response = requests.get("http://localhost:8001/health", timeout=10)
# Fail loudly on a non-2xx status instead of printing an error body as if healthy.
response.raise_for_status()
print(response.json())

In [None]:
# Run the stress test against the vLLM service on port 8001.
# The flags match the HuggingFace run (10-minute duration, batch size 32,
# at most 10 concurrent requests), so the two result sets are comparable.
# This cell blocks until stress_test.py exits.
!python stress_test.py --service-url http://localhost:8001 --duration-minutes 10 --batch-size 32 --max-concurrent-requests 10

In [None]:
# Stop vLLM service
import subprocess

vllm_process.terminate()  # ask politely first
try:
    # Bound the wait so the notebook cannot hang on a process that ignores the signal.
    vllm_process.wait(timeout=30)
except subprocess.TimeoutExpired:
    # Force-kill so the service does not linger after the benchmark.
    vllm_process.kill()
    vllm_process.wait()
print("vLLM service stopped")

## 7. Compare Results

In [None]:
import json
import glob

# Load all result files. `result_files` is also used by the download cell below.
# NOTE(review): sorting by name is assumed to give chronological order
# (e.g. a timestamp in the file name) - confirm against stress_test.py.
result_files = sorted(glob.glob("stress_test_results_*.json"))

# The two most recent files are labelled by position, matching the order in
# which this notebook runs the backends: HuggingFace first, then vLLM.
BACKEND_LABELS = ["HuggingFace", "vLLM"]
latest_runs = result_files[-2:]
if len(latest_runs) < len(BACKEND_LABELS):
    print(
        f"WARNING: expected {len(BACKEND_LABELS)} result files, found {len(latest_runs)}; "
        "backend labels below may be wrong."
    )

# One blank line followed by a 60-character rule. The previous form
# ("\n=" * 60) repeated the newline too and printed 60 lines of a single "=".
print("\n" + "=" * 60)
print("COMPARISON: HuggingFace vs vLLM")
print("=" * 60)

for backend, file in zip(BACKEND_LABELS, latest_runs):
    with open(file) as f:
        results = json.load(f)

    latency = results['latency_metrics']
    print(f"\n{backend} Backend:")
    print(f"  Requests/sec: {results['requests_per_second']:.2f}")
    print(f"  Success rate: {results['success_rate_percent']:.2f}%")
    print(f"  P50 latency: {latency['p50']:.4f}s")
    print(f"  P90 latency: {latency['p90']:.4f}s")
    print(f"  P99 latency: {latency['p99']:.4f}s")

print("\n" + "=" * 60)

## 8. Download Results

In [None]:
from google.colab import files

# Trigger a browser download for each stress-test result file collected by
# the comparison cell (`result_files`).
for result_path in result_files:
    files.download(result_path)