In [None]:
# Install dependencies
!pip install pyngrok fastapi uvicorn python-multipart sentence-transformers faiss-cpu ollama --quiet

# Install Ollama
print("ðŸ“¥ Installing Ollama...")
!curl -fsSL https://ollama.com/install.sh | sh

ðŸ“¥ Installing Ollama...
>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
# Start Ollama server in background
import subprocess
import time
print("ðŸš€ Starting Ollama server...")
ollama_process = subprocess.Popen(['ollama', 'serve'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
time.sleep(5)

# Pull TinyLlama model
print("ðŸ“¥ Downloading TinyLlama model via Ollama...")
!ollama pull tinyllama
print("âœ… Model downloaded!")


ðŸš€ Starting Ollama server...
ðŸ“¥ Downloading TinyLlama model via Ollama...
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l
âœ… Model downloaded!


In [None]:

import hmac
import os, threading, socket, logging
from contextlib import asynccontextmanager
from typing import List, Optional

import faiss
import ollama
import uvicorn
from fastapi import FastAPI, Form, HTTPException, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pyngrok import ngrok, conf
from sentence_transformers import SentenceTransformer

In [None]:

# ========================
# CONFIGURATION - SET YOUR TOKEN HERE (OR VIA ENVIRONMENT VARIABLES)
# ========================
# Secrets are read from the environment when present so real values do not have to be
# saved inside the notebook; the literals are only fallbacks for quick manual edits.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN", "your_first_ngrok_token_here")  # Get from https://dashboard.ngrok.com
API_KEY = os.environ.get("RAG_API_KEY", "123456")  # Must match Streamlit and Kaggle
CHUNK_SIZE = 400     # words per chunk
CHUNK_OVERLAP = 40   # words shared by two consecutive chunks

# chunk_text advances by CHUNK_SIZE - CHUNK_OVERLAP words per chunk, so that step
# has to stay positive or chunking cannot make progress.
if not 0 <= CHUNK_OVERLAP < CHUNK_SIZE:
    raise ValueError("CHUNK_OVERLAP must satisfy 0 <= CHUNK_OVERLAP < CHUNK_SIZE")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
# ========================
# MODELS
# ========================
class Models:
    """Class-level holder for the models shared by the RAG endpoints."""

    # Sentence embedding model; remains None until load() has been called.
    embedder = None

    @classmethod
    def load(cls):
        """Load the sentence embedder and confirm the Ollama server responds.

        Raises:
            Exception: whatever ``ollama.list()`` raised, re-raised after being logged.
        """
        logger.info("ðŸ”„ Loading Colab RAG models...")

        # Embedding model
        logger.info("Loading embeddings...")
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        cls.embedder = embedding_model
        logger.info("âœ… Embedder loaded")

        cls._ping_ollama()

        logger.info("âœ… All Colab RAG models ready!")

    @staticmethod
    def _ping_ollama():
        """Check the Ollama connection by listing models; log and re-raise on failure.

        Only connectivity is checked here: the listing result is not inspected,
        so the presence of the tinyllama model itself is not verified.
        """
        try:
            ollama.list()
            logger.info("âœ… Ollama connected with TinyLlama")
        except Exception as err:
            logger.error(f"Ollama connection failed: {err}")
            raise


In [None]:
# SECURITY
# ========================
security = HTTPBearer()

def verify(creds: HTTPAuthorizationCredentials = Depends(security)):
    """FastAPI dependency that authenticates a request by its bearer token.

    Args:
        creds: Bearer credentials extracted from the Authorization header.

    Returns:
        The presented token, once it has been accepted.

    Raises:
        HTTPException: 401 when the token does not equal API_KEY.
    """
    # Compare in constant time so response timing does not reveal how many leading
    # characters of a guessed key were correct. Both sides are encoded to bytes
    # because compare_digest rejects str values containing non-ASCII characters.
    supplied = creds.credentials.encode("utf-8")
    expected = API_KEY.encode("utf-8")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(401, "Unauthorized")
    return creds.credentials

In [None]:
# ========================
# RAG FUNCTIONS
# ========================
def chunk_text(text: str, chunk_size: Optional[int] = None, overlap: Optional[int] = None) -> List[str]:
    """Split text into overlapping, whitespace-delimited word chunks.

    Args:
        text: The document to split.
        chunk_size: Maximum number of words per chunk; defaults to CHUNK_SIZE.
        overlap: Number of words repeated at the start of the next chunk;
            defaults to CHUNK_OVERLAP.

    Returns:
        ``[text]`` unchanged when the text has fewer than ``chunk_size`` words.
        Otherwise a list of chunks, each re-joined with single spaces, where the
        last chunk always reaches the end of the text.

    Raises:
        ValueError: if ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)``.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    shared = CHUNK_OVERLAP if overlap is None else overlap
    if size <= 0 or not 0 <= shared < size:
        raise ValueError("chunk_size must be positive and 0 <= overlap < chunk_size")

    words = text.split()
    if len(words) < size:
        return [text]

    step = size - shared
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + size]))
        # Stop once a chunk reaches the end of the text: any later window would
        # consist only of words this chunk already contains. Because every kept
        # chunk therefore adds new words, no tail is ever silently dropped.
        if start + size >= len(words):
            break

    return chunks

def build_index(chunks: List[str]):
    """Embed the chunks and return a FAISS L2 index holding their vectors."""
    vectors = Models.embedder.encode(chunks, show_progress_bar=False)
    # The second axis of the encoder output is used as the index dimensionality.
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors.astype('float32'))
    return flat_index

def retrieve(query: str, index, chunks: List[str], k: int = 3) -> List[str]:
    """Retrieve the top-k chunks closest to the query.

    Args:
        query: The question to embed and search with.
        index: Search index exposing ``search(vectors, k)``, as returned by build_index.
        chunks: The chunk texts, in the order they were added to ``index``.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``min(k, len(chunks))`` chunk texts, in the order the index returned
        them; an empty list when ``k`` is not positive or there are no chunks.
    """
    top_k = min(k, len(chunks))
    if top_k <= 0:
        return []

    query_embedding = Models.embedder.encode([query], show_progress_bar=False).astype('float32')
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads missing results with -1; without the lower bound that label would
    # pass the range check and silently select chunks[-1].
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

def generate_answer(prompt: str) -> str:
    """Ask the tinyllama model served by Ollama to complete the prompt.

    Returns:
        The generated text with surrounding whitespace removed.

    Raises:
        HTTPException: 500 carrying the underlying error text when generation fails.
    """
    sampling_options = {
        'temperature': 0.3,
        'num_predict': 250,
        'stop': ['\n\n', 'Question:', 'Context:'],
    }
    try:
        reply = ollama.generate(model='tinyllama', prompt=prompt, options=sampling_options)
        return reply['response'].strip()
    except Exception as err:
        logger.error(f"Ollama generation error: {err}")
        raise HTTPException(500, f"Answer generation failed: {str(err)}")


In [None]:
# ========================
# FASTAPI APP
# ========================
async def startup():
    """Load the models the endpoints depend on."""
    Models.load()

@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Lifespan handler: run startup() before the application begins serving.

    Replaces the deprecated ``@app.on_event("startup")`` hook. Nothing follows
    the ``yield`` because there is no shutdown work to do.
    """
    await startup()
    yield

app = FastAPI(title="Colab RAG API with Ollama", version="1.0", lifespan=lifespan)

@app.get("/")
async def root():
    """Return a static description of this service."""
    banner = dict(
        status="online",
        service="Colab RAG Server (Ollama)",
        capabilities=["rag", "question_answering"],
        model="tinyllama via Ollama",
    )
    return banner

@app.get("/health")
async def health():
    """Report server health, including whether the embedder has been loaded."""
    # The "ollama" entry is a fixed label; no call to Ollama is made here.
    embedder_loaded = Models.embedder is not None
    model_status = {"embedder": embedder_loaded, "ollama": "tinyllama"}
    return {"status": "healthy", "server": "colab", "models": model_status}

@app.post("/rag")
def rag(
    text: str = Form(...),
    question: str = Form(...),
    token: str = Depends(verify)
):
    """Answer question using RAG pipeline with Ollama

    Declared as a plain ``def`` so FastAPI runs it in its worker threadpool:
    embedding and ``ollama.generate`` are blocking calls, and running them inside
    an ``async def`` handler would stall the event loop (including ``/health``)
    for the whole duration of a generation.

    Args:
        text: Source document to search for relevant passages.
        question: The question to answer from that document.
        token: Bearer token already validated by ``verify``.

    Returns:
        A dict with the answer, the retrieved source chunks, the number of
        chunks created, the echoed question and the model label.

    Raises:
        HTTPException: 400 when text or question is empty or whitespace-only;
            500 when any pipeline step fails.
    """
    try:
        # Whitespace-only input carries no content, so reject it like empty input.
        if not question.strip() or not text.strip():
            raise HTTPException(400, "Both text and question are required")

        logger.info(f"Processing RAG query: {question[:50]}...")

        # Step 1: Chunk the text
        chunks = chunk_text(text)
        logger.info(f"Created {len(chunks)} chunks")

        # Step 2: Build search index
        index = build_index(chunks)

        # Step 3: Retrieve relevant chunks
        sources = retrieve(question, index, chunks, k=3)
        logger.info(f"Retrieved {len(sources)} relevant chunks")

        # Step 4: Build context (each source is cut to its first 300 characters)
        context = "\n\n".join([f"[{i+1}] {chunk[:300]}" for i, chunk in enumerate(sources)])

        # Step 5: Generate answer with Ollama
        prompt = f"""Based on the following context, answer the question concisely and accurately.

Context:
{context}

Question: {question}

Answer:"""

        answer = generate_answer(prompt)
        logger.info("âœ… Answer generated with Ollama")

        return {
            "status": "success",
            "answer": answer,
            "sources": sources,
            "num_chunks": len(chunks),
            "question": question,
            "model": "tinyllama (Ollama)"
        }

    except HTTPException:
        # Already carries the intended status code; pass it through untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback as well as the message.
        logger.exception(f"RAG error: {e}")
        raise HTTPException(500, f"RAG processing failed: {str(e)}") from e


        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("startup")


In [None]:
# START SERVER WITH NGROK
# ========================
def get_port():
    """Return a port number the OS reports as free.

    Binds a throwaway socket to port 0 so the OS picks the port, reads the
    assigned number back, and closes the socket before returning.
    """
    probe = socket.socket()
    try:
        probe.bind(('', 0))
        _host, free_port = probe.getsockname()
        return free_port
    finally:
        probe.close()

# Start FastAPI server in a daemon thread so this cell keeps control
port = get_port()
server_thread = threading.Thread(
    target=uvicorn.run,
    args=(app,),
    kwargs={"host": "0.0.0.0", "port": port},
    daemon=True,
)
server_thread.start()

# Wait until the server accepts TCP connections before announcing it, and stop
# with an error if the server thread exits first (for example because loading
# the models failed during application startup).
while server_thread.is_alive():
    try:
        with socket.create_connection(("127.0.0.1", port), timeout=1):
            break
    except OSError:
        time.sleep(0.5)
else:
    raise RuntimeError("FastAPI server stopped during startup; check the log output above")

# Setup ngrok
conf.get_default().auth_token = NGROK_TOKEN
url = ngrok.connect(port).public_url

print("\n" + "="*70)
print("ðŸŽ‰ COLAB RAG SERVER IS READY (USING OLLAMA)!")
print("="*70)
print(f"ðŸ”— COLAB URL: {url}")
print(f"ðŸ”‘ API KEY: {API_KEY}")
print(f"ðŸ¤– MODEL: TinyLlama via Ollama")
print("="*70)
print("\nðŸ“‹ COPY THE URL ABOVE TO YOUR STREAMLIT APP!")
print("="*70 + "\n")

# Keep the cell running for as long as the server thread is alive; joining with a
# timeout lets the cell finish if the server dies instead of sleeping forever.
logger.info("âœ… Colab RAG server with Ollama is running!")
while server_thread.is_alive():
    server_thread.join(timeout=1)


ðŸŽ‰ COLAB RAG SERVER IS READY (USING OLLAMA)!
ðŸ”— COLAB URL: https://simply-nonpersistent-irena.ngrok-free.dev
ðŸ”‘ API KEY: 123456
ðŸ¤– MODEL: TinyLlama via Ollama

ðŸ“‹ COPY THE URL ABOVE TO YOUR STREAMLIT APP!



INFO:     Started server process [790]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:41585 (Press CTRL+C to quit)


INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
INFO:     156.203.167.177:0 - "GET /health HTTP/1.1" 200 OK
