# RAG API (Kaggle) — FastAPI + ngrok + Hugging Face local model

**What this does:**  
- Ingest a user-supplied PDF → split into chunks → embed → build FAISS index  
- Serve two endpoints:  
  - `POST /ingest` — upload/replace a PDF index  
  - `POST /ask` — ask a question; does Retrieval-Augmented Generation with your HF model  
- Expose the API publicly via ngrok (copy the URL into your Flutter app).

## 1) Setup & installs

In [None]:
# Install the notebook's dependencies in one pip invocation.
# If running on Kaggle, these installs are okay (internet must be on).
# You can re-run if you see any transient errors.
# Every package is pinned to an exact version with "==", except
# python-multipart, which only has a lower bound (>=0.0.9).
!pip -q install fastapi==0.111.0 uvicorn==0.30.1 "python-multipart>=0.0.9"                pypdf==4.3.1 faiss-cpu==1.8.0.post1                sentence-transformers==3.0.1                transformers==4.43.3 accelerate==0.33.0 bitsandbytes==0.43.3                langchain==0.2.6                pyngrok==7.2.3

## 2) Environment variables

In [None]:
import os

# ======== REQUIRED: provide secrets from OUTSIDE this notebook ========
# Credentials must never be written into this file: anything saved or shared
# with the notebook is effectively public. Set HF_TOKEN and NGROK_AUTHTOKEN in
# the process environment (for example from your platform's secret store)
# before running this cell. Any token that was ever hard-coded here should be
# treated as leaked and revoked/rotated.
#
#   HF_TOKEN        - Hugging Face token, needed only if the model is gated/private
#   NGROK_AUTHTOKEN - ngrok authtoken used when opening the public tunnel
#
# No default is installed for either variable, so later cells see None when
# they are missing; this loop only warns so the omission is visible early.
for _name, _purpose in (
    ("HF_TOKEN", "needed only for gated/private Hugging Face models"),
    ("NGROK_AUTHTOKEN", "needed to open the ngrok tunnel"),
):
    if not os.environ.get(_name):
        print(f"WARNING: {_name} is not set ({_purpose}).")

# Your Hugging Face model id (local model), e.g. "HatemAhmed44/Egiptura_AI"
os.environ.setdefault("MODEL_ID", "mistralai/Mistral-Nemo-Instruct-2407")

# Server port (match Flutter if you hardcode)
os.environ.setdefault("PORT", "8000")

# Embedding model (sentence-transformers)
os.environ.setdefault("EMBEDDINGS_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Storage paths
os.environ.setdefault("INDEX_DIR", "/kaggle/working/rag_index")
os.environ.setdefault("UPLOADS_DIR", "/kaggle/working/uploads")
os.environ.setdefault("CACHE_DIR", "/kaggle/working/hf_cache")

## 3) Imports & helpers

In [None]:
import os, shutil, json, time, threading, uuid
from typing import Optional, List

from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pypdf import PdfReader

# Embeddings / FAISS
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# HF text-generation
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# ngrok
from pyngrok import ngrok

# Ensure dirs
# Each lookup falls back to the same path the environment cell installs, so
# this cell also works when run on its own: os.getenv() without a default
# returns None, and os.makedirs(None) raises TypeError.
INDEX_DIR = os.getenv("INDEX_DIR", "/kaggle/working/rag_index")
UPLOADS_DIR = os.getenv("UPLOADS_DIR", "/kaggle/working/uploads")
CACHE_DIR = os.getenv("CACHE_DIR", "/kaggle/working/hf_cache")
for d in (INDEX_DIR, UPLOADS_DIR, CACHE_DIR):
    os.makedirs(d, exist_ok=True)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

def chunk_text(text: str, chunk_size: int = 800, overlap: int = 120) -> List[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Simple fixed-size chunking to avoid heavy dependencies. The text is split
    on whitespace, and consecutive chunks start ``chunk_size - overlap`` words
    apart. Chunking stops as soon as a chunk reaches the end of the text, so
    no trailing chunk is emitted that is wholly contained in the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each a single-space-joined string.
        An empty or whitespace-only *text* yields an empty list.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range. Such values
            would otherwise stall the window or silently drop text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks: List[str] = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already covers the tail; a further window would only
            # repeat words that are in it.
            break
    return chunks

## 4) Load embeddings + build/load FAISS index

In [None]:
# Embedding model id: read from EMBEDDINGS_MODEL, with the MiniLM model as fallback.
EMBED_MODEL_ID = os.environ.get("EMBEDDINGS_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
# Module-level encoder shared by index building and query embedding.
_embedder = SentenceTransformer(model_name_or_path=EMBED_MODEL_ID)

def _index_paths():
    """Return the ``(FAISS index file, metadata JSON file)`` paths inside INDEX_DIR."""
    index_file = os.path.join(INDEX_DIR, "faiss.index")
    meta_file = os.path.join(INDEX_DIR, "meta.json")
    return index_file, meta_file

def has_index() -> bool:
    """Report whether both the FAISS index file and its metadata file exist on disk."""
    return all(os.path.exists(path) for path in _index_paths())

def reset_index_dir():
    """Delete INDEX_DIR if it exists, then recreate it as an empty directory."""
    index_dir_present = os.path.exists(INDEX_DIR)
    if index_dir_present:
        # Best-effort removal: errors raised while deleting are ignored.
        shutil.rmtree(INDEX_DIR, ignore_errors=True)
    os.makedirs(INDEX_DIR, exist_ok=True)

def build_index_from_chunks(chunks: List[str]):
    """Embed *chunks* and replace the on-disk FAISS index with the result.

    The embeddings and the in-memory index are built first; the existing index
    directory is only wiped once they exist, so a failure while embedding
    leaves the previous index usable.

    Args:
        chunks: Chunk texts to index. Their order defines the ids stored in
            the index, and the same list is saved as metadata for lookup.

    Raises:
        ValueError: If *chunks* is empty (there is nothing to index and the
            embedding dimension cannot be determined).
    """
    if not chunks:
        raise ValueError("Cannot build an index from an empty list of chunks.")

    # Embed
    embs = _embedder.encode(chunks, batch_size=64, convert_to_numpy=True, show_progress_bar=True)
    # FAISS operates on C-contiguous float32 matrices; make that explicit
    # instead of relying on the encoder's output layout.
    embs = np.ascontiguousarray(embs, dtype=np.float32)
    # Normalize for cosine: inner product of unit vectors equals cosine similarity.
    faiss.normalize_L2(embs)
    index = faiss.IndexFlatIP(embs.shape[1])
    index.add(embs)

    # Only now discard the previous index and persist the new one.
    reset_index_dir()
    idx_path, meta_path = _index_paths()
    faiss.write_index(index, idx_path)
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump({"chunks": chunks}, f, ensure_ascii=False)

def load_index():
    """Read the persisted FAISS index and its chunk texts from disk.

    Returns:
        An ``(index, chunks)`` tuple, where ``chunks`` is the list stored
        under the "chunks" key of the metadata JSON file.
    """
    faiss_file, meta_file = _index_paths()
    index = faiss.read_index(faiss_file)
    with open(meta_file, "r", encoding="utf-8") as fh:
        chunks = json.load(fh)["chunks"]
    return index, chunks

def search_chunks(query: str, k: int = 5):
    """Return up to *k* stored chunks most similar to *query*.

    The query is embedded with the shared encoder, L2-normalized, and searched
    against the persisted FAISS index, which is reloaded from disk on each call.

    Args:
        query: Free-text query to embed.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk texts in the order FAISS returned them. The list is
        empty when *k* is not positive, no index exists, or the index is empty.
    """
    if k <= 0 or not has_index():
        return []
    index, chunks = load_index()

    # Never request more neighbours than there are stored vectors.
    k = min(k, index.ntotal)
    if k == 0:
        return []

    q_emb = _embedder.encode([query], convert_to_numpy=True)
    q_emb = np.ascontiguousarray(q_emb, dtype=np.float32)
    faiss.normalize_L2(q_emb)
    _scores, ids = index.search(q_emb, k)

    # FAISS fills missing result slots with -1. In Python, chunks[-1] would
    # silently return the LAST chunk, so negative ids must be rejected too.
    return [chunks[i] for i in ids[0].tolist() if 0 <= i < len(chunks)]

## 5) Load local HF model for generation

In [None]:
# An unset or empty HF_TOKEN becomes None, so no credential is sent at all
# (an empty string would otherwise be forwarded as a token).
HF_TOKEN = os.getenv("HF_TOKEN") or None
MODEL_ID = os.getenv("MODEL_ID", "mistralai/Mistral-Nemo-Instruct-2407")

# Use a standard causal LM interface.
# `token=` is the current keyword for the Hub credential; `use_auth_token=`
# is deprecated in transformers.
tok = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN,
    # Half precision on GPU, full precision on CPU.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    # Automatic device placement is requested only when a GPU is available.
    device_map="auto" if device == "cuda" else None,
)

def generate_answer(prompt: str, max_new_tokens: int = 256, temperature: float = 0.2, top_p: float = 0.95):
    """Generate a continuation of *prompt* with the loaded causal LM.

    Args:
        prompt: Full prompt text passed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value that is zero, negative or
            None switches to greedy decoding, because sampling requires a
            strictly positive temperature.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The newly generated text only (prompt excluded), stripped of
        surrounding whitespace.
    """
    # Tokenize through __call__ so the attention mask is available as well.
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    prompt_len = input_ids.shape[-1]

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tok.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"],
            **gen_kwargs,
        )

    # Return only the newly generated continuation. Slice by token count:
    # decoding the prompt and cutting by string length is fragile because a
    # decode/encode round trip need not reproduce the prompt text exactly.
    new_tokens = out[0][prompt_len:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()

## 6) FastAPI app + endpoints `/ingest` and `/ask`

In [None]:
from fastapi import FastAPI
from fastapi.responses import JSONResponse

app = FastAPI(title="Egiptura RAG API", version="1.0")

# CORS: allow all origins for easy Flutter dev; lock down later if needed.
# allow_credentials stays False: combining credentialed requests with a
# wildcard origin would let any website call this API with the caller's
# cookies/auth. If credentials are ever needed, list the origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

class AskRequest(BaseModel):
    # JSON request body accepted by POST /ask.

    # The question to answer; required.
    question: str
    # Optional language hint supplied by the client; defaults to None.
    lang: Optional[str] = None  # "es", "ar", "en" ... optional

@app.get("/health")
def health():
    """Liveness probe: always reports ok, plus whether an index is on disk."""
    index_ready = has_index()
    return {"ok": True, "has_index": index_ready}

def _extract_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, joined by newlines."""
    reader = PdfReader(pdf_path)
    pages = []
    for page in reader.pages:
        try:
            pages.append(page.extract_text() or "")
        except Exception:
            # Best-effort: one unreadable page must not fail the whole document.
            pages.append("")
    return "\n".join(pages)


@app.post("/ingest")
async def ingest_pdf(file: UploadFile = File(...)):
    """Store an uploaded PDF, extract its text and rebuild the search index.

    Args:
        file: The uploaded PDF (multipart form field "file").

    Returns:
        ``{"ok": True, "chunks": <number of indexed chunks>}``.

    Raises:
        HTTPException: 400 when the upload cannot be parsed as a PDF, contains
            no extractable text, or yields no chunks.
    """
    # Save uploaded PDF under a fresh random name.
    file_id = str(uuid.uuid4())
    save_path = os.path.join(UPLOADS_DIR, f"{file_id}.pdf")
    with open(save_path, "wb") as f:
        # Copy in 1 MiB pieces so a large upload is never held in memory whole.
        while True:
            piece = await file.read(1024 * 1024)
            if not piece:
                break
            f.write(piece)

    # Extract text. A file that is not a valid PDF is the client's error, so
    # report it as 400 rather than letting the parser's exception become a 500.
    try:
        full_text = _extract_pdf_text(save_path)
    except Exception as exc:
        try:
            os.remove(save_path)  # do not keep unusable uploads on disk
        except OSError:
            pass  # cleanup is best-effort; the 400 below is what matters
        raise HTTPException(
            status_code=400,
            detail="Uploaded file could not be parsed as a PDF.",
        ) from exc

    if not full_text.strip():
        raise HTTPException(status_code=400, detail="PDF text is empty or could not be extracted.")

    # Chunk + index
    chunks = chunk_text(full_text, chunk_size=180, overlap=40)
    if not chunks:
        raise HTTPException(status_code=400, detail="No chunks could be created from the PDF.")

    build_index_from_chunks(chunks)
    return {"ok": True, "chunks": len(chunks)}

@app.post("/ask")
async def ask(req: AskRequest):
    """Answer a question using chunks retrieved from the current index.

    Args:
        req: Request body. ``question`` is required; ``lang``, when given,
            selects the answer language, otherwise the model is asked to
            reply in the language of the question.

    Returns:
        ``{"answer": <generated text>, "used_chunks": <number of context chunks>}``.

    Raises:
        HTTPException: 400 when the question is empty or whitespace-only.
    """
    q = (req.question or "").strip()
    if not q:
        raise HTTPException(status_code=400, detail="Empty question")

    # Retrieve
    ctx_chunks = search_chunks(q, k=6)

    # Honour the optional language hint instead of silently ignoring it.
    lang = (req.lang or "").strip()
    if lang:
        language_rule = f"Answer in the language with code '{lang}', concise and well-structured."
    else:
        language_rule = "Answer in the same language as the question, concise and well-structured."

    # Build prompt
    context_block = "\n\n".join([f"- {c}" for c in ctx_chunks])
    system_rules = (
        "You are Egiptura's travel assistant. Answer ONLY from the provided context. "
        "If the answer is not in the context, say you don't have enough information."
    )
    prompt = f"""{system_rules}

Context:
{context_block}

Question: {q}
{language_rule}
"""

    answer = generate_answer(prompt, max_new_tokens=256, temperature=0.2, top_p=0.95)
    return {"answer": answer, "used_chunks": len(ctx_chunks)}

## 7) Launch server + ngrok tunnel

In [None]:
import os
import uvicorn
from threading import Thread

PORT = int(os.getenv("PORT", "8000"))
NGROK_AUTHTOKEN = os.getenv("NGROK_AUTHTOKEN")

# Start uvicorn in a thread
def run_server():
    """Serve the FastAPI app on all interfaces at PORT; blocks while the server runs."""
    uvicorn.run(app, host="0.0.0.0", port=PORT, log_level="info")

# Daemon thread: the server must not keep the process alive on its own.
server_thread = Thread(target=run_server, daemon=True)
server_thread.start()

# Start ngrok
# Register the authtoken whenever one is configured. Never compare it against
# a literal token value: that both embeds a secret in the notebook and skips
# registration for exactly that token.
if NGROK_AUTHTOKEN:
    ngrok.set_auth_token(NGROK_AUTHTOKEN)
public_url = ngrok.connect(addr=PORT, proto="http").public_url
public_url

## 8) Quick test cells

In [None]:
# 8.1 Health check (after server is up)
import requests, time
time.sleep(2)  # give the server thread a moment to start listening
try:
    r = requests.get(f"{public_url}/health", timeout=10)
    r.raise_for_status()
    # Print explicitly: an expression inside a try block is not echoed by the
    # notebook, so a bare r.json() would show nothing on success.
    print(r.json())
except (requests.RequestException, ValueError) as e:
    # RequestException: network/HTTP failures; ValueError: body was not valid JSON.
    print("Health check failed:", e)

In [None]:
# 8.2 Example: Ingest a PDF programmatically (if you have a local path on Kaggle)
# Replace "/kaggle/input/some.pdf" below with the path to your PDF in the Kaggle environment.
# import requests
# files = {"file": open("/kaggle/input/some.pdf", "rb")}
# r = requests.post(f"{public_url}/ingest", files=files)
# r.json()

In [None]:
# 8.3 Ask a question
# Replace with your own question once the index is built.
# data = {"question": "¿Qué incluye el programa de crucero por el Nilo?"}
# r = requests.post(f"{public_url}/ask", json=data)
# r.json()