# Build FAISS + BM25 Index on GPU (Colab/Kaggle)

Run this notebook on **Google Colab** (T4 GPU, free tier is enough) or **Kaggle** (P100).

## Steps:
1. Upload `corpus.zip` to Colab or mount from Drive
2. Run all cells
3. Download `index.zip` at the end
4. Upload it as a **GitHub Release asset** on your repo
5. Update the `INDEX_RELEASE_URL` in `Dockerfile` (see README)

Expected time: ~5-10 min on T4 GPU for the full corpus.

In [None]:
# ── 1. Install dependencies ──────────────────────────────────────────────────
# sentence-transformers: embedding model; faiss-gpu: vector index;
# rank-bm25: lexical index; numpy: array handling.
# NOTE(review): the only FAISS index built in this notebook is
# faiss.IndexFlatIP and it is never moved to a GPU, so faiss-cpu would
# presumably suffice — confirm before swapping, and check that a faiss-gpu
# wheel exists for the runtime's Python version.
!pip install -q sentence-transformers faiss-gpu rank-bm25 numpy

In [None]:
# ── 2. Check GPU ─────────────────────────────────────────────────────────────
# Report whether PyTorch can see a CUDA device. Nothing is enforced here: the
# embedding cell picks 'cpu' on its own when CUDA is unavailable.
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device index 0 is the first visible GPU.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# ── 3. Upload and extract corpus ─────────────────────────────────────────────
import os, zipfile, json, re, pickle
from pathlib import Path

# Option A: upload corpus.zip manually via Colab file browser
# Option B: mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# CORPUS_ZIP = '/content/drive/MyDrive/corpus.zip'

CORPUS_ZIP  = '/content/corpus.zip'   # change if needed
CORPUS_DIR  = Path('/content/corpus')  # where the extracted corpus is expected
INDEX_DIR   = Path('/content/index')   # output directory for all index files
# parents=True so the cell also works when the parent directory does not
# exist yet (it may be missing outside Colab).
INDEX_DIR.mkdir(parents=True, exist_ok=True)

# Extraction is skipped when CORPUS_DIR already exists, so re-running the cell
# is cheap. The archive is expected to contain a top-level folder whose name
# matches CORPUS_DIR.name; it is unpacked next to it.
if not CORPUS_DIR.exists():
    print(f'Extracting {CORPUS_ZIP}...')
    with zipfile.ZipFile(CORPUS_ZIP, 'r') as z:
        # Derived from CORPUS_DIR so that changing the path above keeps the
        # extraction target consistent.
        z.extractall(CORPUS_DIR.parent)
    print('Done.')

# List files found
files = list(CORPUS_DIR.rglob('*.jsonl')) + list(CORPUS_DIR.rglob('*.json'))
print(f'Found {len(files)} corpus files')
if not files:
    # Surface the problem here instead of failing later with an empty index.
    print(f'WARNING: no .jsonl/.json files under {CORPUS_DIR} — '
          f'check the folder layout inside {CORPUS_ZIP}')

In [None]:
# ── 4. Load protocols ────────────────────────────────────────────────────────
def load_protocols(corpus_path: Path) -> list[dict]:
    """Collect protocol records from every JSON Lines and JSON file in a tree.

    ``*.jsonl`` files are read first (one JSON value per non-blank line), then
    ``*.json`` files. Within each group files are visited in sorted path
    order. A ``.json`` file holding a list contributes all of its items, one
    holding an object contributes that object, and any other top-level value
    is ignored.

    Args:
        corpus_path: Directory searched recursively.

    Returns:
        The records in the order they were read.

    Raises:
        json.JSONDecodeError: If a line or file is not valid JSON.
    """
    records: list[dict] = []

    for jsonl_file in sorted(corpus_path.rglob('*.jsonl')):
        with open(jsonl_file, encoding='utf-8') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            # Blank lines are skipped rather than treated as errors.
            records.extend(json.loads(entry) for entry in stripped_lines if entry)

    for json_file in sorted(corpus_path.rglob('*.json')):
        with open(json_file, encoding='utf-8') as handle:
            payload = json.load(handle)
        if isinstance(payload, dict):
            records.append(payload)
        elif isinstance(payload, list):
            records.extend(payload)

    print(f'Loaded {len(records)} protocols')
    return records

# Load every protocol record found under CORPUS_DIR; the chunking cell
# iterates over `protocols`.
protocols = load_protocols(CORPUS_DIR)

In [None]:
# ── 5. Chunking ──────────────────────────────────────────────────────────────
CHUNK_SIZE    = 600   # maximum number of words per chunk
CHUNK_OVERLAP = 100   # words repeated between consecutive chunks of a section

# Matches the boundary just before a section header at the start of a line:
# a Roman numeral (the pattern covers forms up to VIII) or a decimal number,
# followed by a dot, whitespace and an uppercase Cyrillic or Latin letter.
# The header itself sits in a lookahead, so it stays in the section text.
SECTION_RE = re.compile(
    r'(?:^|\n)(?=(?:I{1,3}V?|VI{0,3}|[1-9]\d*)\.\s+[А-ЯA-Z])',
    re.MULTILINE,
)


def chunk_by_sections(text: str) -> list[str]:
    """Split protocol text into chunks that respect section boundaries.

    The text is first cut at section headers. A section of at most
    ``CHUNK_SIZE`` words is kept verbatim (internal whitespace preserved,
    outer whitespace stripped). A longer section is cut into windows of
    ``CHUNK_SIZE`` words that advance by ``CHUNK_SIZE - CHUNK_OVERLAP`` words;
    those windows are re-joined with single spaces.

    Args:
        text: Full protocol text.

    Returns:
        Chunks in document order; empty when ``text`` has no non-blank content.
    """
    stride = CHUNK_SIZE - CHUNK_OVERLAP
    pieces: list[str] = []

    for raw_section in SECTION_RE.split(text):
        section = raw_section.strip()
        if not section:
            continue

        tokens = section.split()
        total = len(tokens)
        if total <= CHUNK_SIZE:
            pieces.append(section)
            continue

        # Sliding window over the words of an oversized section.
        begin = 0
        while begin < total:
            stop = begin + CHUNK_SIZE
            pieces.append(' '.join(tokens[begin:stop]))
            if stop >= total:
                # The window reached the end of the section.
                break
            begin += stride

    return pieces

def extract_icd(text: str) -> list[str]:
    """Return the distinct ICD-style codes mentioned in ``text``.

    A code is an uppercase Latin letter followed by two digits, optionally
    followed by a dot and one or two more digits (e.g. ``J18`` or ``J18.9``),
    delimited by word boundaries.

    Args:
        text: Text to scan.

    Returns:
        Unique codes in sorted order. Sorting keeps the result reproducible
        across runs; iterating a bare set of strings depends on hash
        randomisation.
    """
    return sorted(set(re.findall(r'\b[A-Z]\d{2}(?:\.\d{1,2})?\b', text)))

# Build chunk list: one metadata record per chunk, in protocol order. The
# position of a record in `all_chunks` is what later ties it to its embedding
# row and BM25 document, so the order must not change after this cell.
all_chunks = []
for proto in protocols:
    pid   = proto.get('protocol_id', '')
    src   = proto.get('source_file', '')
    title = proto.get('title', '')
    # `or` fallbacks also cover keys that are present but null.
    icds  = proto.get('icd_codes') or []
    text  = proto.get('text') or ''
    if isinstance(icds, str):
        # A single code given as a bare string must not be split into letters.
        icds = [icds]
    # Union of declared codes and codes found in the text. Sorted so the saved
    # metadata is reproducible (set order depends on hash randomisation);
    # key=str keeps sorting safe if a declared code is not a string.
    all_icds = sorted(set(icds) | set(extract_icd(text)), key=str)
    for idx, chunk in enumerate(chunk_by_sections(text)):
        all_chunks.append({
            'protocol_id': pid,
            'source_file': src,
            'title': title,
            'icd_codes': all_icds,   # same list object shared by the protocol's chunks
            'chunk': chunk,
            'chunk_idx': idx,        # position of the chunk within its protocol
        })

print(f'Total chunks: {len(all_chunks)}')
if all_chunks:
    print(f'Sample chunk (first 200 chars): {all_chunks[0]["chunk"][:200]}')
else:
    # Avoid an IndexError and point at the likely cause instead.
    print('WARNING: no chunks were produced — check that the corpus loaded correctly')

In [None]:
# ── 6. Embed with multilingual-e5-small on GPU ───────────────────────────────
from sentence_transformers import SentenceTransformer
import numpy as np

# Model name passed to SentenceTransformer; the sanity-check cell encodes its
# query with the same `model` object.
EMBED_MODEL = 'intfloat/multilingual-e5-small'
# Fall back to CPU so the notebook still runs without a GPU runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Embedding on: {device}')

model = SentenceTransformer(EMBED_MODEL, device=device)

# E5 requires 'passage: ' prefix for corpus chunks
# (queries use 'query: ' instead — see the sanity-check cell).
# NOTE(review): the E5 model card presumably limits input to 512 tokens, so
# 600-word chunks may be truncated before embedding — confirm against the
# model card and consider a smaller CHUNK_SIZE.
texts = ['passage: ' + c['chunk'] for c in all_chunks]

print(f'Embedding {len(texts)} chunks...')
# `texts` follows the order of `all_chunks`, so embedding row i belongs to
# all_chunks[i].
embeddings = model.encode(
    texts,
    batch_size=128,          # GPU can handle larger batches
    normalize_embeddings=True,   # requested so inner product can serve as cosine similarity
    show_progress_bar=True,
)
# The FAISS cell adds this array to the index, so make the dtype explicit.
embeddings = np.array(embeddings, dtype='float32')
print(f'Embeddings shape: {embeddings.shape}')

In [None]:
# ── 7. Build and save FAISS index ────────────────────────────────────────────
import faiss

# Embedding width, taken from the second axis of the embeddings array.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)   # inner product = cosine on normalized vecs
# Vectors are added in `all_chunks` order, so FAISS id i refers to
# all_chunks[i].
index.add(embeddings)
print(f'FAISS index: {index.ntotal} vectors (dim={dim})')

# write_index is given a string path, hence the str() around the Path.
faiss.write_index(index, str(INDEX_DIR / 'faiss.index'))
print('Saved faiss.index')

# Save metadata (parallel to FAISS vectors)
# NOTE(review): pickle files must only be loaded from a trusted source —
# confirm the consumer downloads this archive from a release you control.
with open(INDEX_DIR / 'metadata.pkl', 'wb') as f:
    pickle.dump(all_chunks, f)
print('Saved metadata.pkl')

In [None]:
# ── 8. Build and save BM25 index ─────────────────────────────────────────────
from rank_bm25 import BM25Okapi

def tokenize(text: str) -> list[str]:
    """Lower-case ``text`` and split it into BM25 tokens.

    The text is split on runs of whitespace; each piece has the characters
    ``.,;:!?()[]`` stripped from both ends, and pieces left empty are dropped.
    Punctuation inside a token is kept. The same function is applied to the
    corpus chunks and to the test query in this notebook.

    Args:
        text: Raw chunk or query text.

    Returns:
        Tokens in their original order.
    """
    edge_punctuation = '.,;:!?()[]'
    tokens = []
    for piece in re.split(r'\s+', text.lower()):
        cleaned = piece.strip(edge_punctuation)
        if cleaned:
            tokens.append(cleaned)
    return tokens

print('Building BM25 index...')
# One token list per chunk, in `all_chunks` order, so BM25 document i is
# all_chunks[i].
tokenized_corpus = [tokenize(c['chunk']) for c in all_chunks]
bm25 = BM25Okapi(tokenized_corpus)

# The pickle bundles the BM25 object with the chunk records it was built from.
# NOTE(review): `all_chunks` is also written to metadata.pkl by the FAISS
# cell, so the chunk payload is stored twice in index.zip — confirm the
# loader needs both copies.
with open(INDEX_DIR / 'bm25.pkl', 'wb') as f:
    pickle.dump({'bm25': bm25, 'chunks': all_chunks}, f)
print('Saved bm25.pkl')

In [None]:
# ── 9. Quick sanity check ────────────────────────────────────────────────────
# Fixed query used to eyeball both indexes.
test_query = 'кашель с мокротой температура боль в грудной клетке'

# FAISS
# E5 uses the 'query: ' prefix on the search side (corpus chunks were embedded
# with 'passage: ').
q_vec = model.encode(['query: ' + test_query], normalize_embeddings=True)
scores, indices = index.search(q_vec.astype('float32'), 3)
print('\n=== FAISS top-3 ===')
for score, idx in zip(scores[0], indices[0]):
    # FAISS pads the result with id -1 when the index holds fewer vectors than
    # requested; without this guard all_chunks[-1] would silently print the
    # last chunk as if it were a hit.
    if idx < 0:
        continue
    c = all_chunks[idx]
    print(f'  {score:.3f} | {c["source_file"]} | ICD: {c["icd_codes"][:3]}')
    print(f'         {c["chunk"][:120]}...')

# BM25
# The query goes through the same tokenizer as the indexed chunks.
bm25_scores = bm25.get_scores(tokenize(test_query))
# argsort is ascending, so reverse it before taking the three best documents.
top3 = np.argsort(bm25_scores)[::-1][:3]
print('\n=== BM25 top-3 ===')
for idx in top3:
    c = all_chunks[idx]
    print(f'  {bm25_scores[idx]:.3f} | {c["source_file"]} | ICD: {c["icd_codes"][:3]}')

In [None]:
# ── 10. Package and download ─────────────────────────────────────────────────
import shutil

# Zip the contents of INDEX_DIR into an archive next to it
# (<parent>/index -> <parent>/index.zip). Both paths are derived from
# INDEX_DIR so they cannot drift from the directory the index was written to.
# make_archive returns the path of the archive it created.
output_zip = shutil.make_archive(str(INDEX_DIR), 'zip', root_dir=str(INDEX_DIR))

# Check size
size_mb = os.path.getsize(output_zip) / 1024 / 1024
print(f'index.zip size: {size_mb:.1f} MB')

# Download in Colab
# Only the import is guarded: an ImportError raised while downloading would
# otherwise be misreported as "Not in Colab".
try:
    from google.colab import files
except ImportError:
    print(f'Not in Colab — find the file at: {output_zip}')
    # NOTE(review): Kaggle's Output tab presumably lists only its working
    # directory — confirm the archive is visible there or copy it over.
    print('In Kaggle: go to Output tab to download index.zip')
else:
    files.download(output_zip)
    print('Download started!')

## After downloading `index.zip`

1. Go to your GitHub repo → **Releases** → **Draft a new release**
2. Tag: `v1.0-index` (or any tag)
3. Attach `index.zip` as a release asset
4. Publish the release
5. Copy the direct download URL of `index.zip`
6. Set `INDEX_RELEASE_URL` in your `Dockerfile` to that URL (the format is shown below)

URL format:
```
https://github.com/<your-org>/<your-repo>/releases/download/v1.0-index/index.zip
```