In [1]:
# Cell 1: Setup
import os
import faiss
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from IPython.display import display, Markdown
import ipywidgets as widgets
import json
from datetime import datetime

# === Configuration ===
# Every artifact path is resolved against the current working directory.
BASE_DIR = os.getcwd()
INDEX_PATH = os.path.join(BASE_DIR, "faiss_index.bin")
METADATA_PATH = os.path.join(BASE_DIR, "chunk_metadata.pkl")
CHUNK_DIR = os.path.join(BASE_DIR, "arxiv_chunks")
LOG_PATH = os.path.join(BASE_DIR, "retrieval_log.json")

# === Load Model, Index, Metadata ===
model = SentenceTransformer("all-MiniLM-L6-v2")
index = faiss.read_index(INDEX_PATH)

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load a metadata file that was produced by a trusted pipeline.
with open(METADATA_PATH, "rb") as f:
    chunk_ids = pickle.load(f)

# search_query uses FAISS row positions directly as indices into chunk_ids,
# so both artifacts must describe the same number of vectors. Fail fast here
# rather than returning mismatched chunks (or an IndexError) at query time.
if index.ntotal != len(chunk_ids):
    raise ValueError(
        f"Index/metadata mismatch: FAISS index holds {index.ntotal} vectors "
        f"but {METADATA_PATH} lists {len(chunk_ids)} chunk ids."
    )

print(f"FAISS index loaded with {len(chunk_ids)} chunks.")

FAISS index loaded with 1177 chunks.


In [2]:
# Cell 2: Search and Logging Functions

def search_query(query, top_k=3):
    """Look up the chunks most similar to ``query``.

    The query is embedded with the module-level ``model`` and searched in the
    module-level FAISS ``index``. Each returned row position is resolved to a
    file name through ``chunk_ids`` and read from ``CHUNK_DIR``.

    Args:
        query: Text to search for.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A list of ``(chunk_id, chunk_text)`` tuples in the order FAISS
        returned them, with surrounding whitespace stripped from the text.
        The list holds fewer than ``top_k`` entries when the index could not
        supply that many neighbours.

    Raises:
        OSError: If a chunk file referenced by the index cannot be opened.
    """
    query_embedding = model.encode([query]).astype("float32")
    _, indices = index.search(query_embedding, top_k)

    results = []
    for position in indices[0]:
        # FAISS pads the result row with -1 when it finds fewer than top_k
        # neighbours. Without this guard chunk_ids[-1] would silently return
        # the *last* chunk as a bogus hit.
        if position < 0:
            continue
        chunk_id = chunk_ids[position]
        chunk_file = os.path.join(CHUNK_DIR, chunk_id)
        with open(chunk_file, "r", encoding="utf-8") as f:
            chunk_text = f.read().strip()
        results.append((chunk_id, chunk_text))
    return results

def save_results_to_json(query, results):
    """Append one query and its retrieved chunks to the JSON log at ``LOG_PATH``.

    The log file holds a single JSON array; each call adds one object with the
    keys ``timestamp`` (local time, ISO 8601), ``query`` and ``results``.

    Args:
        query: The query text that was searched.
        results: Iterable of ``(chunk_id, text)`` pairs, as produced by
            ``search_query``.

    Raises:
        json.JSONDecodeError: If the existing log has content that is not
            valid JSON. The file is left untouched in that case.
        OSError: If the log cannot be read or written.
    """
    entry = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "results": [
            {
                "chunk_id": chunk_id,
                "text": text
            } for chunk_id, text in results
        ]
    }

    # Load existing log or start a new one.
    log = []
    if os.path.exists(LOG_PATH):
        with open(LOG_PATH, "r", encoding="utf-8") as f:
            raw = f.read()
        # A blank file (freshly created, or left behind by an interrupted
        # write) counts as an empty log instead of a JSON parse error.
        if raw.strip():
            log = json.loads(raw)

    log.append(entry)

    # Write to a sibling temp file and swap it in, so an interruption while
    # writing cannot leave a truncated log that breaks every later save.
    tmp_path = LOG_PATH + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(log, f, indent=2)
    os.replace(tmp_path, LOG_PATH)

In [10]:
# Cell 3: Interactive UI with Submit Button

# Free-text input, pre-populated with a sample query.
query_box = widgets.Text(
    description="Query:",
    value="transformer models for machine translation",
    placeholder="Type your query here",
    style={"description_width": "initial"},
    layout=widgets.Layout(width="70%"),
)

# Button that triggers the search.
submit_button = widgets.Button(
    description="Submit",
    layout=widgets.Layout(width="15%"),
    button_style="success",
)

# Container that messages and search results are rendered into.
output_area = widgets.Output()

def on_button_click(b):
    """Run a search for the text in ``query_box`` and render the hits.

    Clears ``output_area``, then either prompts for a query (when the box is
    blank), or searches, logs the query and results via
    ``save_results_to_json``, and displays each hit's id and up to the first
    1000 characters of its text.

    Args:
        b: The button instance passed by ``Button.on_click``; unused.
    """
    output_area.clear_output()
    query = query_box.value.strip()

    # Do all the work inside the Output context: an exception raised by the
    # search or by logging is then handed to the widget and shown in the
    # notebook, instead of disappearing because it happened in a callback.
    with output_area:
        if not query:
            display(Markdown("**Please enter a query.**"))
            return

        results = search_query(query, top_k=3)
        save_results_to_json(query, results)

        # Tell the user explicitly rather than leaving a blank output area.
        if not results:
            display(Markdown("**No results found.**"))
            return

        for i, (chunk_id, text) in enumerate(results, 1):
            display(Markdown(f"### Result {i}: `{chunk_id}`"))
            display(Markdown(f"```\n{text[:1000]}\n```"))

# Register the click handler before the widgets are shown.
submit_button.on_click(on_button_click)

# Layout: input row (query box + button) on top, results underneath.
ui = widgets.VBox(children=[
    widgets.HBox(children=[query_box, submit_button]),
    output_area,
])

display(ui)


VBox(children=(HBox(children=(Text(value='transformer models for machine translation', description='Query:', l…