In [4]:
# ----------------------------
# 0. Imports
# ----------------------------
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# ----------------------------
# 1. PDF LOADING + CHUNKING
# ----------------------------
def load_pdf(path):
    reader = PdfReader(path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

def chunk_text(text, chunk_size=300, overlap=50):
    """
    Splits text into smaller chunks of ~chunk_size tokens
    with some overlap for context continuity.
    """
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

# ----------------------------
# 2. EMBEDDINGS + FAISS INDEX
# ----------------------------
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
chat_history = []

MODEL_MAX_TOKENS = 512
MAX_NEW_TOKENS = 128
MAX_INPUT_TOKENS = MODEL_MAX_TOKENS - MAX_NEW_TOKENS

def build_faiss_index(chunks):
    vectors = embedder.encode(chunks, convert_to_numpy=True)
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    return index, vectors

def retrieve(query, index, chunks, top_k=3):
    query_vec = embedder.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_vec, top_k)
    return [chunks[i] for i in indices[0]]

# ----------------------------
# 3. TOKEN-SAFE CONTEXT BUILDER
# ----------------------------
def build_context_token_safe(retrieved_chunks, chat_history, tokenizer, history_limit=3):
    context_tokens = 0
    context_text = ""

    # 1) Add retrieved chunks safely
    for chunk in retrieved_chunks:
        chunk_ids = tokenizer.encode(chunk, add_special_tokens=False)
        if context_tokens + len(chunk_ids) > MAX_INPUT_TOKENS - 50:  # reserve 50 for instructions
            remaining = MAX_INPUT_TOKENS - 50 - context_tokens
            if remaining > 0:
                context_text += tokenizer.decode(chunk_ids[:remaining], skip_special_tokens=True)
            break
        context_text += chunk + "\n"
        context_tokens += len(chunk_ids)

    # 2) Add recent chat history safely
    history_text = ""
    for q, a in chat_history[-history_limit:]:
        qa_text = f"Q: {q}\nA: {a}\n"
        qa_ids = tokenizer.encode(qa_text, add_special_tokens=False)
        if context_tokens + len(qa_ids) > MAX_INPUT_TOKENS - 50:
            break
        history_text += qa_text
        context_tokens += len(qa_ids)

    return history_text, context_text

def truncate_prompt_to_budget(prompt, tokenizer, max_input_tokens):
    ids = tokenizer.encode(prompt, add_special_tokens=False)
    if len(ids) <= max_input_tokens:
        return prompt
    return tokenizer.decode(ids[:max_input_tokens], skip_special_tokens=True)

# ----------------------------
# 4. QA MODEL
# ----------------------------
qa_model = pipeline("text2text-generation", model="google/flan-t5-base", device=0)

def answer_question_token_safe(query, index, chunks, history_limit=2, top_k=3):
    retrieved_chunks = retrieve(query, index, chunks, top_k=top_k)
    history_text, context_text = build_context_token_safe(retrieved_chunks, chat_history, tokenizer, history_limit=history_limit)

    prompt = f"""
You are a helpful assistant.
Previous conversation:
{history_text}
Answer the question ONLY using the context below.
If the answer is not in the context, say "I don't know from the document."

Context:
{context_text}

Question: {query}
"""
    prompt = truncate_prompt_to_budget(prompt, tokenizer, MAX_INPUT_TOKENS)

    response = qa_model(prompt, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)[0]["generated_text"]

    chat_history.append((query, response))
    return response

# ----------------------------
# 5. USAGE EXAMPLE
# ----------------------------
pdf_text = load_pdf("sample.pdf")
chunks = chunk_text(pdf_text)
index, _ = build_faiss_index(chunks)

queries = [
    "What dataset was used?",
    "What problem do Sparse Feature Networks aim to solve?",
    "How does that relate to the dataset?"
]

for q in queries:
    print(answer_question_token_safe(q, index, chunks))


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


Legacy Survey website(Dey et al. 2019). Our SDSS sample comprises 250,207 galaxies in total. Our SFNet implementation is shown in Figure 1. We use a CNN architecture identical to the resnet18 (He et al. 2015), except that we insert a top- koperation be- fore the final linear layer. This additional top- klayer guarantees that each prediction is a linear combination ofksparse features. We use d= 512 latent features (af- ter the average pooling layer) and
I don't know from the document.
We surmise that regular CNN features are not in- terpretable. Meanwhile, our SFNet is both performant and interpretable. Finally, we check whether the SFNet can capture in- formation not found within the 512-dimensional Zoobot feature vector. We optimize a ridge regression model to predict SFNet feature values using the Zoobot features, finding R2values of 0 .253 and 0 .107 for the two primary SFNet features, Z 61andZ 256 , respectively. These low coefficients of determination suggest that our SFNet ex
