In [13]:
!pip install PyPDF2 --quiet


In [14]:
# import requests, os

# docs = {
#     "bitcoin_whitepaper.pdf": "https://bitcoin.org/bitcoin.pdf",
#     "attention_is_all_you_need.pdf": "https://arxiv.org/pdf/1706.03762.pdf",
#     "fifa_2022_report.pdf": "https://digitalhub.fifa.com/m/4b702fcd50dbe84/original/FIFA-World-Cup-Qatar-2022-TM-Technical-Report.pdf",
#     "biodiversity_outlook.pdf": "https://www.cbd.int/gbo/gbo5/publication/gbo-5-en.pdf",
#     "unesco_heritage.pdf": "https://unesdoc.unesco.org/ark:/48223/pf0000385535",
#     "constitution_of_india.pdf": "https://www.india.gov.in/sites/upload_files/npi/files/coi_part_full.pdf",
#     "imf_world_economic_outlook.pdf": "https://www.imf.org/en/Publications/WEO/Issues/2024/04/16/world-economic-outlook-april-2024",
#     "time_machine.txt": "https://www.gutenberg.org/files/35/35-0.txt",
#     "annihilation_of_caste.pdf": "https://www.marxists.org/reference/archive/ambedkar/1936/annihilation-caste.pdf"
# }

# os.makedirs("corpus", exist_ok=True)
# for name, url in docs.items():
#     r = requests.get(url)
#     with open(os.path.join("corpus", name), "wb") as f:
#         f.write(r.content)
# print("✅ Documents downloaded to ./corpus/")


In [15]:
# Path is imported here because this cell runs before the Config cell that
# imports it; without the import a fresh-kernel "Run All" stops with NameError.
from pathlib import Path

CORPUS_DIR = Path("/home/harshal/Documents/Nityo_Challanges/corpus")   # e.g., where you saved bitcoin_whitepaper.pdf etc.


### Cell 2 — Config (CORPUS ONLY)

In [16]:
from pathlib import Path
from typing import List, Tuple, Iterable
import fnmatch

# ---------- Path to your corpus ----------
# Root folder that is scanned for input documents (absolute, machine-specific path).
CORPUS_DIR = Path("/home/harshal/Documents/Nityo_Challanges/corpus")   # e.g., where you saved bitcoin_whitepaper.pdf etc.
# Accepted file suffixes; they are compared against the lower-cased suffix of each
# candidate, so keep the entries lower-case.
EXTS: Tuple[str, ...] = (".pdf", ".txt")
# True: walk sub-folders as well (rglob). False: only direct children (glob).
RECURSIVE = True

def iter_files(root: Path, exts: Tuple[str, ...], recursive: bool) -> Iterable[Path]:
    """Lazily yield every regular file under ``root`` whose suffix is in ``exts``.

    The comparison uses the lower-cased suffix of each entry, so ``exts`` should
    contain lower-case values such as ``".pdf"``. With ``recursive`` the whole
    tree is walked (``rglob``); otherwise only direct children (``glob``).

    Raises:
        FileNotFoundError: if ``root`` does not exist. This is a generator, so
            the check runs when iteration starts, not when the function is called.
    """
    if not root.exists():
        raise FileNotFoundError(f"CORPUS_DIR not found: {root}")
    walker = root.rglob if recursive else root.glob
    yield from (
        entry
        for entry in walker("*")
        if entry.is_file() and entry.suffix.lower() in exts
    )

# Toggle: take all or specific files
# USE_ALL = True  -> every file under CORPUS_DIR with a suffix in EXTS is selected.
# USE_ALL = False -> only the entries of NAMES are selected.
USE_ALL = True
# Entries are either file names relative to CORPUS_DIR or fnmatch-style patterns
# (anything containing "*", "?" or "[" is treated as a pattern).
NAMES: List[str] = [
    # "bitcoin_whitepaper.pdf",
    # "attention_is_all_you_need.pdf",
]

def resolve_from_corpus(names: List[str]) -> List[Path]:
    """Resolve file names or glob patterns to existing corpus files.

    Each entry of ``names`` is handled in one of two ways:

    * entries containing ``*``, ``?`` or ``[`` are treated as patterns and
      matched with ``fnmatch`` against the *file name* of everything found under
      ``CORPUS_DIR`` (recursively when ``RECURSIVE`` is true);
    * any other entry is taken as a path relative to ``CORPUS_DIR``.

    Only regular files whose lower-cased suffix is in ``EXTS`` are kept. Entries
    that match nothing are skipped without an error.

    Returns:
        The matching paths in first-seen order, without duplicates.
    """
    wildcard_chars = ("*", "?", "[")
    matches: List[Path] = []
    for item in names:
        if any(ch in item for ch in wildcard_chars):
            walker = CORPUS_DIR.rglob("*") if RECURSIVE else CORPUS_DIR.glob("*")
            matches.extend(
                cand
                for cand in walker
                if cand.is_file() and fnmatch.fnmatch(cand.name, item) and cand.suffix.lower() in EXTS
            )
        else:
            cand = CORPUS_DIR / item
            # is_file() rather than exists(): a directory that happens to be named
            # like "foo.pdf" is not a loadable document (the pattern branch
            # already applies the same rule).
            if cand.is_file() and cand.suffix.lower() in EXTS:
                matches.append(cand)
    # dict keys keep insertion order, so this removes duplicates without reordering.
    return list(dict.fromkeys(matches))

# Pick the input files: the whole corpus, or only what NAMES resolves to.
if USE_ALL:
    inputs = list(iter_files(CORPUS_DIR, EXTS, RECURSIVE))
else:
    inputs = resolve_from_corpus(NAMES)

# Numbered listing in sorted order; `inputs` itself keeps its discovery order.
print(f"Selected {len(inputs)} file(s):")
for position, path in enumerate(sorted(inputs), start=1):
    print(f"{position:>2}. {path.name}")


Selected 9 file(s):
 1. annihilation_of_caste.pdf
 2. attention_is_all_you_need.pdf
 3. biodiversity_outlook.pdf
 4. bitcoin_whitepaper.pdf
 5. constitution_of_india.pdf
 6. fifa_2022_report.pdf
 7. imf_world_economic_outlook.pdf
 8. time_machine.txt
 9. unesco_heritage.pdf


In [17]:
# # Debug
# # Inspect the first 2 items to ensure docs == List[{"text": str, "meta": {...}}]
# print(type(docs), len(docs))
# for i, d in enumerate(docs[:2]):
#     print(i, type(d), list(d.keys()) if isinstance(d, dict) else "not-a-dict")


### Cell 3 — Loaders (PDF/TXT → text)

In [18]:
from typing import Tuple
from PyPDF2 import PdfReader

def load_pdf(fp: Path) -> str:
    """Extract the text of every page of a PDF, joined with newlines.

    Pages for which ``extract_text()`` returns nothing contribute an empty
    string. Loading is best-effort: if the file cannot be opened or parsed the
    function returns ``""`` instead of raising, but the failure is reported via
    ``warnings.warn`` so that dropped documents do not go unnoticed.

    Args:
        fp: Path of the PDF file.

    Returns:
        The extracted text, or ``""`` when extraction failed.
    """
    import warnings  # local import keeps the cell's import list untouched

    try:
        reader = PdfReader(str(fp))
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as exc:
        # Still best-effort, but say which file failed and why.
        warnings.warn(f"Could not read PDF {fp}: {type(exc).__name__}: {exc}", stacklevel=2)
        return ""

def load_txt(fp: Path) -> str:
    """Read a text file, choosing the encoding instead of discarding bytes.

    Decoding is strict, so a wrong guess fails and the next encoding is tried
    (with ``errors="ignore"`` the first attempt could never fail and the
    fallbacks were unreachable):

    1. UTF-16 when the file starts with a UTF-16 byte-order mark;
    2. UTF-8 (a leading UTF-8 BOM is dropped);
    3. Latin-1, which accepts any byte sequence.

    UTF-16 without a BOM is not attempted because arbitrary bytes frequently
    decode "successfully" as UTF-16 and produce garbage.

    Args:
        fp: Path of the text file.

    Returns:
        The decoded text, or ``""`` if the file cannot be read.
    """
    try:
        with fp.open("rb") as fh:
            head = fh.read(2)
    except OSError:
        return ""

    if head in (b"\xff\xfe", b"\xfe\xff"):
        encodings = ("utf-16", "utf-8-sig", "latin-1")
    else:
        encodings = ("utf-8-sig", "latin-1")

    for enc in encodings:
        try:
            return fp.read_text(encoding=enc)
        except (UnicodeError, OSError):
            continue
    return ""

def load_any(fp: Path) -> Tuple[str, dict]:
    """Load one corpus file and return ``(text, metadata)``.

    Files with a ``.pdf`` suffix (case-insensitive) go through ``load_pdf``;
    everything else is read with ``load_txt``. The metadata dict records the
    path as a string under ``"source"``.
    """
    reader = load_pdf if fp.suffix.lower() == ".pdf" else load_txt
    return reader(fp), {"source": str(fp)}

def build_docs(file_paths):
    """Load every path and keep the documents that produced some text.

    Args:
        file_paths: Iterable of file paths, each passed to ``load_any``.

    Returns:
        A list of ``{"text": str, "meta": dict}`` records, one per file whose
        text is not empty or whitespace-only, in input order.

    Files that yield no text are still skipped, but each one is reported with
    ``warnings.warn`` so a shrinking corpus is visible in the cell output.
    """
    import warnings  # local import keeps the cell's import list untouched

    out = []
    for fp in file_paths:
        text, meta = load_any(fp)
        if text.strip():
            out.append({"text": text, "meta": meta})
        else:
            warnings.warn(f"Skipped {fp}: no text could be extracted", stacklevel=2)
    return out

docs = build_docs(inputs)

print("Loaded", len(docs), "docs")
# Preview at most the first three documents by the last "/"-separated path part.
for position, doc in enumerate(docs[:3]):
    file_name = doc["meta"]["source"].split("/")[-1]
    print(position, "source:", file_name)


Loaded 4 docs
0 source: bitcoin_whitepaper.pdf
1 source: time_machine.txt
2 source: attention_is_all_you_need.pdf


### Cell 4 — Chunking

In [19]:
from typing import List, Dict

def chunk_text(text: str, max_tokens: int = 800, overlap: int = 150) -> List[str]:
    """Split text into overlapping windows of whitespace-separated words.

    "Tokens" here are simply the items produced by ``str.split()``, not model
    tokens. Consecutive windows start ``max_tokens - overlap`` words apart
    (at least 1), so neighbouring chunks share ``overlap`` words.

    Args:
        text: Text to split. Non-string values are converted with ``str``;
            falsy values become the empty string.
        max_tokens: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words shared by neighbouring chunks. Negative values
            are treated as 0 so that no words are skipped between chunks.

    Returns:
        The chunks in document order; ``[]`` for text without any words.

    Raises:
        ValueError: if ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be >= 1, got {max_tokens}")
    if not isinstance(text, str):
        text = str(text or "")
    words = text.split()
    step = max(max_tokens - max(overlap, 0), 1)
    chunks: List[str] = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + max_tokens]))
        # Once a window reaches the last word, any further window would lie
        # entirely inside it and only add a redundant tail chunk.
        if start + max_tokens >= len(words):
            break
        start += step
    return chunks

# One record per non-blank chunk. chunk_id is the chunk's position within its
# own document; "source" falls back to "unknown" when the metadata lacks it.
corpus_chunks: List[Dict] = [
    {
        "text": piece,
        "source": doc.get("meta", {}).get("source", "unknown"),
        "chunk_id": position,
    }
    for doc in docs
    for position, piece in enumerate(chunk_text(doc.get("text", ""), 800, 150))
    if piece.strip()
]

print("Total chunks:", len(corpus_chunks))


Total chunks: 221


### Cell 5 — Vector Store (TF-IDF Retrieval)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Chunk texts in corpus_chunks order, so row i of X describes corpus_chunks[i].
texts = [chunk["text"] for chunk in corpus_chunks]
# Unigrams plus bigrams, so phrases such as "multi head" or "self attention"
# become features of their own.
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_features=120_000)
X = vectorizer.fit_transform(texts)

def retrieve(query: str, k: int = 5):
    """Return up to ``k`` corpus chunks ranked by TF-IDF cosine similarity.

    The query is projected with the fitted module-level ``vectorizer`` and
    compared against every row of ``X``.

    Args:
        query: Free-text question.
        k: Maximum number of hits; values <= 0 yield an empty list.

    Returns:
        A list of dicts with keys ``rank`` (1-based), ``score``, ``text``,
        ``source`` and ``chunk_id``, best match first. Chunks whose similarity
        is 0 share no indexed term with the query and are left out, so the
        list may be shorter than ``k`` or empty.
    """
    if k <= 0:
        # A negative k would otherwise slice from the end of the ranking.
        return []
    qv = vectorizer.transform([query])
    sims = cosine_similarity(qv, X)[0]
    # Stable sort keeps corpus order among equal scores, so results are reproducible.
    ranked = np.argsort(-sims, kind="stable")[:k]
    results = []
    for ix in ranked:
        score = float(sims[ix])
        if score <= 0.0:
            # Ranking is descending: nothing after this point matches either.
            break
        c = corpus_chunks[ix]
        results.append({
            "rank": len(results) + 1,
            "score": score,
            "text": c["text"],
            "source": c["source"],
            "chunk_id": c["chunk_id"]
        })
    return results


### Cell 6 — Answer Synthesizer (+ LLM hook placeholder)

In [1]:
import re
from typing import List, Dict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Sentence boundary: whitespace that follows ".", "!" or "?" and is itself
# followed by an upper-case ASCII letter or an opening parenthesis.
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-Z(])')


def sent_tokenize(text: str) -> List[str]:
    """Split ``text`` into sentences with a simple regex heuristic.

    The text is stripped, cut at every ``_SENT_SPLIT`` boundary, and each piece
    is stripped again. Pieces of 25 characters or fewer are discarded as
    fragments.
    """
    kept: List[str] = []
    for piece in _SENT_SPLIT.split(text.strip()):
        piece = piece.strip()
        if len(piece) > 25:
            kept.append(piece)
    return kept

def answer_from_context(query: str, context: str, max_sents: int = 5) -> str:
    """Build an extractive answer from the sentences of ``context`` closest to ``query``.

    The context is split with ``sent_tokenize``; sentences and query are
    vectorised together with a fresh TF-IDF model (English stop words, uni- and
    bigrams) and every sentence is scored by its dot product with the query
    vector. The ``max_sents`` best sentences are returned in their original
    order, joined by single spaces.

    Args:
        query: The user question.
        context: Text to extract sentences from.
        max_sents: Maximum number of sentences in the answer.

    Returns:
        The answer text, or ``""`` when there are no usable sentences, when
        ``max_sents`` is not positive, or when nothing but stop words is left
        to index.
    """
    sents = sent_tokenize(context)
    if not sents or max_sents <= 0:
        # A negative max_sents would otherwise slice from the end of the ranking.
        return ""
    # Rank sentences by TF-IDF similarity to the query.
    vec = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=40_000)
    try:
        mat = vec.fit_transform(sents + [query])
    except ValueError:
        # Raised by scikit-learn for an empty vocabulary (e.g. stop words only).
        return ""
    qv = mat[-1]      # last row is the query
    S = mat[:-1]      # all other rows are the sentences
    sims = linear_kernel(S, qv)[:,0]
    top_idx = np.argsort(-sims)[:max_sents]
    # Emit the chosen sentences in document order rather than score order.
    answer = " ".join(sents[i] for i in sorted(top_idx))
    # small cleanup
    return re.sub(r'\s+', ' ', answer).strip()

def synthesize_answer(query: str, retrieved: List[Dict], max_context_chars: int = 6000) -> Dict:
    """Assemble a bounded context from the retrieved hits and extract an answer.

    Hits are appended in the given order, each preceded by a blank-line
    separator, until the next one would push the context past
    ``max_context_chars``. If the very first hit is already too long it is
    truncated to the budget instead of being dropped, so an oversized top
    chunk no longer produces an empty context and an empty answer.

    Args:
        query: The user question.
        retrieved: Hits with at least ``text``, ``source``, ``chunk_id`` and
            ``score`` keys, best first.
        max_context_chars: Budget for the context, separators included.

    Returns:
        ``{"answer": str, "context": str, "used": list}`` where ``context`` is
        stripped and ``used`` lists source, chunk_id and score of every hit
        that went into it.
    """
    separator = "\n\n"
    context = ""
    used = []
    for r in retrieved:
        snippet = r["text"]
        if len(context) + len(snippet) + len(separator) > max_context_chars:
            if used:
                break
            # Top hit alone exceeds the budget: keep as much of it as fits.
            snippet = snippet[:max(max_context_chars - len(separator), 0)]
            if not snippet.strip():
                break
        context += separator + snippet
        used.append({"source": r["source"], "chunk_id": r["chunk_id"], "score": r["score"]})

    if not context.strip():
        answer = ""
    else:
        answer = answer_from_context(query, context, max_sents=5)

    return {"answer": answer, "context": context.strip(), "used": used}

def llm_answer(query: str, context: str) -> str | None:
    # Keep as None for now (pure extractive mode).
    return None


### Cell 7 — Completeness Check & Enrichment Suggestions

In [30]:
# debugg_4("completeness", best=retrieved[0]["score"] if retrieved else None)

def completeness_check(query: str, retrieved: List[Dict]) -> dict:
    """Derive a confidence value and a list of gaps from the retrieval scores.

    Confidence is the score of the first hit rounded to three decimals (the
    first hit is taken to be the best one). A top score below 0.12 is reported
    as low overlap; no hits at all is reported with confidence 0.0.
    """
    if not retrieved:
        return {"confidence": 0.0, "missing_info": ["No relevant passages found."]}
    top_score = retrieved[0]["score"]
    # 0.12 is the notebook's heuristic cut-off for a weak match.
    gaps = (
        ["Low overlap with the question; add topic-specific documents."]
        if top_score < 0.12
        else []
    )
    return {"confidence": round(float(top_score), 3), "missing_info": gaps}

def enrichment_suggestions(query: str, retrieved: List[Dict]) -> list:
    """Suggest documents to add, based on keywords found in the query.

    The lower-cased query is checked for each topic's keywords by substring
    match; every matching topic contributes one suggestion, in the order of the
    table below. When no topic matches, a single generic suggestion is
    returned. ``retrieved`` is accepted but not used.
    """
    lowered = query.lower()
    topic_rules = (
        (("bitcoin", "satoshi", "block"),
         "Ensure the Bitcoin whitepaper and core protocol primers are included."),
        (("attention", "transformer", "multi-head"),
         "Include 'Attention Is All You Need' and a tutorial on multi-head attention."),
        (("biodiversity", "ecosystem"),
         "Include UN CBD/IPBES biodiversity reports and regional assessments."),
        (("fifa", "football"),
         "Include FIFA technical reports and match analyses."),
        (("constitution", "preamble"),
         "Include the Constitution text and annotated commentaries."),
    )
    hints = [
        advice
        for keywords, advice in topic_rules
        if any(word in lowered for word in keywords)
    ]
    return hints or ["Add more primary sources (official PDFs/manuals) near your topic."]
# debugg_4("enrichment", suggestions=len(sugg))

### Cell 8 — Orchestrator ask() → Structured JSON

In [31]:
import json

def ask(question: str, k: int = 5) -> dict:
    """Run the full pipeline for one question and report a structured result.

    Steps: retrieve ``k`` chunks, build the extractive answer, score
    completeness, collect enrichment suggestions, then give ``llm_answer`` a
    chance to produce the final text. A falsy LLM result falls back to the
    extractive answer. The result is printed as indented JSON and returned.

    Returns:
        Dict with keys ``answer``, ``confidence``, ``missing_info``,
        ``enrichment_suggestions`` and ``sources`` (one entry per retrieved
        chunk with its source, chunk_id and score).
    """
    hits = retrieve(question, k=k)
    extractive = synthesize_answer(question, hits)
    audit = completeness_check(question, hits)
    suggestions = enrichment_suggestions(question, hits)
    generated = llm_answer(question, extractive["context"])

    sources = []
    for hit in hits:
        sources.append({"source": hit["source"], "chunk_id": hit["chunk_id"], "score": hit["score"]})

    output = {
        "answer": generated or extractive["answer"],
        "confidence": audit["confidence"],
        "missing_info": audit["missing_info"],
        "enrichment_suggestions": suggestions,
        "sources": sources,
    }
    print(json.dumps(output, indent=2))
    return output


In [35]:
# Smoke-test the pipeline; ask() prints each result, the return value is discarded.
demo_questions = (
    "What is the core idea of the Bitcoin whitepaper?",
    "Explain multi-head attention from 'Attention Is All You Need'.",
    "What are the major biodiversity threats discussed?",
)
for demo_question in demo_questions:
    _ = ask(demo_question)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "answer": "<|system|>\nYou are an intelligent assistant answering only from the given context. If the context lacks details, clearly say 'Information insufficient.'\n<|user|>\nQuestion:\nWhat is the core idea of the Bitcoin whitepaper?\n\nContext:\nBitcoin: A Peer-to-Peer Electronic Cash System Satoshi Nakamoto satoshin@gmx.com www.bitcoin.org Abstract. A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{
  "answer": "<|system|>\nYou are an intelligent assistant answering only from the given context. If the context lacks details, clearly say 'Information insufficient.'\n<|user|>\nQuestion:\nExplain multi-head attention from 'Attention Is All You Need'.\n\nContext:\nThe output is computed as a weighted sum 3 Scaled Dot-Product Attention Multi-Head Attention Figure 2: (left) Scaled Dot-Product Attention. (right) Multi-Head Attention consists of several attention layers running in parallel. of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key. 3.2.1 Scaled Dot-Product Attention We call our particular attention \"Scaled Dot-Product Attention\" (Figure 2). The input consists of queries and keys of dimension dk, and values of dimension dv. We compute the dot products of the query with all keys, divide each by\u221adk, and apply a softmax function to obtain the weights on the values. In practice, we compute the

KeyboardInterrupt: 

In [None]:
# --- AI-Powered Generation Cell ---
# Integrates directly with your existing 'ask()' orchestrator.
# Requirements:
#   pip install transformers accelerate --quiet

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Master switch for generation: False makes llm_answer() return None, so ask()
# falls back to the extractive TF-IDF answer.
USE_LLM = True

if USE_LLM:
    model_name = "Qwen/Qwen2.5-3B-Instruct"
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Half precision when CUDA is available, full precision otherwise.
    weights_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=weights_dtype,
        device_map="auto",
    )

def llm_answer(query: str, context: str) -> str | None:
    """
    Uses an LLM (e.g., Qwen2.5-3B-Instruct) to synthesize an answer from retrieved context.
    Falls back gracefully if USE_LLM = False.
    """
    if not USE_LLM or not context.strip():
        return None

    system_prompt = (
        "You are an intelligent assistant answering only from the given context. "
        "If the context lacks details, clearly say 'Information insufficient.'"
    )

    user_prompt = f"Question:\n{query}\n\nContext:\n{context}\n\nAnswer in 4-5 sentences, grounded only in the context."

    full_prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{user_prompt}"

    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=250,
            temperature=0.2,
            do_sample=False
        )

    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Clean up output
    answer = text.split("Answer:")[-1].strip() if "Answer:" in text else text.strip()
    return answer
