### Loading the LLM

In [1]:
import torch
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [2]:
# Step 1: Set the model name
model_name = "google/gemma-2b-it"

# Step 2: Load the tokenizer and configure padding for generation.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Only fall back to EOS when the tokenizer ships without a pad token, so a
# dedicated pad token (if the checkpoint defines one) is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models continue from the last position, so pad on the left
# when prompts are batched.
tokenizer.padding_side = "left"

# Step 3: Load the model weights.
# device_map=None disables automatic device placement; the pipeline output
# below reports that it runs on CPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",   # take the dtype from the checkpoint; omit if issues arise
    device_map=None,
)

# Step 4: Wrap model + tokenizer in a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    top_p=1.0,
    do_sample=False,      # disable sampling; forces greedy decoding
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


### Loading the embedding model

In [3]:
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer




In [4]:

# Sentence-embedding model used to embed queries for the vector store.
embedding_fn = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# On-disk Chroma client rooted at ./chroma_bge_768 (relative to the working directory).
chroma_client = chromadb.PersistentClient(path="./chroma_bge_768")

# LangChain wrapper around the "qnotes_docs" collection of that client.
vectorstore = Chroma(
    client=chroma_client,
    collection_name="qnotes_docs",
    embedding_function=embedding_fn,
)


  embedding_fn = HuggingFaceEmbeddings(
  vectorstore = Chroma(


In [5]:
# Expose the text-generation pipeline built earlier in the notebook (`pipe`)
# through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=pipe)

# Retrieval-augmented QA chain: the vector store supplies documents, `llm` answers.
doc_retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=doc_retriever,
)

  llm = HuggingFacePipeline(pipeline=pipe)


### Retriever using prompt template

In [6]:
# Make the project's ./src directory importable from this notebook.
# NOTE(review): `pathlib` is imported but not used in the cells visible here.
import sys, os, pathlib
# os.path.abspath resolves "src" against the current working directory, so this
# only points at the notebook's ./src when the kernel was started next to it.
sys.path.insert(0, os.path.abspath("src"))        # prepend ./src to the import search path

# NOTE(review): this import uses the package-qualified name `src.quantum_router`,
# which is resolved from the directory that *contains* `src`, not from the
# ./src entry added above -- confirm whether that sys.path entry is still
# needed (e.g. for imports made inside quantum_router itself).
from src.quantum_router import pick_quantum_template

In [7]:
def retriever(user_query, vectorstore, sample_no = 1, k=7,
              generator=None, template_picker=None):
    """Answer ``user_query`` with retrieved context and print the result.

    The function fetches the ``k`` most similar documents from
    ``vectorstore``, joins their ``page_content`` with blank lines, builds a
    prompt with the template chosen for the query, runs the text generator
    on it and extracts the answer from the generated text.

    Args:
        user_query: The question to answer.
        vectorstore: Object exposing ``similarity_search(query, k=...)`` that
            returns documents with a ``page_content`` attribute.
        sample_no: Number printed in the ``sample<N>`` header line.
        k: How many documents to retrieve (defaults to the previous
            hard-coded value of 7).
        generator: Callable taking the prompt and returning
            ``[{"generated_text": ...}]``. Defaults to the notebook-level
            ``pipe``.
        template_picker: Callable mapping the query to a template function
            ``template_fn(context, query) -> prompt``. Defaults to the
            imported ``pick_quantum_template``.

    Returns:
        dict with keys ``"question"``, ``"context"`` and ``"answer"``.
    """
    answer_marker = "Answer (structured and accurate):"

    # Fall back to the notebook-level objects when no override is supplied.
    if generator is None:
        generator = pipe
    if template_picker is None:
        template_picker = pick_quantum_template

    # Use the vectorstore to get context
    retrieved_docs = vectorstore.similarity_search(user_query, k=k)
    retrieved_context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    template_fn = template_picker(user_query)
    prompt = template_fn(retrieved_context, user_query)

    output = generator(prompt)[0]['generated_text']

    # The generated text may echo the prompt in front of the completion.
    # Drop that echo first so marker words inside the retrieved documents or
    # the question cannot be mistaken for the start of the answer.
    completion = output[len(prompt):] if output.startswith(prompt) else output
    answer = completion.split(answer_marker)[-1].strip()

    # Question and context are already known; re-parsing them out of the
    # generated text breaks as soon as "Context:" or "Question:" appears in a
    # document or in the model's answer.
    result = {
        "question": user_query.strip(),
        "context": retrieved_context.strip(),
        "answer": answer
    }

    print("sample"+str(sample_no))
    print("question: ", result["question"])
    print("answer: ", result["answer"])
    print()
    return result

In [8]:
# Evaluation questions put to the retrieval pipeline, in the order they are run.
qs = [
    "What are three reasons to study quantum computers?",
    "What is the purpose of the Hadamard gate in quantum computing?",
    "What is the standard form of an EPR-pair? Also include equation",
    "Who proposed the first efficient quantum algorithm for factoring, and in what year?",
    "What is the role of the quantum circuit model in computation?",
    "How can an EPR-pair simulate a public coin toss?",
    "What is the function of the Toffoli gate, and why is it important?",
    "What does the Quantum Fourier Transform do in phase estimation?",
    "What is the difference between the quantum Turing machine and the quantum circuit model?",
    "Why did Feynman propose the idea of quantum computers?",
]

In [18]:
# Run every question in `qs` through `retriever`; the zero-based position in
# the list is used as the printed sample number. The returned result dicts are
# not kept here -- only the text printed inside `retriever` is shown.
for i, q in enumerate(qs):
    retriever(q, vectorstore, sample_no = i)
    # Visual separator between consecutive samples in the cell output.
    print('-------------------------------------------------------------')

sample0
question:  What are three reasons to study quantum computers?
answer:  1. The process of miniaturization that has made current classical computers so powerful and cheap, has already reached micro-levels where quantum eﬀects occur.
2. Making use of quantum eﬀects allows one to speed up certain computations enormously (sometimes exponentially), and even enables some things that are impossible for classical computers.
3. The main goal of theoretical computer science is to “study the power and limitations of the strongest-possible computational devices that Nature allows us.”

-------------------------------------------------------------
sample1
question:  What is the purpose of the Hadamard gate in quantum computing?
answer:  The purpose of the Hadamard gate in quantum computing is to apply a quantum operation on each qubit in the register, resulting in a superposition of all n-bit strings. This allows quantum algorithms to explore a vast number of different possibilities simultan