In [1]:
pip install transformers accelerate bitsandbytes langchain chromadb sentence-transformers langchain_community ragas



In [2]:
from huggingface_hub import login
login("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")

In [3]:
import bs4
import torch
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

#### INDEXING ####

# Load Documents
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed using BGE-small
bge_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
vectorstore = Chroma.from_documents(documents=splits, embedding=bge_model)
retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# Prompt template from LangChain hub
prompt = hub.pull("rlm/rag-prompt")

# Load Hugging Face quantized model (4-bit)
model_id = "mistralai/Mistral-7B-Instruct-v0.1"  # Use a quant-compatible model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Create text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    do_sample=True,
)

# Wrap in LangChain LLM interface
llm = HuggingFacePipeline(pipeline=pipe)

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# RAG chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


  bge_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [4]:
from datasets import Dataset

questions = ["What is Locality-Sensitive Hashing?",
             "What is Facebook AI Similarity Search?",
             "What is Hierarchical Navigable Small World?",
            ]
ground_truths = [["Locality-Sensitive Hashing: It introduces a hashing function such that similar input items are mapped to the same buckets with high probability, where the number of buckets is much smaller than the number of inputs."],
                ["Facebook AI Similarity Search: It operates on the assumption that in high dimensional space, distances between nodes follow a Gaussian distribution and thus there should exist clustering of data points."],
                ["Hierarchical Navigable Small World: It is inspired by the idea of small world networks where most nodes can be reached by any other nodes within a small number of steps; e.g. “six degrees of separation” feature of social networks. HNSW builds hierarchical layers of these small-world graphs, where the bottom layers contain the actual data points. The layers in the middle create shortcuts to speed up search"]]
answers = []
contexts = []

# Inference
for query in questions:
  answers.append(rag_chain.invoke(query))
  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])

# To dict
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths
}

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas import evaluate
from ragas.metrics import AnswerRelevancy, Faithfulness
import torch

# 1. Load dataset
dataset = Dataset.from_dict(data)

# 2. Load your HuggingFace model and tokenizer (SLM)
model_id = "microsoft/phi-2"  # Or use any other small model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # or use float32 based on the available resources
)

# 3. Create HuggingFace pipeline (No `device` argument due to accelerate)
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.3
)

# 4. Wrap in LangChain compatible LLM
langchain_llm = HuggingFacePipeline(pipeline=text_gen_pipeline)
langchain_llm_wrapped = LangchainLLMWrapper(langchain_llm)

# 5. Load and wrap embeddings
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
langchain_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
langchain_embeddings_wrapped = LangchainEmbeddingsWrapper(langchain_embeddings)

# 6. Initialize custom metrics with wrapped components
faithfulness = Faithfulness(llm=langchain_llm_wrapped)
answer_relevancy = AnswerRelevancy(llm=langchain_llm_wrapped, embeddings=langchain_embeddings_wrapped)

# 7. Run Evaluation
results = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy],
    llm=langchain_llm_wrapped,
    embeddings=langchain_embeddings_wrapped
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Evaluating:   0%|          | 0/6 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Token indices sequence leng

In [7]:
results.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,response,faithfulness,answer_relevancy
0,What is Locality-Sensitive Hashing?,[LSH (Locality-Sensitive Hashing): It introduc...,Human: You are an assistant for question-answe...,,
1,What is Facebook AI Similarity Search?,[FAISS (Facebook AI Similarity Search): It ope...,Human: You are an assistant for question-answe...,,
2,What is Hierarchical Navigable Small World?,[HNSW (Hierarchical Navigable Small World): It...,Human: You are an assistant for question-answe...,,
