#### Install langchain-chroma for vector storage

In [None]:
!pip install -qU langchain-chroma

In [None]:
!pip install beautifulsoup4

In [None]:
!pip install selenium

In [None]:
!pip install unstructured

#### Create a RAG System To Test

In [None]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document
from langchain_ollama import ChatOllama

In [None]:
## Chat model served by the local Ollama instance; it generates the RAG answers
llm = ChatOllama(
    base_url="http://localhost:11434",
    model="deepseek-r1:8b",
    num_predict=500,
    temperature=0.5,
)

In [None]:
## Load the source article through the Selenium URL loader
loader = SeleniumURLLoader(urls=["https://www.descope.com/learn/post/mcp"])
data = loader.load()
## Cut the loaded documents into 1000-character chunks that overlap by 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " ", ""],
)
splits = text_splitter.split_documents(data)
## Embed the chunks with Ollama and index them in a Chroma vector store
embedding = OllamaEmbeddings(model="llama3.2:latest")
vector_db = Chroma.from_documents(splits, embedding)
## Retriever configured with k=3 results per query
retriever = vector_db.as_retriever(search_kwargs={"k": 3})
## Prompt with {context} and {question} placeholders, filled in by the RAG chain
template = """
    Answer the question based only on following context:
    {context}
    Give a summary not the full details
    Question: {question}
    Instructions:
        - Answer based ONLY on the information provided in the context above
        - If the context doesn't contain enough information to answer the question, say so
        - Be accurate and precise in your response
        - Give a summary, not full details
"""
prompt = ChatPromptTemplate.from_template(template)
def format_docs(docs: List[Document]) -> str:
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    return "\n\n".join(chunk.page_content for chunk in docs)
def retrieve_and_format(question: str) -> str:
    """Retrieve the chunks relevant to ``question`` and merge them into one string.

    Supplies the ``context`` input of the RAG chain.

    Args:
        question: User question passed to the retriever.

    Returns:
        The text of the retrieved documents, joined by ``format_docs``.
    """
    docs = retriever.invoke(question)
    return format_docs(docs)
def retrieve_documents(question: str) -> List[str]:
    """Return the ``page_content`` of each document the retriever finds for ``question``."""
    return [hit.page_content for hit in retriever.invoke(question)]
## RAG pipeline: gather context + question -> fill the prompt -> call the LLM -> parse to a plain string
rag_chain = (
    {"context": retrieve_and_format, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
from deepeval.models import OllamaModel
## Ollama-backed model handed to the deepeval metrics as their `model`
local_llm = OllamaModel(
    base_url="http://localhost:11434",
    model="deepseek-r1:8b",
    generation_kwargs={"num_predict": 500},
    temperature=0.5,
)


#### Prepare test data

In [None]:
## Hand-written evaluation pairs: `input` is the question, `expected_output` the reference answer.
## Each dict is later wrapped in a deepeval Golden.
test_data = [
    {
        "input": "What is MCP",
        "expected_output": 
            """
            MCP, or Model Context Protocol, is a protocol used by an MCP client like Claude Desktop.
            It connects AI applications (like Claude Desktop) to external tools and resources.
            It builds upon the standard function calling method used by large language models (LLMs) to call APIs, aiming to simplify development and provide a more consistent way for AI apps to access external context.
            The process involves a handshake, capability discovery, and registration.
            """
    },
    {
        "input": "What is relationship between function calling and Model context protocol",
        "expected_output": 
            """
            The Model Context Protocol (MCP) builds upon function calling, which is the standard method for LLMs to call APIs.
            However, MCP enhances this by standardizing how tools (functions) are specified, discovered, and executed across different AI systems.
            This standardization allows for plug-and-play integration without custom code, addressing limitations like excessive maintenance and fragmentation found with traditional function calling approaches tied to specific vendors or ecosystems.
            """
    },
    {
        "input": "What is the Architecture of MCP",
        "expected_output": 
            """
            The architecture of MCP involves an **MCP client** (like Claude Desktop) interacting with **MCP servers**. The process includes:
                1.  **Protocol Handshake:** The client connects to the configured MCP servers.
                2.  **Capability Discovery:** The client queries the server for its available tools, resources, and prompts.
                3.  **Registration:** The client registers the discovered capabilities, making them available for the AI to use.
            This architecture connects AI applications (clients) to external tools and resources (servers) via a standardized method (the protocol), building upon the concept of function calling to simplify development.
            """
    }
]

#### Creating Golden dataset

In [None]:
from deepeval.dataset import Golden, EvaluationDataset

## Wrap every question/reference pair in a deepeval Golden.
## A comprehension keeps the loop variable local, so the notebook-global `data`
## (assigned in other cells) is not overwritten by this cell.
goldens = [
    Golden(input=item["input"], expected_output=item["expected_output"])
    for item in test_data
]

data_set = EvaluationDataset(goldens=goldens)
data_set

#### Pushing Dataset to Confident AI

In [None]:
## Upload the dataset to Confident AI under the alias "test_qa_dataset".
## NOTE(review): presumably requires a configured Confident AI login / API key - confirm.
data_set.push(alias="test_qa_dataset", finalized=True)

#### Pulling from Confident AI Dataset

In [None]:
## Start from an empty dataset and pull the one stored under alias "test_qa_dataset"
cloud_dataset = EvaluationDataset()
cloud_dataset.pull(alias="test_qa_dataset")
print(cloud_dataset)

#### Prepare `actual_output` and `retrieval_context`

In [None]:
from langchain_classic.chains import RetrievalQA
## Build a RetrievalQA chain that combines the LLM with the vector-store retriever (RAG)
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
## Try the chain on a single question; the trailing expression displays the result in the notebook
actual_output = qa_chain.invoke("What is MCP")
actual_output

In [None]:
def get_actual_output_with_context(question: str) -> tuple[str, list[str]]:
    """Answer ``question`` with the QA chain and collect the retrieved context.

    Args:
        question: Question sent to ``qa_chain`` and, separately, to the retriever.

    Returns:
        An ``(actual_output, retrieved_document)`` pair: the chain's answer text and
        the page contents returned by ``retrieve_documents`` for the same question.
    """
    # ``Chain.run`` is deprecated; ``invoke`` returns a dict whose "result" key
    # holds the answer text that ``run`` returned directly.
    actual_output = qa_chain.invoke(question)["result"]
    retrieved_document = retrieve_documents(question)
    return actual_output, retrieved_document

In [None]:
## Run one question end to end and show the answer next to the retrieved chunks
actual_output, retrieved_context = get_actual_output_with_context("What is MCP")
## Use the "\n" escape so each label is followed by a real line break
print(f"###ACTUAL OUTPUT###:\n{actual_output}\n")
print(f"###RETRIEVAL CONTEXT###:\n{retrieved_context}")

#### Creating LLM Test Case with Goldens

In [None]:
from deepeval.dataset import Golden
from deepeval.test_case import LLMTestCase
from typing import List
def convert_golden_to_testCase(goldens: List[Golden]) -> List[LLMTestCase]:
    """Turn each golden into an ``LLMTestCase`` populated with a fresh RAG answer.

    For every golden, the question is run through ``get_actual_output_with_context``;
    the returned answer and retrieved context are stored next to the golden's
    input and expected output. Results keep the order of ``goldens``.
    """

    def _to_test_case(item: Golden) -> LLMTestCase:
        answer, contexts = get_actual_output_with_context(item.input)
        return LLMTestCase(
            input=item.input,
            actual_output=answer,
            expected_output=item.expected_output,
            retrieval_context=contexts,
        )

    return [_to_test_case(item) for item in goldens]
## Build the test cases from the goldens held in `cloud_dataset`.
## NOTE(review): `data` is also assigned in an earlier cell (the loaded web documents); this rebinds it.
data = convert_golden_to_testCase(cloud_dataset.goldens)
data

#### Final Evaluation

In [None]:
import deepeval.metrics
from deepeval.config import settings
## Set deepeval's per-attempt timeout override to 800 before evaluating.
## NOTE(review): presumably needed because the local Ollama judge model is slow - confirm.
settings.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE = "800"
## Score the test cases in `data` with the four metrics below, each using `local_llm` as its model
deepeval.evaluate(
    data,
    metrics=[
        deepeval.metrics.AnswerRelevancyMetric(model=local_llm),
        deepeval.metrics.FaithfulnessMetric(model=local_llm),
        deepeval.metrics.ContextualPrecisionMetric(model=local_llm),
        deepeval.metrics.ContextualRelevancyMetric(model=local_llm)
    ]
)