In [2]:
!pip install giskard openai ragas sentence_transformers --upgrade scikit-learn hdbscan  giskard[llm]

Collecting giskard
  Downloading giskard-2.15.5-py3-none-any.whl.metadata (15 kB)
Collecting ragas
  Downloading ragas-0.2.5-py3-none-any.whl.metadata (8.0 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.3.0-py3-none-any.whl.metadata (10 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting zstandard>=0.10.0 (from giskard)
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting mlflow-skinny>=2 (from giskard)
  Downloading mlflow_skinny-2.17.2-py3-none-any.whl.metadata (30 kB)
Collecting scipy<1.12.0,>=1.7.3 (from giskard)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mixpanel>=4.4.0 (from giskard)
  Downloading mixpanel-4.

In [11]:
!pip install --upgrade giskard





In [12]:
import giskard
from typing import Sequence, Optional
from giskard.llm.client import set_default_client
from giskard.llm.client.base import LLMClient, ChatMessage
from openai import OpenAI

from google.colab import userdata
xai_key = userdata.get('XAI')



# Create a custom client by extending the LLMClient class
class MyLLMClient():

    def complete(
            self,
            messages: Sequence[ChatMessage],
            temperature: float = 1,
            max_tokens: Optional[int] = None,
            caller_id: Optional[str] = None,
            seed: Optional[int] = None,
            format=None,
    ) -> ChatMessage:

        XAI_API_KEY = xai_key
        client = OpenAI(
            api_key=XAI_API_KEY,
            base_url="https://api.x.ai/v1",
        )

        prompt_message = [
                {"role": "system", "content": "You are Grok, a medical chatbot"},
            ]

        for msg in messages:
            if msg.role.lower() == "assistant":
                prompt_message.append({
                    "role": "assistant", "content": msg.content
                })
            else:
                 prompt_message.append({
                    "role": "user", "content": msg.content
                })


        completion = client.chat.completions.create(
            model="grok-beta",
            messages=prompt_message,
            temperature= 0.25
        )

        ans = completion.choices[0].message.content

        return ChatMessage(role="assistant", content=ans)


llm_client = MyLLMClient()

# Set the default client
set_default_client(llm_client)

In [11]:
!which python


/usr/local/bin/python


In [2]:
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs,
    query_instruction="Represent this sentence for searching relevant passages:"
)

embeddings.query_instruction = "Represent this sentence for searching relevant passages:"


In [None]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader


medrag_textbook_loader = HuggingFaceDatasetLoader("MedRAG/textbooks", "contents")

In [None]:
medrag_textbook_loader_data = medrag_textbook_loader.load()

In [None]:
type(medrag_textbook_loader_data)

In [None]:
!pip install --upgrade scipy

In [None]:
medrag_textbook_data =[]
for row in medrag_textbook_loader_data:
    medrag_textbook_data.append(row.page_content)

In [3]:
from giskard.llm.embeddings import BaseEmbedding
class EmbeddingsWrapper(BaseEmbedding):
    def __init__(self, embeddings):
        self.embeddings = embeddings

    def embed(self, texts):
        return self.embeddings.embed_documents(texts)

wrapped_embeddings = EmbeddingsWrapper(embeddings)

In [None]:
import random

small_medrag_textbook_data = random.sample(medrag_textbook_data, 5000)
print(len(small_medrag_textbook_data))

5000


In [None]:
import pandas as pd
medrag_textbook_loader_df = pd.DataFrame(small_medrag_textbook_data, columns=["page_content"])

In [None]:
medrag_textbook_loader_df.head()

Unnamed: 0,page_content
0,"""Cell_Biology_Alberts. Figure 13\u201346 The r..."
1,"""Physiology_Levy. 12. How does the organizatio..."
2,"""Immunology_Janeway. and cytoplasmic segments ..."
3,"""Pediatrics_Nelson. Phimosis is rarely symptom..."
4,"""Immunology_Janeway. was blocked. Normal numbe..."


In [None]:
from giskard.rag import generate_testset, KnowledgeBase

knowledge_base = KnowledgeBase.from_pandas(medrag_textbook_loader_df, columns=["id","page_content"], embedding_model=wrapped_embeddings)


In [None]:
testset = generate_testset(
    knowledge_base,
    num_questions=550,
    language='en',  # optional, we'll auto detect if not provided
    agent_description="A medical chatbot", # helps generating better questions
)

Generating questions:   0%|          | 0/550 [00:00<?, ?it/s]

In [None]:
# Save the generated testset
testset.save("my_testset.jsonl")

# You can easily load it back
from giskard.rag import QATestset

loaded_testset = QATestset.load("my_testset.jsonl")

In [None]:
test_df = loaded_testset.to_pandas()


In [None]:
test_df.head()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bafdc266-3a82-4ddf-9484-08c7a40d747b,What is the smallest detail that can be resolv...,The light microscope can resolve details 0.2 μ...,"Document 434: page_content: ""Cell_Biology_Albe...",[],"{'question_type': 'simple', 'seed_document_id'..."
d53903b3-7a45-436d-82e1-956404718cc1,What should be considered for patients with ch...,Patients who do not respond to conventional th...,"Document 4390: page_content: ""InternalMed_Harr...",[],"{'question_type': 'simple', 'seed_document_id'..."
c0c48ffe-998c-4237-952c-c73b6912746c,Can heparin treatment for deep-vein thrombosis...,"Yes, full-dose heparin can be restarted severa...","Document 2207: page_content: ""Obstentrics_Will...",[],"{'question_type': 'simple', 'seed_document_id'..."
61c2056d-fb20-4f56-9002-3c2c01f76fef,What is the function of BTLA when it and HVEM ...,When BTLA and HVEM are co-expressed on the sam...,"Document 517: page_content: ""Immunology_Janewa...",[],"{'question_type': 'simple', 'seed_document_id'..."
83e28e86-fc81-4180-bfd4-c424816d60f5,What is the principal clinical usefulness of H...,The principal clinical usefulness of HBeAg in ...,"Document 2933: page_content: ""InternalMed_Harr...",[],"{'question_type': 'simple', 'seed_document_id'..."


In [None]:
excluded_docs = set(small_medrag_textbook_data)


In [None]:
import random
remaining_docs = [doc for doc in medrag_textbook_data if doc not in excluded_docs]

small_medrag_textbook_data2 = random.sample(remaining_docs, 5000)
print(len(small_medrag_textbook_data2))

5000


In [None]:
import pandas as pd
medrag_textbook_loader_df_2 = pd.DataFrame(small_medrag_textbook_data2, columns=["page_content"])

In [None]:
from giskard.rag import generate_testset, KnowledgeBase

knowledge_base_2 = KnowledgeBase.from_pandas(medrag_textbook_loader_df_2, columns=["id","page_content"], embedding_model=wrapped_embeddings)


In [None]:
testset = generate_testset(
    knowledge_base_2,
    num_questions=550,
    language='en',  # optional, we'll auto detect if not provided
    agent_description="A medical chatbot", # helps generating better questions
)

Generating questions:   0%|          | 0/550 [00:00<?, ?it/s]

In [None]:
# Save the generated testset
testset.save("my_testset2.jsonl")

# You can easily load it back
from giskard.rag import QATestset

loaded_testset = QATestset.load("my_testset2.jsonl")

In [None]:
test_df.iloc[2]

question                Can heparin treatment for deep-vein thrombosis...
reference_answer        Yes, full-dose heparin can be restarted severa...
reference_context       Document 2207: page_content: "Obstentrics_Will...
conversation_history                                                   []
metadata                {'question_type': 'simple', 'seed_document_id'...
Name: c0c48ffe-998c-4237-952c-c73b6912746c, dtype: object

In [None]:
test_df.shape[0]

550

# Pubmed RAG eval synthetic data generator

In [13]:
from langchain_community.document_loaders import HuggingFaceDatasetLoader


medrag_textbook_loader = HuggingFaceDatasetLoader("MedRAG/pubmed", "contents")

In [4]:
from datasets import load_dataset

medrag_textbook_loader_2 = load_dataset("MedRAG/pubmed", split='train', streaming=True)  # Adjust the split if needed


Resolving data files:   0%|          | 0/1166 [00:00<?, ?it/s]

In [5]:
medrag_textbook_loader_3 = medrag_textbook_loader_2.take(1000000)

In [6]:
import ragas

In [7]:
import random

small_medrag_pubmed_data = random.sample(list(medrag_textbook_loader_3), 2000)

In [13]:
medrag_pubmed_data =[]
for row in small_medrag_pubmed_data:
    medrag_pubmed_data.append(row["contents"])

In [14]:
import pandas as pd
medrag_pubmed_loader_df = pd.DataFrame(medrag_pubmed_data, columns=["page_content"])

In [15]:
from giskard.rag import generate_testset, KnowledgeBase

knowledge_base = KnowledgeBase.from_pandas(medrag_pubmed_loader_df, columns=["id","page_content"], embedding_model=wrapped_embeddings)


In [16]:
testset = generate_testset(
    knowledge_base,
    num_questions=550,
    language='en',  # optional, we'll auto detect if not provided
    agent_description="A medical chatbot", # helps generating better questions
)

INFO:giskard.rag:Finding topics in the knowledge base.
INFO:giskard.rag:Found 70 topics in the knowledge base.


Generating questions:   0%|          | 0/550 [00:00<?, ?it/s]

In [17]:
# Save the generated testset
testset.save("my_testset3.jsonl")

# You can easily load it back
from giskard.rag import QATestset

loaded_testset = QATestset.load("my_testset3.jsonl")

In [20]:
small_medrag_pubmed_data[0]["id"]

'pubmed23n0050_2455'

In [21]:
excluded_docs = set([row["id"] for row in small_medrag_pubmed_data])



In [23]:
pubmed_list = list(medrag_textbook_loader_3)

In [13]:
def jsonl_to_list(json_list):
    my_testset = []
    for json_str in json_list:
        result = json.loads(json_str)
        my_testset.append(result)
    return my_testset

In [14]:
import json

with open("my_testset.jsonl", "r", encoding="utf-8") as file:
    my_testset_list = list(file)
    
my_testset1 = jsonl_to_list(my_testset_list) 


with open("my_testset2.jsonl", "r", encoding="utf-8") as file:
    my_testset2_list = list(file)

my_testset2 = jsonl_to_list(my_testset2_list)

with open("my_testset3.jsonl", "r", encoding="utf-8") as file:
    my_testset3_list = list(file)

my_testset3 = jsonl_to_list(my_testset3_list) 


testset_rag_eval_synthetic = []
testset_rag_eval_synthetic.extend(my_testset1)
testset_rag_eval_synthetic.extend(my_testset2)
testset_rag_eval_synthetic.extend(my_testset3)

In [16]:
print(len(testset_rag_eval_synthetic))
testset_rag_eval_synthetic[0]

1650


{'id': 'bafdc266-3a82-4ddf-9484-08c7a40d747b',
 'question': 'What is the smallest detail that can be resolved by a light microscope?',
 'reference_answer': 'The light microscope can resolve details 0.2 μm apart.',
 'reference_context': 'Document 434: page_content: "Cell_Biology_Alberts. The images in Figure 9\\u20131 illustrate a stepwise progression from a thumb to a cluster of atoms. Each successive image represents a tenfold increase in magnification. The naked eye can see features in the first two panels, the light microscope allows us to see details corresponding to about the fourth or fifth panel, and the electron microscope takes us to about the seventh or eighth panel. Figure 9\\u20132 shows the sizes of various cellular and subcellular structures and the ranges of size that different types of microscopes can visualize. Looking at CeLLs in the Light MiCrosCope Looking at CeLLs anD MoLeCULes in the eLeCtron MiCrosCope 20 mm 2 mm 0.2 mm 20 \\u00b5m 2 \\u00b5m 0.2 \\u00b5m 20 nm 2

In [17]:
with open("rag_eval_synthetic_testset.jsonl", 'w') as out:
    for ddict in testset_rag_eval_synthetic:
        jout = json.dumps(ddict) + '\n'
        out.write(jout)