In [1]:
import pandas as pd
testset = pd.read_csv("testset_rag_eval.csv")

In [8]:
!pip install  InstructorEmbedding   duckduckgo-search langchain-core   huggingface_hub  sentence-transformers langchain-text-splitters  langchain-chroma    langchain_community  google-generativeai  langchain-google-genai    faiss-cpu

[0m

In [13]:
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_chroma import Chroma
import chromadb.utils.embedding_functions as embedding_functions
from dotenv import load_dotenv
from langchain.docstore.document import Document
import os
import google.generativeai as genai
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
import requests
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from typing import Any, List, Optional

from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from langchain_core.outputs import GenerationChunk
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain.load import dumps, loads

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch
import faiss
import numpy as np
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor



ds = load_dataset("HPAI-BSC/medqa-cot")

medqa_cot_embeds = np.load("medqa_cot.npy")

index = faiss.IndexFlatIP(768)
index.add(medqa_cot_embeds)

query_model = AutoModel.from_pretrained("ncbi/MedCPT-Query-Encoder")
query_tokenizer = AutoTokenizer.from_pretrained("ncbi/MedCPT-Query-Encoder")

load_dotenv()


os.environ["GEMINI_API_KEY"] = ""


model_name = "hkunlp/instructor-xl"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

tokenizer = AutoTokenizer.from_pretrained("ncbi/MedCPT-Cross-Encoder")
model = AutoModelForSequenceClassification.from_pretrained("ncbi/MedCPT-Cross-Encoder")


translate_model = genai.GenerativeModel("gemini-1.0-pro")  # buat translate


def translate_text(text, language):
    newText = text
    if language == "Indonesian":
        newText = text.replace("\n", " \n ")
    newText = (
        newText
        + f"; translate to {language}. please just translate the text and don't answer the questions!"
    )
    return translate_model.generate_content(newText).candidates[0].content.parts[0].text


summarizer_model = genai.GenerativeModel("gemini-1.5-flash-8b")


def user_summarizer(text):
    return summarizer_model.generate_content(text).candidates[0].content.parts[0].text


retriever_model = Chroma(
    collection_name="welllahh_rag_collection_chromadb",
    persist_directory="./chroma_langchain_db2",
    embedding_function=instructor_embeddings,
)

retriever = retriever_model.as_retriever(search_kwargs={"k": 10})


template = """Write multiple different very short search queries (each queries by a separated by "," & maximum different 4 very short search queries) that will help answer complex user questions, make sure in your answer you only give multiple different very short search queries (each queries by a separated by "," & maximum different 4 very short search queries) and don't include any other text! . Original question: {question}"""
search_query_prompt = ChatPromptTemplate.from_template(template)


class GeminiLLM(LLM):
    """custom model pakai gemini"""

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
        llm = genai.GenerativeModel(
            model_name="gemini-1.5-flash"  # sebelumnya 0
        )  # buat jawab pertanyaan medis

        ans = (
            llm.generate_content(prompt, generation_config={"temperature": 0.12})
            .candidates[0]
            .content.parts[0]
            .text
        )
        return ans

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model. Used for logging purposes only."""
        return "gemini-welllahh-zerotemp-lrfv-3536"


llm = GeminiLLM()

generate_query = search_query_prompt | llm | StrOutputParser()

cant_access = {
    "npin.cdc.gov",
    "www.ncbi.nlm.nih.gov",
}  # gak bisa diakses & gak muncul tag <p> nya

def scrape_websearch(queryProc):
    websearch = []
    try:
        results = DDGS().text(queryProc, max_results=8)
    except Exception:
        return websearch
    for res in results:
        domain = res["href"].split("/")[2]
        if "webmd" in res["href"] or ".pdf" in res["href"] or domain in cant_access:
            continue
        if len(websearch) == 3:
            break
        if ".org" in res["href"] or ".gov" in res["href"] or "who" in res["href"]:

            link = res["href"]
            try:
                page = requests.get(link).text
            except requests.exceptions.RequestException as errh:
                print(f"error: {errh}")
                continue
            doc = BeautifulSoup(page, features="html.parser")
            text = ""
            hs = doc.find_all("h2")
            h3s = doc.find_all("h3")
            ps = doc.find_all("p")
            for h3 in h3s:
                hs.append(h3)
            for pp in ps:
                hs.append(pp)

            hs_parents = set()
            for h2 in hs:
                h2_parent = h2.parent
                if h2_parent in hs_parents:
                    continue
                hs_parents.add(h2_parent)
                h2_adjacent = h2_parent.children
                for adjacent in h2_adjacent:
                    if adjacent.name == "p" and adjacent.text != "\n":
                        text += adjacent.text + "\n"
                    if (
                        adjacent.name == "h2"
                        or adjacent.name == "h3"
                        or adjacent.name == "h4"
                    ):
                        text += adjacent.text + ": \n"
                    if adjacent.name == "ul" or adjacent.name == "ol":
                        text += ": "
                        for li in adjacent.find_all("li"):
                            text += li.text + ","
                        text += "\n"
            if "Why have I been blocked" in text or text == "" or text == ": \n":
                continue

            websearch.append(text)
    return websearch


def add_websearch_results(query):
    queries = []
    if "," in query:
        queries.extend(query.split(","))
    else:
        queries = [query]
    websearch_all = []

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(scrape_websearch, queries)
    for new_contexts in results:
        websearch_all.extend(new_contexts)
               
    return websearch_all


class DuckDuckGoRetriever(BaseRetriever):
    """ """

    """List of documents to retrieve from."""
    k: int
    """Number of top results to return"""

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Sync implementations for retriever."""
        matching_documents = []
        websearch = add_websearch_results(query)

        for document in websearch:
            if len(matching_documents) > self.k:
                return matching_documents

            matching_documents.append(document)
        return matching_documents


medqa_cot_data = ds["train"]




class MedQACoTRetriever(BaseRetriever):
    """List of documents to retrieve from."""

    k: int

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Sync implementations for retriever."""
        matching_documents = []
        # websearch = add_websearch_results(query)
        inds = []

        queries = [query]
        if "," in query:
            queries.extend(query.split(","))
        else:
            queries = [query]

        relevant_cot = []
        
        def retrieve_cot(search_query):
            with torch.no_grad():
                # tokenize the queries
                encoded = query_tokenizer(
                    [search_query],
                    truncation=True,
                    padding=True,
                    return_tensors="pt",
                    max_length=512,
                )
                embeds = query_model(**encoded).last_hidden_state[:, 0, :]
                scores, inds = index.search(embeds, k=self.k)

            curr_relevant_cot = []
            for score, ind in zip(scores[0], inds[0]):
                curr_cot = medqa_cot_data[int(ind)]
                curr_question = curr_cot["question"]
                curr_answer = curr_cot["response"]
                curr_relevant_cot.append(
                    f"\nQuestion: {curr_question}\nAnswer: Let's think step by step. {curr_answer}"
                )
            return curr_relevant_cot
        
        
        with ThreadPoolExecutor(max_workers=3) as executor:
            results = executor.map(retrieve_cot, queries)
        for new_contexts in results:
            relevant_cot.extend(new_contexts)

        return relevant_cot


websearch_retriever = DuckDuckGoRetriever(k=2)
medqa_cot_retriever = MedQACoTRetriever(k=1)



def rerank_docs_medcpt(queries, docs):
    queries = [queries]
    queries.extend(queries[0].split(","))
    relevant = set()

    def process_query(query):
        pairs = [[query, article.page_content] for article in docs]
        with torch.no_grad():
            encoded = tokenizer(
                pairs,
                truncation=True,
                padding=True,
                return_tensors="pt",
                max_length=512,
            )

            logits = model(**encoded).logits.squeeze(dim=1)
            values, indices = torch.sort(logits, descending=True)
            curr_relevant = [docs[i].page_content[1:-1].encode().decode('unicode_escape') for i in indices[:5]]
        return curr_relevant
    
    with ThreadPoolExecutor(max_workers=3) as executor:
        results = executor.map(process_query, queries)

    for curr_relevant in results:
        relevant.update(curr_relevant)
                
    relevant = list(relevant)
    return relevant


retrieval_chain = generate_query | {
   
    "query": StrOutputParser(),
}


def format_docs(docs):
    query = docs["query"]
    chroma_docs = [doc.metadata["content"] + doc.page_content for doc in docs["chroma"]]
    relevant_cot = docs["medqa_cot"]

    docs = list(set(chroma_docs + docs["websearch"] ))
    # rerank passage2 dari document chromadb & hasil scraping webpage hasil duckduckgosearch
    relevant_docs = rerank_docs_medcpt(query, docs)
    context = " \n\n".join(doc for doc in relevant_docs)

    # tambahin few-shot chain-of-thought prompting
    relevant_cot = list(set(relevant_cot))[:2]
    for cot in relevant_cot:
        context += " \n" + cot
    return context, relevant_docs


template = """Answer the question based only on the following context:
{context}
\n

please do not mention the same answer more than once and Don't say 'the given text does not answer the user's question or is not relevant to the user's question' in your answer. just answer the question don't add any other irrelevant text
\n
Question: {question}
\n
Answer: Let's think step by step.
"""

prompt = ChatPromptTemplate.from_template(template)

answer_chain = (
     prompt
    | llm
    | {"llm_output": StrOutputParser()}
)


def retrieve_and_append(query):
    new_knowledge_base_contexts = retriever.invoke(query)
    return new_knowledge_base_contexts
    
def answer(question):
    docs = {}
    docs["chroma"] = []
    search_query = [question]
   
    with ThreadPoolExecutor(max_workers=1) as executor:
        results = executor.map(retrieve_and_append, search_query)
    for new_contexts in results:
        docs["chroma"].extend(new_contexts)
        
    relevant_docs = rerank_docs_medcpt(question, docs["chroma"])
    
    return relevant_docs




def answer_pipeline_for_eval(question):
    """
    buat evaluasi llm.
    Pertanyaan dalam bahasa inggris.
    pakai pertanyaan dari PubmedQA, MedQA -> pakai  (exact match)
    pakai: https://huggingface.co/datasets/GBaker/MedQA-USMLE-4-options/viewer/default/test?row=23
    
    """
    question = question.replace("\n", "  ")

    retrieved_contexts  = answer(question)
    return  retrieved_contexts


load INSTRUCTOR_Transformer
max_seq_length  512


In [14]:
curr_question_idx_tracker = 0
max_retries = 30

In [15]:
import ast

ast.literal_eval(testset.iloc[0]["reference_contexts"].encode().decode('unicode_escape'))

['Cell_Biology_Alberts. The images in Figure 9–1 illustrate a stepwise progression from a thumb to a cluster of atoms. Each successive image represents a tenfold increase in magnification. The naked eye can see features in the first two panels, the light microscope allows us to see details corresponding to about the fourth or fifth panel, and the electron microscope takes us to about the seventh or eighth panel. Figure 9–2 shows the sizes of various cellular and subcellular structures and the ranges of size that different types of microscopes can visualize. Looking at CeLLs in the Light MiCrosCope Looking at CeLLs anD MoLeCULes in the eLeCtron MiCrosCope 20 mm 2 mm 0.2 mm 20 µm 2 µm 0.2 µm 20 nm 2 nm 0.2 nm the Light Microscope Can resolve Details 0.2 μm apart']

In [16]:
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference
import ast


context_precision = NonLLMContextPrecisionWithReference()


async def  evaluate_rag(curr_question_idx_tracker):
     for curr_question_idx in range(curr_question_idx_tracker, testset.shape[0]):
        question = testset.loc[curr_question_idx, "question"]
        reference_contexts = ast.literal_eval(testset.loc[curr_question_idx, "reference_contexts"].encode().decode('unicode_escape'))
        retry_count = 0 
        retrieved_contexts = answer_pipeline_for_eval(question)
        sample = SingleTurnSample(
            retrieved_contexts=retrieved_contexts, 
            reference_contexts=reference_contexts,
        )
        context_precision_score = await context_precision.single_turn_ascore(sample)
        print(f"question: {question}\nretrieved contexts: {retrieved_contexts}\nreference_contexts:{reference_contexts}")
        print(f"context precision: {context_precision_score}")


In [17]:
await evaluate_rag(0)

question: What is the smallest detail that can be resolved by a light microscope?
retrieved contexts: ['Cell_Biology_Alberts. Looking at CeLLs anD MoLeCULes in the eLeCtron MiCrosCope Light microscopy is limited in the fineness of detail that it can reveal. Microscopes using other types of radiation—in particular, electron microscopes—can resolve much smaller structures than is possible with visible light. This higher resolution comes at a cost: specimen preparation for electron microscopy is complex and it is harder to be sure that what we see in the image corresponds precisely to the original living structure. It is possible, however, to use very rapid freezing to preserve structures faithfully for electron microscopy. Digital image analysis can be used to reconstruct threedimensional objects by combining information either from many individual particles or from multiple tilted views of a single object. Together, these approaches extend the resolution and scope of electron microscopy

SyntaxError: unterminated string literal (detected at line 1) (<unknown>, line 1)

In [None]:
genai.configure(api_key="PUnyaMU")

In [None]:
ss = '"Cell_Biology_Alberts. The variations on light microscopy we have described so far are all constrained by the classic diffraction limit to resolution described earlier; that is, to about 200 nm (see Figure 9\\u20136). Yet many cellular structures\\u2014from nuclear pores to nucleosomes and clathrincoated pits\\u2014are much smaller than this and so are unresolvable by conventional light microscopy. Several approaches, however, are now available that bypass the limit imposed by the diffraction of light, and successfully allow objects as small as 20 nm to be imaged and clearly resolved: a remarkable, orderofmagnitude improvement."'

In [None]:
ss[1:-1].encode().decode('unicode_escape')