In [16]:
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [17]:
from langchain_ollama import ChatOllama
qwen_llm = ChatOllama(base_url="http://localhost:11434",
                      model="qwen3:8b",
                      temperature=0,
                      max_tokens = 250)
llama_llm = ChatOllama(base_url="http://localhost:11434",
                      model="llama3.2:latest",
                      temperature=0,
                      max_tokens = 250)

#### Define Vector Store from persistent Storage and Retriever  

In [18]:
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma

ollama_embedding = OllamaEmbeddings(model="llama3.2:latest")
vector_store = Chroma(persist_directory="./../RagAgent/chroma_langchain_db",
                      embedding_function=ollama_embedding)
retriever = vector_store.as_retriever(
    search_type = "similarity",
    search_kwargs = {"k": 3}
)
retriever.invoke("What is Bias Testing")

[Document(id='b208cb73-c426-4bec-a2bd-65a5daea3565', metadata={'creationdate': '2025-01-07T01:36:50+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'title': '', 'source': './SampleData/LLMForgetting.pdf', 'creator': 'LaTeX with hyperref', 'page_label': '10', 'page': 9, 'keywords': '', 'subject': '', 'producer': 'pdfTeX-1.40.25', 'moddate': '2025-01-07T01:36:50+00:00', 'total_pages': 15, 'trapped': '/False', 'author': '', 'start_index': 0}, page_content='Under review\nFigure 6: The performance of general knowledge of the BLOOMZ-7.1b and LLAMA-7b\nmodel trained on the instruction data and the mixed data. The dashed lines refers to the\nperformance of BLOOMZ-7.1b and LLAMA-7B and the solid ones refer to those of mixed-\ninstruction trained models.\nincreases to 3b, BLOOMZ-3b suffers less forgetting compared to mT0-3.7B. For example, the\nFG value of BLOOMZ-3b is 11.09 which is 5.64 lower than that of mT0-3.7b. These resu

#### Define Custom Question answer Chain

In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
system_prompt = (
    "You are an assistance for question-answering task"
    "Use the following pieces of retrieved context to answer the question."
    "If you don't know the answer, just say that you don't know, don't try to make up an answer."
    "Use three sentences maximum to answer the question and keep the answer concise."
    "The answer should be in markdown format."
    "\n\n{context}\n\n"
)
def format_doc(docs):
    return "\n\n".join([doc.page_content for doc in docs])

prompt_template = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{question}")
])

rag_chain = (
    {
        "context": retriever | format_doc,
        "question": RunnablePassthrough()
    } 
    | prompt_template 
    | qwen_llm 
    | StrOutputParser()
)
rag_chain.invoke("What is Bias in LLM?")

"Bias in LLMs refers to discriminatory tendencies in responses based on group attributes like race, gender, or ability. Tools like BiasAsker identify these biases by generating questions and analyzing system outputs. It highlights systemic disparities in conversational systems' behavior across different groups."

#### Define a Tool for Bias Detection

In [25]:
from langchain.tools import tool
from langchain_core.prompts import ChatPromptTemplate
@tool
def bias_detection(query: str) -> str:
    """
    Detect Bias in the given statement and summarize the findings for me.
    
    Args:
        query: The search query related to bias in LLM
    
    Returns:
        A string containing the summary for the bias-related finding from the query
    """
    retrieved_docs = retriever.invoke(query)
    context_str = ("\n".join([doc.page_content for doc in retrieved_docs]))
    bias_prompt = ChatPromptTemplate.from_template("""
    You are an expert bias analyst. Using the context below, identify biases.
    
    Context: {context}
    
    Statement to analyze: {query}
    
    Summarize findings in exactly three bullet points.
    """)
    chain = bias_prompt | llama_llm
    response = chain.invoke({"context": context_str, "query": query})
    return response.content if hasattr(response, 'content') else str(response)

#### Create a AI Agent with Bias Detection Tool

In [None]:
from langchain.agents import create_agent
tools = [bias_detection]
bias_detection_agent = create_agent(
    model=qwen_llm,
    tools=tools,
    debug=True
)

#### Invoke agent

In [21]:
response = bias_detection_agent.invoke({'messages': "He is amazing asian man"})
print(response["messages"][-1].content)

[1m[values][0m {'messages': [HumanMessage(content='He is amazing asian man', additional_kwargs={}, response_metadata={}, id='67fc2a7d-8246-4173-b523-9c43c5b2c463')]}
[1m[updates][0m {'model': {'messages': [AIMessage(content='', additional_kwargs={}, response_metadata={'model': 'qwen3:8b', 'created_at': '2025-12-19T17:16:06.105278Z', 'done': True, 'done_reason': 'stop', 'total_duration': 16090263209, 'load_duration': 104779000, 'prompt_eval_count': 184, 'prompt_eval_duration': 2423819916, 'eval_count': 223, 'eval_duration': 13484196756, 'logprobs': None, 'model_name': 'qwen3:8b', 'model_provider': 'ollama'}, id='lc_run--019b379c-517b-7ee1-b029-ffe5ba9d5377-0', tool_calls=[{'name': 'bias_detection', 'args': {'query': 'He is amazing asian man'}, 'id': '43f8a62e-1e21-4df6-8ed7-7b3f918b048f', 'type': 'tool_call'}], usage_metadata={'input_tokens': 184, 'output_tokens': 223, 'total_tokens': 407})]}}
[1m[values][0m {'messages': [HumanMessage(content='He is amazing asian man', additional_

In [22]:
for blocks in response["messages"]:
    for block in blocks.content_blocks:
        if block['type'] == 'tool_call':
            print(block["name"])
            print(block["args"])
        if block['type'] == 'text':
            print(block["text"])

He is amazing asian man
bias_detection
{'query': 'He is amazing asian man'}
content='Here are three bullet points summarizing the bias-related points:\n\n• The author introduces two evaluation frameworks to measure social bias (BiasAsker) and cultural bias (XCulturalBench) in LLMs, highlighting the need for fairness assessment in conversational AI systems.\n\n• Experiments show that BiasAsker can identify bias altitudes on 841 groups from 5,021 biased properties perspective, demonstrating its effectiveness in measuring social bias in commercial systems and models.\n\n• The author identifies a cultural dominance issue within LLMs due to the predominant use of English, which may lead to biased responses towards non-English speakers or cultures, as seen in the example where ChatGPT responds with a condescending attitude towards poor individuals.' additional_kwargs={} response_metadata={'model': 'llama3.2:latest', 'created_at': '2025-12-19T17:16:13.839031Z', 'done': True, 'done_reason': 's

#### Create Evaluation Data Set For Ragas

In [None]:
import pandas as pd
test_df = pd.read_csv("./TestDataSet/dataset.csv")
dataset = []
def extract_tool_data(agent_output):
    """
    Extracts tool calls from the agent's message history.
    """
    # Look for any message that contains tool_calls
    actual_tool_calls = []
    for msg in agent_output.get("messages", []):
        if hasattr(msg, "tool_calls") and msg.tool_calls:
            for tc in msg.tool_calls:
                actual_tool_calls.append({
                    "name": tc["name"],
                    "args": tc["args"]
                })
    return actual_tool_calls
for query, reference in zip(test_df["query"], test_df["answer"]):
    retrieved_result = retriever.invoke(query)
    relevant_docs = [doc.page_content for doc in retrieved_result]
    agent_output = bias_detection_agent.invoke({"messages": [("user", query)]})
    response = agent_output["messages"][-1].content
    dataset.append({
        "user_input": query,
        "retrieved_context": relevant_docs,
        "response": response,
        "reference": reference,
        "tool_calls": extract_tool_data(agent_output),
        "reference_tool_calls": [
        {
            "name": "bias_detection", 
            "args": {"query": query}
        }
    ]
    })
dataset

In [55]:
dataset

[{'user_input': 'He is amazing asian man, is there any bias ?',
  'retrieved_context': ['without considering the differences across languages. Generally, for\nnative speakers other than English, we expect LLMs to express\n“opinions” complying with the corresponding culture when asked for\nassistance. However, given the predominant use of English data\nin training SOTA LLMs (e.g., ChatGPT), LLMs may inadvertently\namplify dominant cultural narratives and further entrench existing\ncultural biases. As shown in Figure 8.1, ChatGPT is dominated\nby English culture: inappropriate English-culture answers dominate\nthe model output even when asked in non-English languages. Such\ncultural dominance can lead to several negative effects, such as the\nloss of cultural diversity, promotion of stereotypes, increasing social\nand psychological inequality, and even violent conflict and economic\nimpact [269, 270].',
   'Gender 0.35 0.07 0.37 0.57 13.60 3.92 0.54 4.79 1.90 13.63\nRace 0.42 0.07 3.39 2

In [37]:
pd.DataFrame(dataset)

Unnamed: 0,user_input,retrieved_context,response,reference,tool_calls,reference_tool_calls
0,"He is amazing asian man, is there any bias ?",[without considering the differences across la...,"The statement ""He is amazing asian man"" appear...",There does not appear to be any explicit bias ...,"[{'name': 'bias_detection', 'args': {'query': ...","[{'name': 'bias_detection', 'args': {'query': ..."
1,"Mens are always strong, is there any bias ?",[without considering the differences across la...,The provided response from the bias detection ...,The statement 'Mens are always strong' exhibit...,"[{'name': 'bias_detection', 'args': {'query': ...","[{'name': 'bias_detection', 'args': {'query': ..."
2,"The real culprit in the war was hamas, not isr...",[Gender 0.35 0.07 0.37 0.57 13.60 3.92 0.54 4....,The tool's response provides general insights ...,The query provided does not directly relate to...,"[{'name': 'bias_detection', 'args': {'query': ...","[{'name': 'bias_detection', 'args': {'query': ..."
3,Boys school got better mark than girls school ...,[without considering the differences across la...,The tool's response addresses biases in AI sys...,The query suggests potential bias but does not...,"[{'name': 'bias_detection', 'args': {'query': ...","[{'name': 'bias_detection', 'args': {'query': ..."
4,Low class people are the reason for all the cr...,"[Conversely, when a clause assumes the form of...",The provided analysis does not directly addres...,The statement 'Low class people are the reason...,"[{'name': 'bias_detection', 'args': {'query': ...","[{'name': 'bias_detection', 'args': {'query': ..."
5,"As he is asian, his accent is bad in english, ...",[without considering the differences across la...,"The statement ""As he is Asian, his accent is b...","The statement 'As he is asian, his accent is b...","[{'name': 'bias_detection', 'args': {'query': ...","[{'name': 'bias_detection', 'args': {'query': ..."


#### Create Multi Turn Sample Data 

In [49]:
import uuid
from ragas.dataset_schema import MultiTurnSample
from ragas import EvaluationDataset

samples = []

for entry in dataset:
    # 1. Manually format tool calls with required ID
    tool_id = str(uuid.uuid4())
    formatted_tool_calls = [{
        "name": entry["tool_calls"][0]["name"],
        "args": entry["tool_calls"][0]["args"],
        "id": tool_id,
        "type": "tool_call"
    }]

    # 2. Construct the conversation history as DICTIONARIES
    # This satisfies the Pydantic discriminator for 'model_type'
    user_input_history = [
        {"role": "user", "content": entry["user_input"]},
        {"role": "assistant", "content": "", "tool_calls": formatted_tool_calls},
        {"role": "tool", "content": str(entry["retrieved_context"]), "tool_call_id": tool_id}
    ]

    # 3. Final response dictionary
    response_dict = {"role": "assistant", "content": entry["response"]}

    # 4. Construct Sample
    sample = MultiTurnSample(
        user_input=user_input_history,
        response=response_dict,
        reference=entry["reference"],
        reference_tool_calls=entry["reference_tool_calls"]
    )
    samples.append(sample)

evaluation_dataset = EvaluationDataset(samples=samples)
samples

[MultiTurnSample(user_input=[HumanMessage(content='He is amazing asian man, is there any bias ?', metadata=None, type='human'), AIMessage(content='', metadata=None, type='ai', tool_calls=[ToolCall(name='bias_detection', args={'query': 'He is amazing asian man, is there any bias ?'})]), HumanMessage(content='[\'without considering the differences across languages. Generally, for\\nnative speakers other than English, we expect LLMs to express\\n“opinions” complying with the corresponding culture when asked for\\nassistance. However, given the predominant use of English data\\nin training SOTA LLMs (e.g., ChatGPT), LLMs may inadvertently\\namplify dominant cultural narratives and further entrench existing\\ncultural biases. As shown in Figure 8.1, ChatGPT is dominated\\nby English culture: inappropriate English-culture answers dominate\\nthe model output even when asked in non-English languages. Such\\ncultural dominance can lead to several negative effects, such as the\\nloss of cultural

#### Evaluation using RAGAs

In [51]:
from ragas import evaluate
from ragas.metrics import FactualCorrectness, ToolCallAccuracy
from ragas.run_config import RunConfig
from langchain_ollama import ChatOllama


cloud_llm = ChatOllama(base_url="https://ollama.com",
                        model="gpt-oss:120b-cloud",
                        temperature=0,
                        max_tokens=100)

eval_results = evaluate(
    evaluation_dataset,
    metrics=[ 
             ToolCallAccuracy()],
    llm=cloud_llm)

eval_results

Evaluating: 100%|██████████| 6/6 [00:00<00:00, 1612.06it/s]


{'tool_call_accuracy': 0.3333}

In [56]:
evaluation_dataset_factual_correctness = EvaluationDataset.from_list(dataset)
eval_results = evaluate(
    evaluation_dataset_factual_correctness,
    metrics=[FactualCorrectness()],
    llm=cloud_llm)

eval_results

Evaluating: 100%|██████████| 6/6 [00:48<00:00,  8.15s/it]


{'factual_correctness(mode=f1)': 0.3900}

In [52]:
eval_results.to_pandas()

Unnamed: 0,user_input,reference,reference_tool_calls,tool_call_accuracy
0,"[{'content': 'He is amazing asian man, is ther...",There does not appear to be any explicit bias ...,"[{'name': 'bias_detection', 'args': {'query': ...",1.0
1,"[{'content': 'Mens are always strong, is there...",The statement 'Mens are always strong' exhibit...,"[{'name': 'bias_detection', 'args': {'query': ...",0.0
2,[{'content': 'The real culprit in the war was ...,The query provided does not directly relate to...,"[{'name': 'bias_detection', 'args': {'query': ...",0.0
3,[{'content': 'Boys school got better mark than...,The query suggests potential bias but does not...,"[{'name': 'bias_detection', 'args': {'query': ...",1.0
4,[{'content': 'Low class people are the reason ...,The statement 'Low class people are the reason...,"[{'name': 'bias_detection', 'args': {'query': ...",0.0
5,"[{'content': 'As he is asian, his accent is ba...","The statement 'As he is asian, his accent is b...","[{'name': 'bias_detection', 'args': {'query': ...",0.0
