In [None]:
pip install wheel deepeval pyarrow pandas fastparquet langchain-text-splitters huggingface_hub ragas==0.0.11 python-dotenv evaluate rouge_score langchain-openai transformers langchain langchainhub langchain-huggingface faiss-cpu langchain-community sentence_transformers

In [3]:
import pandas as pd
import evaluate
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
import getpass
import torch

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = "api_key"

In [62]:
from ragas.langchain.evalchain import RagasEvaluatorChain
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall
)

In [63]:
llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=OPENAI_API_KEY)

In [64]:
ucsc_qa_df = pd.read_csv("qa.csv", index_col=0)
ucsc_qa_df

Unnamed: 0_level_0,questions,answers,source_name,source
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Who can be a member of the Boating Club?,Everyone is welcome to join! If you do not hav...,UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html
1,What classes does the boating center offer to ...,We offer all levels of sailing (dinghy and kee...,UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html
2,When is the boating club open?,"The club is open year-round, Saturday and Sund...",UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html
3,What boats are available to members?,Sailing vessels for weekend use currently incl...,UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html
4,How much does a boating membership cost?,UCSC Student Membership: $45\nNon-Student Quar...,UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html
...,...,...,...,...
107,What if I don't fulfill the general education ...,If you don't satisfy general education require...,Undergraduate Admissions FAQ,https://admissions.ucsc.edu/resources-support/...
108,What is UC TAP?,UC Transfer Admission Planner (UC TAP) is an o...,Undergraduate Admissions FAQ,https://admissions.ucsc.edu/resources-support/...
109,When are acceptance notices sent out?,"For fall quarter acceptance, notices are sent ...",Undergraduate Admissions FAQ,https://admissions.ucsc.edu/resources-support/...
110,What are the Cross-Campus and Simultaneous Enr...,Undergraduate students enrolled at UCSC may en...,Undergraduate Admissions FAQ,https://admissions.ucsc.edu/resources-support/...


In [65]:
import pandas as pd
import evaluate
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import DataFrameLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_openai import ChatOpenAI
import os
import getpass
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain.llms import HuggingFaceHub
from langchain_openai import OpenAIEmbeddings

In [66]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

ucsc_passages_df = pd.read_csv("passage.csv", index_col=0)
ucsc_passge_data_loader = DataFrameLoader(ucsc_passages_df, page_content_column="passage")
ucsc_passage_data = ucsc_passge_data_loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
docs = text_splitter.split_documents(ucsc_passage_data)
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
db = FAISS.from_documents(docs, embeddings)

In [67]:
# create evaluation chains
faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)
answer_rel_chain = RagasEvaluatorChain(metric=answer_relevancy)
context_rel_chain = RagasEvaluatorChain(metric=context_relevancy)
context_recall_chain = RagasEvaluatorChain(metric=context_recall)



# RAGAS

In [68]:
src_que_list = ucsc_qa_df['questions'].tolist()
src_ans_list = ucsc_qa_df['answers'].tolist()

In [69]:
examples = [
    {"query": q, "ground_truths": [src_ans_list[i]]}
    for i, q in enumerate(src_que_list)
]

In [70]:
print(examples[:1])

[{'query': 'Who can be a member of the Boating Club?', 'ground_truths': ["Everyone is welcome to join! If you do not have any previous sailing or rowing experience, you'll want to take a class with us before becoming a club member."]}]


In [71]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)


template = """
Given the following context {context}, answer the question like an advisor would to incoming freshman students at the University of California Santa Cruz. 
Try to be concise with the answer and give as much relevant information as possible. 

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
output_parser = StrOutputParser()
retriever = db.as_retriever()

def format_docs(docs):
    x = "\n\n".join(doc.page_content for doc in docs)
    print(x)
    return x

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(),  
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [72]:
#generated response
result = qa_chain({"query": src_que_list[7]})
result["result"]

'Yes, there are additional fees associated with taking out boats after joining the boating club. These fees include guest fees for non-members who will be going out on a boat. The guest fee for UCSC students is $5, and for non-students, it is $10. Guests are limited to one visit per month. Additionally, make sure to pay your membership dues and boat fees online ahead of time, as the boating center will no longer be accepting cash payments on-site.'

In [None]:
import os

from ragas.langchain.evalchain import RagasEvaluatorChain
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall
)

# create evaluation chains
faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)
answer_rel_chain = RagasEvaluatorChain(metric=answer_relevancy)
context_rel_chain = RagasEvaluatorChain(metric=context_relevancy)
context_recall_chain = RagasEvaluatorChain(metric=context_recall)



In [75]:
# Recheck the result that we are going to validate.
result.keys()

dict_keys(['query', 'result', 'source_documents'])

In [None]:
# an example
eval_result = faithfulness_chain(result)
eval_result["faithfulness_score"]
#High faithfulness_score means that there are exact consistency between the source documents and the answer.

In [108]:
# Assuming we know which question is currently being evaluated (index)
index = 7  
# Extended result dictionary
result_extended = result.copy()
result_extended['ground_truths'] = [src_ans_list[index]]

In [109]:
result_extended.keys()

dict_keys(['query', 'result', 'source_documents', 'ground_truths'])

In [110]:
result_extended

{'query': 'Are there additional fees to take out the boats after I join the boating club?',
 'result': 'Yes, there are additional fees associated with taking out boats after joining the boating club. These fees include guest fees for non-members who will be going out on a boat. The guest fee for UCSC students is $5, and for non-students, it is $10. Guests are limited to one visit per month. Additionally, make sure to pay your membership dues and boat fees online ahead of time, as the boating center will no longer be accepting cash payments on-site.',
 'source_documents': [Document(metadata={'link': 'https://recreation.ucsc.edu/boating/community-club.html', ' comments': nan}, page_content='Cost\nClub Membership Fees:\n\nUCSC Student Membership: $45\n\nNon-Student Quarterly Membership:  $150\n\nNon-Student Annual Membership:  $500\n\nFamily Discount: First member full price, additional members half price\n\nThere are a few minor fees associated with taking boats out. Guest fees apply to 

In [None]:
eval_result1 = context_recall_chain(result_extended)
print(eval_result1["context_recall_score"])

In [None]:
# run the queries as a batch for efficiency
predictions = qa_chain.batch(examples)

# evaluate
print("evaluating...")
r_faithfulness = faithfulness_chain.evaluate(examples, predictions)
r_faithfulness

In [None]:
# abstract faithfulness_score value
scores_faithfulness = [item['faithfulness_score'] for item in r_faithfulness]

average_score_faithfulness = sum(scores_faithfulness) / len(scores_faithfulness)

print(f"The average faithfulness score is: {average_score_faithfulness}")

The average faithfulness score is: 0.7609461966604824


In [None]:
predictions[:1]

[{'query': 'Who can be a member of the Boating Club?',
  'ground_truth': ["Everyone is welcome to join! If you do not have any previous sailing or rowing experience, you'll want to take a class with us before becoming a club member."],
  'result': 'Community members, UC Santa Cruz students, faculty, and staff can all be members of the Boating Club.',
  'source_documents': [Document(page_content='Our club exists to give community members, UC Santa Cruz students, faculty, and staff access to equipment that makes it possible to enjoy our beautiful Monterey Bay. The club provides members a unique opportunity to use any number of sailing and rowing vessels under the weekend supervision of the Boating Center dockmaster.', metadata={'link': 'https://recreation.ucsc.edu/boating/community-club.html', ' comments': nan}),
   Document(page_content='Our club exists to give community members, UC Santa Cruz students, faculty, and staff access to equipment that makes it possible to enjoy our beautiful

In [None]:
# evaluate context recall
print("evaluating...")
r_context_recall = context_recall_chain.evaluate(examples, predictions)
r_context_recall

evaluating...


100%|██████████| 8/8 [04:03<00:00, 30.38s/it]


[{'context_recall_score': 0.5},
 {'context_recall_score': 0.3333333333333333},
 {'context_recall_score': 1.0},
 {'context_recall_score': 1.0},
 {'context_recall_score': 0.0},
 {'context_recall_score': 1.0},
 {'context_recall_score': 1.0},
 {'context_recall_score': 0.38461538461538464},
 {'context_recall_score': 0.0},
 {'context_recall_score': 0.5},
 {'context_recall_score': 1.0},
 {'context_recall_score': 1.0},
 {'context_recall_score': 0.5},
 {'context_recall_score': 1.0},
 {'context_recall_score': 0.6666666666666666},
 {'context_recall_score': 0.5},
 {'context_recall_score': 1.0},
 {'context_recall_score': 0.0},
 {'context_recall_score': 0.0},
 {'context_recall_score': 0.0},
 {'context_recall_score': 0.0},
 {'context_recall_score': 1.0},
 {'context_recall_score': 0.0},
 {'context_recall_score': 1.0},
 {'context_recall_score': 0.0},
 {'context_recall_score': 1.0},
 {'context_recall_score': 0.25},
 {'context_recall_score': 0.3333333333333333},
 {'context_recall_score': 0.5},
 {'context

In [None]:
# abstract context_recall_score value
scores_context_recall = [item['context_recall_score'] for item in r_context_recall]

average_score_context_recall = sum(scores_context_recall) / len(scores_context_recall)

print(f"The average context_recall score is: {average_score_context_recall}")

The average context_recall score is: 0.3484755200156986


In [None]:
print("evaluating...")
r_context_rel = context_rel_chain.evaluate(examples, predictions)
r_context_rel

evaluating...


100%|██████████| 8/8 [01:59<00:00, 14.90s/it]


[{'context_ relevancy_score': 0.21573052141401503},
 {'context_ relevancy_score': 0.17635535110126843},
 {'context_ relevancy_score': 0.08829251744530418},
 {'context_ relevancy_score': 0.0700735178860751},
 {'context_ relevancy_score': 0.634806474049886},
 {'context_ relevancy_score': 0.16167906920115152},
 {'context_ relevancy_score': 0.21573052141401503},
 {'context_ relevancy_score': 1.111678679784139},
 {'context_ relevancy_score': 0.08065977195898691},
 {'context_ relevancy_score': 0.30834452311197913},
 {'context_ relevancy_score': 0.10743985573450723},
 {'context_ relevancy_score': 0.20254628111918765},
 {'context_ relevancy_score': 0.18505053293137322},
 {'context_ relevancy_score': 0.15312984585762024},
 {'context_ relevancy_score': 0.011381114522616068},
 {'context_ relevancy_score': 0.12148579955101013},
 {'context_ relevancy_score': 0.19428299665451051},
 {'context_ relevancy_score': 0.3238077561060587},
 {'context_ relevancy_score': 0.10465717315673828},
 {'context_ relev

In [None]:
# abstract context_relevancy_score value
scores_context_rel = [item['context_ relevancy_score'] for item in r_context_rel]

average_score_context_rel = sum(scores_context_rel) / len(scores_context_rel)

print(f"The average context_rel score is: {average_score_context_rel}")

The average context_rel score is: 0.3380642893840906


In [None]:
print("evaluating...")
r_answer_rel = answer_rel_chain.evaluate(examples, predictions)
r_answer_rel

evaluating...


100%|██████████| 8/8 [01:59<00:00, 14.92s/it]


[{'answer_relevancy_score': 0.9380761441335238},
 {'answer_relevancy_score': 0.9486035333993167},
 {'answer_relevancy_score': 0.9529776687373164},
 {'answer_relevancy_score': 0.9287508409938264},
 {'answer_relevancy_score': 0.9905784682991582},
 {'answer_relevancy_score': 0.9530294501482075},
 {'answer_relevancy_score': 0.9471055432847485},
 {'answer_relevancy_score': 0.977602217501384},
 {'answer_relevancy_score': 0.8887853359066543},
 {'answer_relevancy_score': 0.9751316730003912},
 {'answer_relevancy_score': 0.9882392978585598},
 {'answer_relevancy_score': 0.9011478645742099},
 {'answer_relevancy_score': 0.8982183938006174},
 {'answer_relevancy_score': 0.8901509979510568},
 {'answer_relevancy_score': 0.9170962702097168},
 {'answer_relevancy_score': 0.8765891958477233},
 {'answer_relevancy_score': 0.920794570138721},
 {'answer_relevancy_score': 0.9564428159961366},
 {'answer_relevancy_score': 0.9399329407191055},
 {'answer_relevancy_score': 0.8518916320725793},
 {'answer_relevancy_sc

In [None]:
# abstract answer_rel_score value
scores_answer_rel = [item['answer_relevancy_score'] for item in r_answer_rel]

average_score_answer_rel = sum(scores_answer_rel) / len(scores_answer_rel)

print(f"The average answer_rel score is: {average_score_answer_rel}")

The average answer_rel score is: 0.8995641732434604


# G EVAL

In [21]:
pip install deepeval ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting deepeval
  Downloading deepeval-1.0.6-py3-none-any.whl.metadata (985 bytes)
Collecting requests (from deepeval)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm (from deepeval)
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Collecting pytest (from deepeval)
  Using cached pytest-8.3.2-py3-none-any.whl.metadata (7.5 kB)
Collecting tabulate (from deepeval)
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting typer (from deepeval)
  Downloading typer-0.12.4-py3-none-any.whl.metadata (15 kB)
Collecting rich (from deepeval)
  Using cached rich-13.7.1-py3-none-any.whl.metadata (18 kB)
Collecting protobuf==4.25.1 (from deepeval)
  Using cached protobuf-4.25.1-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)
Collecting pydantic (from deepeval)
  Using cached pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
Collecting sentry-sdk (from deepeval)
  Using cached sentry_sdk-2.13.0-py2.py3-none-any.whl.metadata

In [1]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

In [4]:
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

In [6]:
from deepeval.test_case import LLMTestCase

In [8]:
import pandas as pd

In [9]:
ucsc_qa_df = pd.read_csv("/Users/jonathanmorris/Developer/AIEA/LLM-logic/backend/src/model/new_qa.csv", index_col=0)

In [10]:
src_que_list = ucsc_qa_df['questions'].tolist()
src_ans_list = ucsc_qa_df['answers'].tolist()
gen_ans_list = ucsc_qa_df["generated_response"].tolist()

In [12]:
test_case = LLMTestCase(
    input=src_que_list[0],
    actual_output=gen_ans_list[0],
    expected_output=src_ans_list[0]
)
test_case

LLMTestCase(input='Who can be a member of the Boating Club?', actual_output='Members of the Boating Club can include community members, UC Santa Cruz students, faculty, and staff.', expected_output="Everyone is welcome to join! If you do not have any previous sailing or rowing experience, you'll want to take a class with us before becoming a club member.", context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_used=None, expected_tools=None, reasoning=None)

In [13]:
correctness_metric.measure(test_case)

In [14]:
print(correctness_metric.score)
print(correctness_metric.reason)

0.9094460296041593
The actual output includes community members, UC Santa Cruz students, faculty, and staff, which aligns with the expected details. It is very comprehensive but slightly vague on whether these are the only eligible members.


In [None]:
reason = []
score = []
for i in range(len(src_que_list)):
    test_case = LLMTestCase(
    input=src_que_list[i],
    actual_output=gen_ans_list[i],
    expected_output=src_ans_list[i]
    )
    
    correctness_metric.measure(test_case)
    reason.append(correctness_metric.reason) 
    score.append(correctness_metric.score)



In [None]:
pip install numpy

In [21]:
import numpy as np

np_score = np.array(score)
np_score.mean()

0.6766009465005097

In [22]:
ucsc_qa_df['g_eval_score'] = score
ucsc_qa_df['g_eval_reason'] = reason

In [23]:
ucsc_qa_df

Unnamed: 0_level_0,questions,answers,source_name,source,generated_response,source_docs,g_eval_score,g_eval_reason
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Who can be a member of the Boating Club?,Everyone is welcome to join! If you do not hav...,UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html,Members of the Boating Club can include commun...,"Our club exists to give community members, UC ...",0.909446,The actual output correctly identifies communi...
1,What classes does the boating center offer to ...,We offer all levels of sailing (dinghy and kee...,UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html,The boating center offers classes for beginner...,UC Santa Cruz Community Boating Center is a co...,0.815102,The actual output matches the expected output ...
2,When is the boating club open?,"The club is open year-round, Saturday and Sund...",UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html,The boating club is open on Saturday and Sunda...,Boating Club Hours\nThe Community Boating Cent...,0.562117,The actual output provides specific opening ho...
3,What boats are available to members?,Sailing vessels for weekend use currently incl...,UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html,Members have access to the following sailing v...,Our boats:\nSailing vessels for weekend use cu...,0.964788,The actual output provides an extensive list o...
4,How much does a boating membership cost?,UCSC Student Membership: $45\nNon-Student Quar...,UCSC Boating Recreation FQAs,https://recreation.ucsc.edu/boating/faq.html,The provided context does not include any info...,There are a few minor fees associated with tak...,0.369330,The actual output correctly states that the pr...
...,...,...,...,...,...,...,...,...
107,What if I don't fulfill the general education ...,If you don't satisfy general education require...,Undergraduate Admissions FAQ,https://admissions.ucsc.edu/resources-support/...,If you do not complete the IGETC or partial IG...,"By the end of the fall term prior to transfer,...",0.873402,The actual output addresses the consequences o...
108,What is UC TAP?,UC Transfer Admission Planner (UC TAP) is an o...,Undergraduate Admissions FAQ,https://admissions.ucsc.edu/resources-support/...,"UC TAP, or the UC Transfer Admission Planner, ...",UCSC TPP\nUCSC TAG Step-by-Step\nComplete the ...,0.901846,"The actual output accurately describes UC TAP,..."
109,When are acceptance notices sent out?,"For fall quarter acceptance, notices are sent ...",Undergraduate Admissions FAQ,https://admissions.ucsc.edu/resources-support/...,The context does not specify when acceptance n...,First Year\t\nApplication Submission\n\nMay 20...,0.478547,The actual output correctly states that the co...
110,What are the Cross-Campus and Simultaneous Enr...,Undergraduate students enrolled at UCSC may en...,Undergraduate Admissions FAQ,https://admissions.ucsc.edu/resources-support/...,The provided context does not include informat...,Graduating student\nDual Admission\nDual Admis...,0.207676,The actual output does not provide any informa...
