In [1]:
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

from langchain_community.llms import HuggingFaceHub
from langchain_groq import ChatGroq
import pickle




In [2]:
from dotenv import load_dotenv
import os
# Pull API credentials from a local .env file into the process environment.
load_dotenv()

# The previous self-assignment (os.environ[k] = os.getenv(k)) changed nothing when the
# token was set and raised an opaque TypeError when it was missing; fail with a clear
# message instead.
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    raise EnvironmentError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to your .env file or environment"
    )
# Indexing (rather than .get) makes a missing Groq key fail right here.
groq_api_key = os.environ["GROQ_API_KEY"]

# Naive RAG

In [3]:
# Hugging Face Hub client for the Meta-Llama-3-70B repo, with a low sampling temperature.
# NOTE(review): `hf` is not referenced by any other cell visible here — confirm it is still needed.
hf = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-70B", model_kwargs={"temperature": 0.1})

  warn_deprecated(
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
llm = ChatGroq(groq_api_key=groq_api_key,model_name="llama3-70b-8192")

In [111]:
# Sample user question used to try out LLM-based query rewriting.
query = "What are the advantages of a 3D printer?"

# The instruction asks for the rewritten query only, so the reply can be used as-is.
prompt = f"Refine the following query for better retrieval. Only return the refined query, without any additional text or explanation:\n\n{query}"

# Get the refined query from the LLM
response = llm.invoke(prompt)

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


In [112]:
response.content

'What are the benefits and advantages of using 3D printing technology?'

In [62]:
# Load the pickled manual chunks (presumably a list of strings: each item is later used as Document page_content).
# NOTE(review): absolute, machine-specific path. pickle.load can execute code embedded in the file — only load trusted files.
pickle_file = "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/data/chunks.pkl"
with open(pickle_file, "rb") as file:
    docs_manual = pickle.load(file)

In [63]:
# Load the pickled alarm entries (presumably a list of strings: each item is later used as Document page_content).
# NOTE(review): absolute, machine-specific path. pickle.load can execute code embedded in the file — only load trusted files.
pickle_file = "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/data/alarm_info_list.pkl"
with open(pickle_file, "rb") as file:
    docs_alarm = pickle.load(file)

In [66]:
docs_list = docs_manual + docs_alarm

In [68]:
len(docs_list)

3315

In [78]:
from langchain.docstore.document import Document

# Wrap every raw text item in a Document tagged with the corpus it came from,
# so retrieved chunks can be traced back via their "source" metadata.
documents_manual = [
    Document(page_content=chunk_text, metadata={"source": "TULUS manual software"})
    for chunk_text in docs_manual
]
documents_alarm = [
    Document(page_content=alarm_text, metadata={"source": "TULUS - Alarms"})
    for alarm_text in docs_alarm
]

# Combined corpus: manual chunks first, alarm entries after.
docs = [*documents_manual, *documents_alarm]

In [94]:
# Persist the combined Document list to docs.pkl, relative to the current working directory.
output_pickle_file = "docs.pkl"
with open(output_pickle_file, 'wb') as f:
    # Dump the concatenated list into the pickle file
    pickle.dump(docs, f)

In [79]:
# Initialize the embeddings and vector store
# NOTE(review): no persist directory is passed, so the index is rebuilt
# (and every document re-embedded) each time this cell runs.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

vector_store_chroma = Chroma.from_documents(docs,embeddings)

In [80]:
# Second index over the same documents and embeddings, backed by FAISS.
# NOTE(review): the retriever in this section is built from vector_store_chroma — confirm this index is still needed.
vector_store_faiss = FAISS.from_documents(docs,embeddings)



In [81]:
# QA prompt: the chain fills {context} with the retrieved chunks and {question} with the user query.
prompt_template = PromptTemplate(
    template="Given the following context: {context}, answer the question: {question}",
    input_variables=["context", "question"],
)


In [85]:
# Similarity search over the Chroma index, returning the 4 closest chunks per query.
retriever = vector_store_chroma.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [31]:

# Retrieval QA chain: the "stuff" chain type places all retrieved chunks into one prompt;
# source documents are returned together with the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)

In [87]:
# Function to retrieve answer and chunks used
def get_answer_and_chunks(query):
    """Run the retrieval QA chain for ``query``.

    Returns:
        A ``(answer, source_documents)`` tuple read from the chain output's
        ``"result"`` and ``"source_documents"`` entries.
    """
    chain_output = retrievalQA.invoke({"query": query})
    return chain_output["result"], chain_output["source_documents"]

# Define a query and retrieve the answer along with chunks
query = "how many alari Ids are there in tulus"
answer, source_docs = get_answer_and_chunks(query)

print("Answer:", answer)
print("Chunks Used:")

if not source_docs:
    print("No chunks used")
else:
    # Only the metadata of each retrieved chunk is printed; the chunk text is omitted.
    for index, chunk in enumerate(source_docs):
        print("Document :", index)
        print("Metadata:", chunk.metadata)
        print()

Answer: There are 4 Alarm IDs mentioned in the provided context:

1. Alarm ID: 32/147
2. Alarm ID: 352
3. Alarm ID: 22/44
4. Alarm ID: 0/1282
Chunks Used:
Document : 0
Metadata: {'source': 'TULUS - Alarms'}

Document : 1
Metadata: {'source': 'TULUS - Alarms'}

Document : 2
Metadata: {'source': 'TULUS - Alarms'}

Document : 3
Metadata: {'source': 'TULUS - Alarms'}



# Evaluation

## replace index in original dataset

In [55]:
import pandas as pd
# Load the evaluation CSV (semicolon-delimited).
# NOTE(review): absolute, machine-specific path.
file_path='/Users/hadiibrahim/Dev/prima-power-hmi-assistant/src/data/main_manual_file_with_index (1).csv'
df=pd.read_csv(file_path,sep=';')

In [57]:
# Load the pickled manual chunks that `closest_text_index` values are looked up in.
# NOTE(review): absolute, machine-specific path. pickle.load can execute code embedded in the file — only load trusted files.
pickle_file = "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/data/chunks.pkl"
with open(pickle_file, "rb") as file:
    chunks = pickle.load(file)

In [None]:
def replace_context(df, new_context_list):
    """Set ``df['context']`` by looking up each ``closest_text_index`` value in ``new_context_list``.

    The frame is modified in place and the same object is returned.
    """
    def _lookup(position):
        return new_context_list[position]

    df['context'] = df['closest_text_index'].map(_lookup)
    return df

# Apply the function
df = replace_context(df, chunks)


In [68]:
df_main= df.drop(columns=["correct_answers","incorrect_answers","closest_text_index","category"])

In [71]:
# Rename both columns in a single in-place call.
df_main.rename(columns={"best_answer": "answer", "context": "contexts"}, inplace=True)


In [76]:
df_main

Unnamed: 0,contexts,question,answer
0,\n2. TULUS user interface\n2.1 Introduction\n...,What is the main purpose of the Tulus software?,The main purpose of the Tulus software is to m...
1,\n2. TULUS user interface\n2.1 Introduction\n...,What are some key components of the Tulus user...,Some key components of the Tulus user interfac...
2,\n2. TULUS user interface\n2.1 Introduction\n...,What functions are available on the vertical t...,The vertical toolbar in the Tulus interface in...
3,\n2. TULUS user interface\n2.1 Introduction\n...,What information is displayed in the notificat...,The notification area displays messages about ...
4,\n2. TULUS user interface\n2.1 Introduction\n...,What kind of controls and information can be f...,"The ""Customer"" section of the Tulus interface ..."


## RAGAS Evaluation

### Faithfulness 
This measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context. The score is scaled to the (0, 1) range; higher is better.
The generated answer is regarded as faithful if all the claims made in the answer can be inferred from the given context. To calculate this, a set of claims is first identified in the generated answer. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context. The faithfulness score is given by:

Faithfulness score = |Number of claims in the generated answer that can be inferred from given context| / |Total number of claims in the generated answer|

### Answer Relevance

The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information and higher scores indicate better relevancy. This metric is computed using the question, the context and the answer.

### Context Precision

Context Precision is a metric that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This metric is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision.

### Context Relevancy

This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.
Ideally, the retrieved context should exclusively contain essential information to address the provided query. To compute this, we first estimate the value of |S| by identifying the sentences within the retrieved context that are relevant for answering the given question. The final score is determined by the following formula:

context relevancy = |S| / |Total number of sentences in retrieved context|

### Context Recall 

Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.

To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context.

The formula for calculating context recall is as follows:


context recall = |GT sentences that can be attributed to context| / |Number of sentences in GT|

### Answer Correctness

The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.

Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score. Users also have the option to employ a ‘threshold’ value to round the resulting score to binary, if desired.

In [88]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)


from ragas.metrics.critique import harmfulness
from ragas import evaluate

In [95]:
from tqdm import tqdm
import pandas as pd
from datasets import Dataset



rag_dataset = []

def create_ragas_dataset(df_main,column_name="answer"):
    """Run the RAG chain over every question and package the results for RAGAS.

    Args:
        df_main: DataFrame with a ``"question"`` column and a reference-answer
            column named by ``column_name``.
        column_name: Name of the column holding the reference answer.

    Returns:
        The result of ``Dataset.from_pandas`` with one row per question and the
        columns ``question``, ``answer``, ``contexts`` (page_content of each
        retrieved document) and ``ground_truths`` (single-item list).
    """
    # Rows are collected in a fresh local list. Appending to a module-level list
    # made every repeated call re-emit the rows of all earlier calls.
    records = []
    for _, row in tqdm(df_main.iterrows(), total=df_main.shape[0]):
        answer, source_docs = get_answer_and_chunks(row["question"])
        records.append(
            {
                "question": row["question"],
                "answer": answer,
                "contexts": [context.page_content for context in source_docs],
                # NOTE(review): the recorded evaluate() output warns that 'ground_truths'
                # is deprecated in favour of a string-valued 'ground_truth' column.
                "ground_truths": [row[column_name]],
            }
        )
    # Explicit columns keep the schema stable even when df_main has no rows.
    rag_df = pd.DataFrame(
        records, columns=["question", "answer", "contexts", "ground_truths"]
    )
    return Dataset.from_pandas(rag_df)



In [96]:
def evaluate_ragas_dataset(ragas_dataset):
    """Score ``ragas_dataset`` with the notebook's fixed list of seven RAGAS metrics.

    Returns whatever ``evaluate`` returns for that dataset and metric list.
    """
    selected_metrics = [
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity,
    ]
    return evaluate(ragas_dataset, metrics=selected_metrics)

# Test Set Generation

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Synthetic test-set generation with RAGAS, using separate generator and critic chat models.
# NOTE(review): this rebinds the notebook-wide `embeddings` name to the default
# OpenAIEmbeddings() model (the vector-store cell sets it to text-embedding-3-large).
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# Share of each question type in the generated set (the weights sum to 1).
distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}

# use generator.generate_with_llamaindex_docs if you use llama-index as document loader
# NOTE(review): 6000 samples are requested here, while the recorded downstream run
# iterated over 10 rows — confirm the intended test-set size.
testset = generator.generate_with_langchain_docs(docs, 6000, distributions=distributions)
testset.to_pandas()

                                                                   Exception in thread Thread-90:
Traceback (most recent call last):
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 1191, in __iter__
    self.update(n - last_print_n)
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 1242, in update
    self.refresh(lock_args=self.lock_args)
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 1347, in refresh
    self.display()
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 1495, in display
    self.sp(self.__str__() if msg is None else msg)
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 459, in print_status
    fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0)))
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-pac

In [58]:
# Snapshot the generated test set to testset.csv (index column omitted).
df3=pd.DataFrame(testset.to_pandas())
df3.to_csv("testset.csv",index=False)

In [53]:
basic_qa_ragas_dataset = create_ragas_dataset(pd.DataFrame(testset.to_pandas()),"ground_truth")

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [03:50<00:00, 23.00s/it]


In [54]:
basic_qa_ragas_dataset.to_pandas()

Unnamed: 0,question,answer,contexts,ground_truths
0,What are the different functional areas within...,Based on the provided screenshots and descript...,[On the topleft there is a vertical navigation...,[The different functional areas within the app...
1,What can be accessed through the separate menu...,Through the separate menu in the Tulus system ...,[6. TULUS System management and settings\nThe...,[The tools needed for general management of th...
2,What is the purpose of the dropdown menus in t...,The purpose of the dropdown menus in the inter...,[The interface is divided into several section...,[The purpose of the dropdown menus in the inte...
3,What is the current status of the FPServiceBro...,The current status of the FPServiceBroker serv...,[The image appears to be a screenshot of a gra...,[The current status of the FPServiceBroker ser...
4,What is the purpose of the hierarchical struct...,The purpose of the hierarchical structure in t...,[On the top left corner there is a menu bar wi...,[The purpose of the hierarchical structure in ...
5,What does the license management tool do in th...,The license management tool in the software in...,[An activated license is valid only for the so...,[The license management tool in the software i...
6,What is the purpose of the main menu in the so...,The main menu in the software for laser operat...,[laser. 2 Play/Stop Button Start/stop a task l...,[The main menu in the software for laser opera...
7,What does the Cancel button do in the producti...,The Cancel button in the production order mana...,[At the bottom there are two buttons Add highl...,[The Cancel button allows the user to exit wit...
8,What is the purpose of MDA mode in the control...,"According to the text, MDA mode allows to run ...",[The center of the interface displays operatio...,[MDA mode allows for running utility programs ...
9,How does the hierarchical structure in the nav...,The hierarchical structure in the navigation p...,[On the top left corner there is a menu bar wi...,[The hierarchical structure in the navigation ...


In [56]:
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 70/70 [01:56<00:00,  1.66s/it]


In [57]:
basic_qa_result

{'context_precision': 0.9917, 'faithfulness': 0.7871, 'answer_relevancy': 0.9716, 'context_recall': 1.0000, 'context_relevancy': 0.0255, 'answer_correctness': 0.6130, 'answer_similarity': 0.9552}

In [33]:
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 70/70 [02:33<00:00,  2.19s/it]


In [34]:
basic_qa_result

{'context_precision': 0.9833, 'faithfulness': 0.9583, 'answer_relevancy': 0.9720, 'context_recall': 0.9000, 'context_relevancy': 0.0325, 'answer_correctness': 0.6566, 'answer_similarity': 0.9558}

In [None]:
import os
import random
import numpy as np
import logging
import json
import pickle
from dotenv import load_dotenv
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from langchain_groq import ChatGroq
from tqdm import tqdm
import pandas as pd
from datasets import Dataset
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity,
)
from langchain.retrievers.document_compressors import FlashrankRerank
from ragas.metrics.critique import harmfulness
from ragas import evaluate
from ragatouille import RAGPretrainedModel
from langchain.retrievers import ContextualCompressionRetriever

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Disabled workaround attempt: presumably meant to resolve the ranker type's forward reference — TODO confirm.
# FlashrankRerank.update_forward_refs(Ranker=RankerModel)

# Reranking compressor configured with the "ms-marco-MultiBERT-L-12" model name.
compressor = FlashrankRerank(model_name="ms-marco-MultiBERT-L-12")


In [48]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain_openai import ChatOpenAI

# Wrap the base retriever with a FlashRank reranking compressor.
# NOTE(review): the recorded output of this cell is a ValidationError on FlashrankRerank's
# `client` field ("instance of RankerModel expected"), so it did not run successfully as written.
compressor = FlashrankRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

ValidationError: 1 validation error for FlashrankRerank
client
  instance of RankerModel expected (type=type_error.arbitrary_type; expected_arbitrary_type=RankerModel)

In [26]:
from flashrank import Ranker, RerankRequest

# Standalone FlashRank ranker, created directly rather than through the LangChain wrapper.
# Small (~34MB), slightly slower & best performance (ranking precision).
ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2")

In [60]:
from flashrank import Ranker, RerankRequest
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
from langchain_cohere import CohereRerank
import os

# Reload .env, then rebuild a FAISS index with the default OpenAI embedding model and
# expose it as a similarity retriever returning 10 hits per query.
# NOTE(review): this rebinds the notebook-wide `embeddings` and `retriever` names.
load_dotenv()
COHERE_API_KEY = os.getenv("COHERE_API_KEY")  # None when the variable is unset

embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_documents(documents=docs, embedding=embeddings)
retriever = vector_store.as_retriever(
    search_kwargs={"k": 10},
    search_type="similarity",
)





INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [87]:
import os
# Read API credentials from the environment (each is None when the variable is unset).
HUGGINGFACE_API_TOKEN=os.getenv("HUGGINGFACEHUB_API_TOKEN")
# Variable name corrected: it was misspelled "GRORQ_API_KEY", which does not match
# the GROQ_API_KEY name read elsewhere in this notebook.
groq_api_key=os.getenv("GROQ_API_KEY")

In [88]:
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

from langchain_community.llms import HuggingFaceHub
from langchain_groq import ChatGroq
import pickle

llm=ChatGroq(groq_api_key=groq_api_key, model_name='llama3-70b-8192')


In [92]:
result = llm.invoke("what is the capital of France")



INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


In [None]:
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

from langchain_community.llms import HuggingFaceHub
from langchain_groq import ChatGroq
import pickle




In [None]:
from dotenv import load_dotenv
import os
# Pull API credentials from a local .env file into the process environment.
load_dotenv()

# The previous self-assignment (os.environ[k] = os.getenv(k)) changed nothing when the
# token was set and raised an opaque TypeError when it was missing; fail with a clear
# message instead.
if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
    raise EnvironmentError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to your .env file or environment"
    )
# Indexing (rather than .get) makes a missing Groq key fail right here.
groq_api_key = os.environ["GROQ_API_KEY"]

# Naive RAG

In [None]:
# Hugging Face Hub client for the Meta-Llama-3-70B repo, with a low sampling temperature.
# NOTE(review): `hf` is not referenced by any other cell visible here — confirm it is still needed.
hf = HuggingFaceHub(repo_id="meta-llama/Meta-Llama-3-70B", model_kwargs={"temperature": 0.1})

  warn_deprecated(
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
llm = ChatGroq(groq_api_key=groq_api_key,model_name="llama3-70b-8192")

In [None]:
# Load the pickled manual chunks (presumably a list of strings: each item is later used as Document page_content).
# NOTE(review): absolute, machine-specific path. pickle.load can execute code embedded in the file — only load trusted files.
pickle_file = "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/data/chunks.pkl"
with open(pickle_file, "rb") as file:
    docs_manual = pickle.load(file)

In [None]:
# Load the pickled alarm entries (presumably a list of strings: each item is later used as Document page_content).
# NOTE(review): absolute, machine-specific path. pickle.load can execute code embedded in the file — only load trusted files.
pickle_file = "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/data/alarm_info_list.pkl"
with open(pickle_file, "rb") as file:
    docs_alarm = pickle.load(file)

In [None]:
docs_list = docs_manual + docs_alarm

In [None]:
len(docs_list)

3315

In [None]:
from langchain.docstore.document import Document

# Wrap every raw text item in a Document tagged with the corpus it came from,
# so retrieved chunks can be traced back via their "source" metadata.
documents_manual = [
    Document(page_content=chunk_text, metadata={"source": "TULUS manual software"})
    for chunk_text in docs_manual
]
documents_alarm = [
    Document(page_content=alarm_text, metadata={"source": "TULUS - Alarms"})
    for alarm_text in docs_alarm
]

# Combined corpus: manual chunks first, alarm entries after.
docs = [*documents_manual, *documents_alarm]

In [None]:
# Persist the combined Document list to docs.pkl, relative to the current working directory.
output_pickle_file = "docs.pkl"
with open(output_pickle_file, 'wb') as f:
    # Dump the concatenated list into the pickle file
    pickle.dump(docs, f)

In [None]:
# Initialize the embeddings and vector store
# NOTE(review): no persist directory is passed, so the index is rebuilt
# (and every document re-embedded) each time this cell runs.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

vector_store_chroma = Chroma.from_documents(docs,embeddings)

In [None]:
# Second index over the same documents and embeddings, backed by FAISS.
# NOTE(review): the retriever in this section is built from vector_store_chroma — confirm this index is still needed.
vector_store_faiss = FAISS.from_documents(docs,embeddings)



In [None]:
# QA prompt: the chain fills {context} with the retrieved chunks and {question} with the user query.
prompt_template = PromptTemplate(
    template="Given the following context: {context}, answer the question: {question}",
    input_variables=["context", "question"],
)


In [None]:
# Similarity search over the Chroma index, returning the 4 closest chunks per query.
retriever = vector_store_chroma.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [None]:

# Retrieval QA chain: the "stuff" chain type places all retrieved chunks into one prompt;
# source documents are returned together with the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt_template},
    return_source_documents=True,
)

In [None]:
# Function to retrieve answer and chunks used
def get_answer_and_chunks(query):
    """Run the retrieval QA chain for ``query``.

    Returns:
        A ``(answer, source_documents)`` tuple read from the chain output's
        ``"result"`` and ``"source_documents"`` entries.
    """
    chain_output = retrievalQA.invoke({"query": query})
    return chain_output["result"], chain_output["source_documents"]

# Define a query and retrieve the answer along with chunks
query = "how many alari Ids are there in tulus"
answer, source_docs = get_answer_and_chunks(query)

print("Answer:", answer)
print("Chunks Used:")

if not source_docs:
    print("No chunks used")
else:
    # Only the metadata of each retrieved chunk is printed; the chunk text is omitted.
    for index, chunk in enumerate(source_docs):
        print("Document :", index)
        print("Metadata:", chunk.metadata)
        print()

Answer: There are 4 Alarm IDs mentioned in the provided context:

1. Alarm ID: 32/147
2. Alarm ID: 352
3. Alarm ID: 22/44
4. Alarm ID: 0/1282
Chunks Used:
Document : 0
Metadata: {'source': 'TULUS - Alarms'}

Document : 1
Metadata: {'source': 'TULUS - Alarms'}

Document : 2
Metadata: {'source': 'TULUS - Alarms'}

Document : 3
Metadata: {'source': 'TULUS - Alarms'}



# Evaluation

## replace index in original dataset

In [None]:
import pandas as pd
# Load the evaluation CSV (semicolon-delimited).
# NOTE(review): absolute, machine-specific path.
file_path='/Users/hadiibrahim/Dev/prima-power-hmi-assistant/src/data/main_manual_file_with_index (1).csv'
df=pd.read_csv(file_path,sep=';')

In [None]:
# Load the pickled manual chunks that `closest_text_index` values are looked up in.
# NOTE(review): absolute, machine-specific path. pickle.load can execute code embedded in the file — only load trusted files.
pickle_file = "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/data/chunks.pkl"
with open(pickle_file, "rb") as file:
    chunks = pickle.load(file)

In [None]:
def replace_context(df, new_context_list):
    """Set ``df['context']`` by looking up each ``closest_text_index`` value in ``new_context_list``.

    The frame is modified in place and the same object is returned.
    """
    def _lookup(position):
        return new_context_list[position]

    df['context'] = df['closest_text_index'].map(_lookup)
    return df

# Apply the function
df = replace_context(df, chunks)


In [None]:
df_main= df.drop(columns=["correct_answers","incorrect_answers","closest_text_index","category"])

In [None]:
# Rename both columns in a single in-place call.
df_main.rename(columns={"best_answer": "answer", "context": "contexts"}, inplace=True)


In [None]:
df_main

Unnamed: 0,contexts,question,answer
0,\n2. TULUS user interface\n2.1 Introduction\n...,What is the main purpose of the Tulus software?,The main purpose of the Tulus software is to m...
1,\n2. TULUS user interface\n2.1 Introduction\n...,What are some key components of the Tulus user...,Some key components of the Tulus user interfac...
2,\n2. TULUS user interface\n2.1 Introduction\n...,What functions are available on the vertical t...,The vertical toolbar in the Tulus interface in...
3,\n2. TULUS user interface\n2.1 Introduction\n...,What information is displayed in the notificat...,The notification area displays messages about ...
4,\n2. TULUS user interface\n2.1 Introduction\n...,What kind of controls and information can be f...,"The ""Customer"" section of the Tulus interface ..."


## RAGAS Evaluation

### Faithfulness 
This measures the factual consistency of the generated answer against the given context. It is calculated from the answer and the retrieved context. The score is scaled to the (0, 1) range; higher is better.
The generated answer is regarded as faithful if all the claims made in the answer can be inferred from the given context. To calculate this, a set of claims is first identified in the generated answer. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from that context. The faithfulness score is given by:

Faithfulness score = |Number of claims in the generated answer that can be inferred from given context| / |Total number of claims in the generated answer|

### Answer Relevance

The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information and higher scores indicate better relevancy. This metric is computed using the question, the context and the answer.

### Context Precision

Context Precision is a metric that evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Ideally all the relevant chunks must appear at the top ranks. This metric is computed using the question, ground_truth and the contexts, with values ranging between 0 and 1, where higher scores indicate better precision.

### Context Relevancy

This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.
Ideally, the retrieved context should exclusively contain essential information to address the provided query. To compute this, we first estimate the value of |S| by identifying the sentences within the retrieved context that are relevant for answering the given question. The final score is determined by the following formula:

context relevancy = |S| / |Total number of sentences in retrieved context|

### Context Recall 

Context recall measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. It is computed based on the ground truth and the retrieved context, and the values range between 0 and 1, with higher values indicating better performance.

To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context.

The formula for calculating context recall is as follows:


context recall = |GT sentences that can be attributed to context| / |Number of sentences in GT|

### Answer Correctness

The assessment of Answer Correctness involves gauging the accuracy of the generated answer when compared to the ground truth. This evaluation relies on the ground truth and the answer, with scores ranging from 0 to 1. A higher score indicates a closer alignment between the generated answer and the ground truth, signifying better correctness.

Answer correctness encompasses two critical aspects: semantic similarity between the generated answer and the ground truth, as well as factual similarity. These aspects are combined using a weighted scheme to formulate the answer correctness score. Users also have the option to employ a ‘threshold’ value to round the resulting score to binary, if desired.

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)


from ragas.metrics.critique import harmfulness
from ragas import evaluate

In [None]:
from tqdm import tqdm
import pandas as pd
from datasets import Dataset



rag_dataset = []

def create_ragas_dataset(df_main,column_name="answer"):
    """Run the RAG chain over every question and package the results for RAGAS.

    Args:
        df_main: DataFrame with a ``"question"`` column and a reference-answer
            column named by ``column_name``.
        column_name: Name of the column holding the reference answer.

    Returns:
        The result of ``Dataset.from_pandas`` with one row per question and the
        columns ``question``, ``answer``, ``contexts`` (page_content of each
        retrieved document) and ``ground_truths`` (single-item list).
    """
    # Rows are collected in a fresh local list. Appending to a module-level list
    # made every repeated call re-emit the rows of all earlier calls.
    records = []
    for _, row in tqdm(df_main.iterrows(), total=df_main.shape[0]):
        answer, source_docs = get_answer_and_chunks(row["question"])
        records.append(
            {
                "question": row["question"],
                "answer": answer,
                "contexts": [context.page_content for context in source_docs],
                # NOTE(review): the recorded evaluate() output warns that 'ground_truths'
                # is deprecated in favour of a string-valued 'ground_truth' column.
                "ground_truths": [row[column_name]],
            }
        )
    # Explicit columns keep the schema stable even when df_main has no rows.
    rag_df = pd.DataFrame(
        records, columns=["question", "answer", "contexts", "ground_truths"]
    )
    return Dataset.from_pandas(rag_df)



In [None]:
def evaluate_ragas_dataset(ragas_dataset):
    """Score ``ragas_dataset`` with the notebook's fixed list of seven RAGAS metrics.

    Returns whatever ``evaluate`` returns for that dataset and metric list.
    """
    selected_metrics = [
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity,
    ]
    return evaluate(ragas_dataset, metrics=selected_metrics)

# Test Set Generation

In [1]:
import pickle
# Reload a pickled `docs` list — presumably the docs.pkl written by the dump cell; confirm the locations match.
# NOTE(review): absolute, machine-specific path. pickle.load can execute code embedded in the file — only load trusted files.
pickle_file = "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/src/docs.pkl"
with open(pickle_file, "rb") as file:
    docs = pickle.load(file)

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# `docs` (loaded from the pickle in the previous cell) is the corpus the
# synthetic questions are generated from.

# Generator and critic are both OpenAI chat models; the embeddings are OpenAI too.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Mix of question types in the generated test set (weights sum to 1.0).
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

# Use generator.generate_with_llamaindex_docs if llama-index is the document loader.
# NOTE(review): 6000 samples are requested here, whereas
# RAGModel.test_set_generation uses 10 and the dataset evaluated below has
# 10 rows — confirm that 6000 is intended before re-running (cost/time).
testset = generator.generate_with_langchain_docs(docs, 6000, distributions) 
testset.to_pandas()

                                                                   Exception in thread Thread-90:
Traceback (most recent call last):
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 1191, in __iter__
    self.update(n - last_print_n)
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 1242, in update
    self.refresh(lock_args=self.lock_args)
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 1347, in refresh
    self.display()
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 1495, in display
    self.sp(self.__str__() if msg is None else msg)
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-packages/tqdm/std.py", line 459, in print_status
    fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0)))
  File "/Users/hadiibrahim/anaconda3/envs/temp-env/lib/python3.11/site-pac

In [None]:
# Persist the generated test set as CSV (no index column) in the current
# working directory.
# NOTE(review): the evaluation setup later in this notebook reads testset.csv
# from the project's data/ directory — confirm the file is moved/copied there.
df3=pd.DataFrame(testset.to_pandas())
df3.to_csv("testset.csv",index=False)

In [None]:
# Answer every generated question with the RAG pipeline; reference answers are
# taken from the test set's "ground_truth" column.
basic_qa_ragas_dataset = create_ragas_dataset(pd.DataFrame(testset.to_pandas()),"ground_truth")

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [03:50<00:00, 23.00s/it]


In [None]:
# Inspect the assembled evaluation rows (question, answer, contexts, ground_truths).
basic_qa_ragas_dataset.to_pandas()

Unnamed: 0,question,answer,contexts,ground_truths
0,What are the different functional areas within...,Based on the provided screenshots and descript...,[On the topleft there is a vertical navigation...,[The different functional areas within the app...
1,What can be accessed through the separate menu...,Through the separate menu in the Tulus system ...,[6. TULUS System management and settings\nThe...,[The tools needed for general management of th...
2,What is the purpose of the dropdown menus in t...,The purpose of the dropdown menus in the inter...,[The interface is divided into several section...,[The purpose of the dropdown menus in the inte...
3,What is the current status of the FPServiceBro...,The current status of the FPServiceBroker serv...,[The image appears to be a screenshot of a gra...,[The current status of the FPServiceBroker ser...
4,What is the purpose of the hierarchical struct...,The purpose of the hierarchical structure in t...,[On the top left corner there is a menu bar wi...,[The purpose of the hierarchical structure in ...
5,What does the license management tool do in th...,The license management tool in the software in...,[An activated license is valid only for the so...,[The license management tool in the software i...
6,What is the purpose of the main menu in the so...,The main menu in the software for laser operat...,[laser. 2 Play/Stop Button Start/stop a task l...,[The main menu in the software for laser opera...
7,What does the Cancel button do in the producti...,The Cancel button in the production order mana...,[At the bottom there are two buttons Add highl...,[The Cancel button allows the user to exit wit...
8,What is the purpose of MDA mode in the control...,"According to the text, MDA mode allows to run ...",[The center of the interface displays operatio...,[MDA mode allows for running utility programs ...
9,How does the hierarchical structure in the nav...,The hierarchical structure in the navigation p...,[On the top left corner there is a menu bar wi...,[The hierarchical structure in the navigation ...


In [None]:
# Score the dataset with the RAGAS metrics configured in evaluate_ragas_dataset.
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 70/70 [01:56<00:00,  1.66s/it]


In [None]:
# Aggregate metric scores of the first evaluation run.
basic_qa_result

{'context_precision': 0.9917, 'faithfulness': 0.7871, 'answer_relevancy': 0.9716, 'context_recall': 1.0000, 'context_relevancy': 0.0255, 'answer_correctness': 0.6130, 'answer_similarity': 0.9552}

In [None]:
# Second run of the same evaluation on the same dataset; this overwrites
# basic_qa_result. The recorded scores differ from the first run (e.g.
# faithfulness and context_recall), so single-run numbers are not stable.
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 70/70 [02:33<00:00,  2.19s/it]


In [None]:
# Aggregate metric scores of the second evaluation run.
basic_qa_result

{'context_precision': 0.9833, 'faithfulness': 0.9583, 'answer_relevancy': 0.9720, 'context_recall': 0.9000, 'context_relevancy': 0.0325, 'answer_correctness': 0.6566, 'answer_similarity': 0.9558}

In [None]:
import os
import random
import numpy as np
import logging
import json
import pickle
from dotenv import load_dotenv
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from langchain_groq import ChatGroq
from tqdm import tqdm
import pandas as pd
from datasets import Dataset
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity,
)
from langchain.retrievers.document_compressors import FlashrankRerank
from ragas.metrics.critique import harmfulness
from ragas import evaluate
from ragatouille import RAGPretrainedModel
from langchain.retrievers import ContextualCompressionRetriever

# Configure root logging at INFO level and create this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Disabled workaround, kept for reference (RankerModel is not defined in this cell):
# FlashrankRerank.update_forward_refs(Ranker=RankerModel)

# Module-level FlashRank reranker; RAGModel.rerank_results wraps the base
# retriever with it.
compressor = FlashrankRerank(model_name="ms-marco-MultiBERT-L-12")


In [None]:
class RAGModel:
    """Configurable retrieval-augmented QA pipeline with RAGAS evaluation helpers.

    The ``setup_*`` methods populate the instance step by step. The setup code
    in this notebook calls them in this order: ``setup_embeddings``,
    ``setup_vector_store``, ``setup_prompt_template``, ``setup_retriever``,
    ``setup_llm``, ``setup_retrieverQA``. Afterwards ``create_ragas_dataset``
    and ``evaluate`` run the evaluation.
    """

    def __init__(
        self, docs, dataset, k, vector_store_type, reranking=False, seed=None
    ):
        """Store the configuration and read API credentials from the environment.

        Args:
            docs: Documents to index; passed to the vector store's ``from_documents``.
            dataset: DataFrame with a "question" column and a reference-answer
                column (see ``create_ragas_dataset``).
            k: Number of documents the similarity retriever returns.
            vector_store_type: ``"FAISS"`` or ``"Chroma"``.
            reranking: When true, ``setup_retriever`` wraps the base retriever
                with the FlashRank compressor.
            seed: Optional seed applied to ``random`` and ``numpy``.
        """
        load_dotenv()
        self.docs = docs
        self.dataset = dataset
        self.k = k
        self.vector_store_type = vector_store_type
        self.reranking = reranking
        self.huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
        self.groq_api_key = os.getenv("GROQ_API_KEY")
        self.vector_store = None
        self.embeddings = None
        self.retriever = None
        self.retrievalQA = None
        self.prompt_template = None
        # Initialised here like the other components; set by setup_llm().
        self.llm = None
        self.seed = seed

        if seed is not None:
            self.set_seed(seed)

    def set_seed(self, seed):
        """Seed Python's ``random`` module and NumPy's global RNG."""
        logger.info(f"Setting random seed: {seed}.")
        random.seed(seed)
        np.random.seed(seed)
        # Seed other libraries here if they get used, e.g.
        # tf.random.set_seed(seed), torch.manual_seed(seed),
        # torch.cuda.manual_seed_all(seed).

    def setup_embeddings(self):
        """Create the embedding model used to build the vector store."""
        logger.info("Setting up embeddings.")
        self.embeddings = OpenAIEmbeddings()

    def setup_vector_store(self):
        """Index ``self.docs`` in the configured vector store.

        Raises:
            ValueError: If ``vector_store_type`` is neither "FAISS" nor "Chroma".
        """
        logger.info(f"Setting up vector store: {self.vector_store_type}.")
        if self.vector_store_type == "FAISS":
            self.vector_store = FAISS.from_documents(self.docs, self.embeddings)
        elif self.vector_store_type == "Chroma":
            self.vector_store = Chroma.from_documents(self.docs, self.embeddings)
        else:
            raise ValueError(f"Unsupported vector store type: {self.vector_store_type}")

    def setup_prompt_template(self):
        """Create the answer-generation prompt (variables: context, question)."""
        logger.info("Setting up prompt template.")
        self.prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template=""" Using the information contained in the context, give a comprehensive answer to the question.
            Respond only to the question asked, response should be concise and relevant to the question.
            If the answer cannot be deduced from the context, use your own knowledge.
            Given the following context: {context}, answer the question: {question}""",
        )

    def setup_retriever(self):
        """Create the top-``k`` similarity retriever, optionally with reranking.

        Requires ``setup_vector_store`` to have run.
        """
        logger.info("Setting up retriever.")
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity", search_kwargs={"k": self.k}
        )

        # The constructor stores the flag as `reranking`. The former check of
        # `self.reranking_method` raised AttributeError because that attribute
        # is never set.
        if self.reranking:
            self.retriever = self.rerank_results()

    def setup_llm(self, name):
        """Create the Groq chat model called ``name`` and store it as ``self.llm``."""
        logger.info(f"Setting up LLM: {name}.")
        self.llm = ChatGroq(groq_api_key=self.groq_api_key, model_name=name)

    def refine_query(self, original_query):
        """Ask the LLM to rewrite ``original_query`` and return the rewritten text.

        Chat models return a message object whose text is in ``.content``; that
        text is returned so the result can be used directly as a query string.
        If the LLM returns a plain value without ``.content`` it is returned
        unchanged.
        """
        logger.info("Refining query.")
        response = self.llm.invoke(
            f"Rewrite the query for better retrieval: {original_query}"
        )
        return getattr(response, "content", response)

    # NOTE(review): a ColBERT-based reranker was previously sketched here as
    # commented-out code referencing an undefined RERANKER; it was never wired in.

    def rerank_results(self,):
        """Wrap the current retriever with the module-level FlashRank ``compressor``."""
        compression_retriever = ContextualCompressionRetriever(
            base_compressor=compressor, base_retriever=self.retriever
        )
        return compression_retriever

    def setup_retrieverQA(self):
        """Build the "stuff" RetrievalQA chain from the LLM, retriever and prompt.

        Requires ``setup_llm``, ``setup_retriever`` and ``setup_prompt_template``
        to have run. The chain is configured to return its source documents.
        """
        logger.info("Setting up RetrievalQA chain.")
        self.retrievalQA = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": self.prompt_template},
        )

    def generate_rag_answer(self, query):
        """Refine ``query``, run the RetrievalQA chain and return its output.

        Returns:
            Tuple ``(answer, source_documents)`` taken from the chain result's
            "result" and "source_documents" entries.
        """
        # f-string was missing here, so the literal "{query}" used to be logged.
        logger.info(f"Refining the query {query}.")
        query = self.refine_query(query)
        logger.info(f"Refined query: {query}.")

        logger.info(f"Generating RAG answer for query: {query}.")
        result = self.retrievalQA.invoke({"query": query})
        answer = result["result"]
        source_documents = result["source_documents"]
        return answer, source_documents

    def create_ragas_dataset(self, column_name="ground_truth"):
        """Answer every question in ``self.dataset`` and build a RAGAS dataset.

        Args:
            column_name: Column of ``self.dataset`` holding the reference answer.

        Returns:
            A ``datasets.Dataset`` with "question", "answer", "contexts" and
            "ground_truths" columns. The original question (not the refined
            one) is stored in "question".
        """
        logger.info("Creating RAGAS dataset.")
        rag_dataset = []
        for index, row in tqdm(
            self.dataset.iterrows(),
            total=self.dataset.shape[0],
            desc="Processing questions",
        ):
            answer, source_docs = self.generate_rag_answer(row["question"])

            rag_dataset.append(
                {
                    "question": row["question"],
                    "answer": answer,
                    "contexts": [context.page_content for context in source_docs],
                    # NOTE(review): the ragas version used in this notebook
                    # warns that 'ground_truths' is deprecated in favour of a
                    # string-valued 'ground_truth' column.
                    "ground_truths": [row[column_name]],
                }
            )
        rag_df = pd.DataFrame(rag_dataset)
        rag_eval_dataset = Dataset.from_pandas(rag_df)
        return rag_eval_dataset

    def evaluate(self, ragas_dataset):
        """Score ``ragas_dataset`` with the seven RAGAS metrics listed below.

        Inside this method the name ``evaluate`` refers to the module-level
        function imported from ragas, not to this method.
        """
        logger.info("Evaluating RAGAS dataset.")
        result = evaluate(
            ragas_dataset,
            metrics=[
                context_precision,
                faithfulness,
                answer_relevancy,
                context_recall,
                context_relevancy,
                answer_correctness,
                answer_similarity,
            ],
        )
        logger.info("Evaluation complete.")
        return result

    def test_set_generation(self, test_size=10):
        """Generate a synthetic question/answer test set from ``self.docs``.

        Args:
            test_size: Number of samples to request from the generator
                (defaults to 10, the value that used to be hard-coded).

        Returns:
            The generated test set as a pandas DataFrame.
        """
        logger.info("Generating test set.")
        generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
        critic_llm = ChatOpenAI(model="gpt-4o")
        embeddings = OpenAIEmbeddings()
        generator = TestsetGenerator.from_langchain(
            generator_llm, critic_llm, embeddings
        )
        distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}
        testset = generator.generate_with_langchain_docs(
            self.docs, test_size, distributions
        )
        logger.info("Test set generation complete.")
        return pd.DataFrame(testset.to_pandas())

    def save_results(self, results, filepath):
        """Write ``results`` to ``filepath`` as indented JSON."""
        logger.info(f"Saving results to {filepath}.")
        with open(filepath, "w") as file:
            json.dump(results, file, indent=4)

    def get_model_params(self):
        """Return the run configuration as a JSON-serialisable dict.

        The API credentials are never returned verbatim, because this dict is
        written to disk together with the evaluation results; each credential
        entry is "<redacted>" when the credential is set and ``None`` otherwise.
        The reranking flag is reported under "reranking" (the previously
        referenced ``self.retrieval_method`` attribute does not exist and
        raised AttributeError).
        """
        params = {
            "k": self.k,
            "vector_store_type": self.vector_store_type,
            "reranking": self.reranking,
            "seed": self.seed,
            "huggingface_token": "<redacted>" if self.huggingface_token else None,
            "groq_api_key": "<redacted>" if self.groq_api_key else None,
        }
        return params


# Pipeline setup for the evaluation run (top-level script, not a function):
# load the documents and the saved test set, then wire up the RAGModel.
logger.info("Starting RAG model setup.")
# NOTE(review): absolute, machine-specific paths; pickle.load can execute
# arbitrary code, so only load files you trust.
pickle_file = "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/data/docs.pkl"
with open(pickle_file, "rb") as file:
    docs = pickle.load(file)
dataset = pd.read_csv(
    "/Users/hadiibrahim/Dev/prima-power-hmi-assistant/data/testset.csv"
)
# Configuration under test: top-3 FAISS similarity retrieval, no reranking, seed 42.
model = RAGModel(
    docs,
    dataset,
    k=3,
    vector_store_type="FAISS",
    reranking=False,
    seed=42,
)
# The vector store needs the embeddings, the retriever needs the vector store,
# and the QA chain needs the prompt, the retriever and the LLM.
model.setup_embeddings()
model.setup_vector_store()
model.setup_prompt_template()
model.setup_retriever()
model.setup_llm("llama3-70b-8192")
model.setup_retrieverQA()

In [None]:
# Answer every test-set question, score the answers with RAGAS, and print the scores.
ragas_dataset = model.create_ragas_dataset()
evaluation_result = model.evaluate(ragas_dataset)
logger.info("RAG Model setup and evaluation complete.")
print(evaluation_result)

# Store the scores together with the configuration that produced them.
# NOTE(review): whatever get_model_params() returns is written to disk
# verbatim — make sure it contains no secrets before sharing the file.
evaluation_result_with_params = {
    "evaluation_result": evaluation_result,
    "model_params": model.get_model_params(),
}
model.save_results(evaluation_result_with_params, "evaluation_results.json")

10