# Imports

In [193]:
from sentence_transformers import SentenceTransformer
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import Ollama
from langchain.evaluation import load_evaluator
import faiss
import pandas as pd
import numpy as np
import pickle
import time
from tqdm import tqdm

# Initialization

In [185]:
# Load the prebuilt FAISS vector index from disk; the search helpers in this
# notebook query it through `index.search`.
index = faiss.read_index("database/pdf_sections_index.faiss")

In [186]:
# Sentence-embedding model used to turn each query into a vector before the
# FAISS lookup. NOTE(review): presumably the same model that produced the
# indexed vectors -- confirm against the indexing script.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [187]:
# Section records looked up by the position FAISS returns; each record is read
# through its 'content' and 'metadata' keys by the search helpers.
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only load this file when it comes from a trusted source.
with open('database/pdf_sections_data.pkl', 'rb') as f:
    sections_data = pickle.load(f)

# RAG functions

In [330]:
def search_faiss(query, k=3):
    """Return the stored sections closest to ``query`` in the FAISS index.

    The query is embedded with the module-level ``model``, searched in the
    module-level ``index``, and every hit is resolved through
    ``sections_data``.

    Args:
        query: Text to search for.
        k: Maximum number of neighbours requested from the index.

    Returns:
        A list of at most ``k`` dicts, in the order FAISS returned them, each
        with the keys ``'distance'`` (as reported by ``index.search``),
        ``'content'`` and ``'metadata'``.
    """
    query_vector = model.encode([query])[0].astype('float32')
    # index.search takes a batch of vectors, so add a leading axis of size 1.
    query_vector = np.expand_dims(query_vector, axis=0)
    distances, indices = index.search(query_vector, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS fills unused result slots with label -1 when it finds fewer
        # than k neighbours; -1 is not a real position in sections_data.
        if idx < 0:
            continue
        section = sections_data[idx]
        results.append({
            'distance': dist,
            'content': section['content'],
            'metadata': section['metadata']
        })

    return results

In [331]:
# Baseline prompt: the retrieved section texts are substituted for {context}
# and the user's query for {question}.
prompt_template = """
You are an AI assistant specialized in dietary guidelines. 
Use the following pieces of context to answer the question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:
{context}

Question: {question}

Answer:"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Answer-generating model, served through Ollama.
llm = Ollama(
    model="llama3"
)

# Create the chain. The answer functions in this notebook read the global
# `chain` when they are called, so rebinding `chain` in a later cell changes
# which prompt they use.
chain = LLMChain(llm=llm, prompt=prompt)

def answer_question(query):
    """Answer ``query`` from sections retrieved out of the FAISS index.

    The texts of the retrieved sections are joined with blank lines and passed,
    together with the query, to the module-level ``chain``. ``chain`` is looked
    up at call time, so the prompt used is whichever chain is bound then.

    Args:
        query: The question to answer.

    Returns:
        The text produced by ``chain.run``.
    """
    hits = search_faiss(query)
    retrieved_text = "\n\n".join(hit['content'] for hit in hits)
    return chain.run(context=retrieved_text, question=query)

# Reading ground truth (GT)

In [239]:
# Evaluation set; later cells read its `Questions` column as pipeline input
# and its `Answers` column as the reference during scoring.
df = pd.read_csv('data/QA_pairs.csv')

In [240]:
# Run every question in `df` through the pipeline, keeping the answer and the
# elapsed time in seconds for each one.
time_list = []
response_list = []
for query in tqdm(df['Questions'].values):
    # perf_counter is monotonic, so a system clock adjustment during a long
    # LLM call cannot distort the measured latency.
    start = time.perf_counter()
    response = answer_question(query)
    time_list.append(time.perf_counter() - start)
    response_list.append(response)

100%|███████████████████████████████████████████| 10/10 [01:13<00:00,  7.35s/it]


In [241]:
# Store the elapsed time (seconds) and the generated answer for each question
# next to the question itself.
df['latency'] = time_list
df['response'] = response_list

# Evaluation

In [242]:
# Second Ollama model; it is passed to `load_evaluator` as the LLM that grades
# the generated responses.
eval_llm = Ollama(
    model="phi3"
)

In [None]:
# Criteria evaluated one at a time: each name is passed as `criteria=` to
# `load_evaluator` and is used as the prefix of the result columns.
metrics = ['correctness', 'relevance', 'coherence', 'conciseness']

In [243]:
# Grade every response in `df` against each criterion and store the grader's
# reasoning, verdict and numeric score as `<metric>_reasoning`,
# `<metric>_value` and `<metric>_score` columns.
for metric in metrics:
    evaluator = load_evaluator("labeled_criteria", criteria=metric, llm=eval_llm)

    # One (response, question, reference answer) triple per row of `df`.
    rows = zip(df.response.values, df.Questions.values, df.Answers.values)
    verdicts = [
        evaluator.evaluate_strings(
            prediction=predicted,
            input=question,
            reference=expected
        )
        for predicted, question, expected in tqdm(rows, total=len(df))
    ]

    for field in ('reasoning', 'value', 'score'):
        df[metric + '_' + field] = [verdict[field] for verdict in verdicts]

100%|███████████████████████████████████████████| 10/10 [01:11<00:00,  7.18s/it]
100%|███████████████████████████████████████████| 10/10 [00:50<00:00,  5.02s/it]
100%|███████████████████████████████████████████| 10/10 [00:49<00:00,  4.94s/it]
100%|███████████████████████████████████████████| 10/10 [03:02<00:00, 18.29s/it]


In [266]:
# Average each criterion score and the latency over all questions.
# Series.mean() skips missing values, so rows whose score is missing are left
# out of that criterion's average.
df[['correctness_score','relevance_score','coherence_score','conciseness_score','latency']].mean()

correctness_score    1.000000
relevance_score      0.800000
coherence_score      0.800000
conciseness_score    0.888889
latency              7.344954
dtype: float64

In [329]:
# Questions for the irrelevant-query check; only the `Questions` column is
# read below. NOTE(review): presumably out-of-domain questions the assistant
# should decline -- confirm against the CSV.
irr_q=pd.read_csv('data/irrelevant_questions.csv')

In [332]:
# Run every question in `irr_q` through the pipeline, keeping the answer and
# the elapsed time in seconds for each one.
time_list = []
response_list = []
for query in tqdm(irr_q['Questions'].values):
    # perf_counter is monotonic, so a system clock adjustment during a long
    # LLM call cannot distort the measured latency.
    start = time.perf_counter()
    response = answer_question(query)
    time_list.append(time.perf_counter() - start)
    response_list.append(response)

100%|███████████████████████████████████████████| 10/10 [00:55<00:00,  5.50s/it]


In [333]:
# Store the generated answer and the elapsed time (seconds) for each question.
irr_q['response']=response_list
irr_q['latency']=time_list

In [334]:
# Mark the responses in which the assistant declined to answer. The phrase is
# matched literally (regex=False), and a missing response counts as "did not
# decline" (na=False) instead of NaN, which mean() would skip silently.
irr_q['irrelevant_score'] = irr_q['response'].str.contains("I don't know", regex=False, na=False)

In [335]:
# Fraction of questions whose response contains "I don't know", and the mean
# latency in seconds.
irr_q[['irrelevant_score','latency']].mean()

irrelevant_score    1.000000
latency             5.498948
dtype: float64

# Improvement

In [259]:
# Variant of the baseline prompt that additionally asks for a concise answer;
# {context} and {question} are filled in the same way.
new_prompt_template = """
You are an AI assistant specialized in dietary guidelines. 
Use the following pieces of context to answer the question concisely. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:
{context}

Question: {question}

Answer:"""

prompt = PromptTemplate(template=new_prompt_template, input_variables=["context", "question"])

llm = Ollama(
    model="llama3"
)

# Create the chain. This rebinds the global `chain`; every answer function
# that reads `chain` at call time uses the concise prompt from here on.
chain = LLMChain(llm=llm, prompt=prompt)

def answer_question_new(query):
    """Answer ``query`` with the chain that is bound when the call is made.

    Sections are retrieved with ``search_faiss``, their texts are joined with
    blank lines, and the result is passed with the query to the module-level
    ``chain``.

    Args:
        query: The question to answer.

    Returns:
        The text produced by ``chain.run``.
    """
    hits = search_faiss(query)
    section_texts = (hit['content'] for hit in hits)
    return chain.run(context="\n\n".join(section_texts), question=query)

In [263]:
# Work on a copy so the cells below can overwrite the latency, response and
# score columns in `df2` without touching the baseline results in `df`.
df2=df.copy()

In [261]:
# Re-run every question in `df2` through the concise-prompt pipeline, keeping
# the answer and the elapsed time in seconds for each one.
time_list = []
response_list = []
for query in tqdm(df2['Questions'].values):
    # perf_counter is monotonic, so a system clock adjustment during a long
    # LLM call cannot distort the measured latency.
    start = time.perf_counter()
    # Call the function defined for this experiment, not the baseline one.
    response = answer_question_new(query)
    time_list.append(time.perf_counter() - start)
    response_list.append(response)

100%|███████████████████████████████████████████| 10/10 [01:04<00:00,  6.46s/it]


In [264]:
# Replace the copied baseline latency and response columns with the values
# collected for this run.
df2['latency'] = time_list
df2['response'] = response_list

In [265]:
# Grade every response in `df2` against each criterion and store the grader's
# reasoning, verdict and numeric score as `<metric>_reasoning`,
# `<metric>_value` and `<metric>_score` columns.
for metric in metrics:
    evaluator = load_evaluator("labeled_criteria", criteria=metric, llm=eval_llm)

    # One (response, question, reference answer) triple per row of `df2`.
    rows = zip(df2.response.values, df2.Questions.values, df2.Answers.values)
    verdicts = [
        evaluator.evaluate_strings(
            prediction=predicted,
            input=question,
            reference=expected
        )
        for predicted, question, expected in tqdm(rows, total=len(df2))
    ]

    for field in ('reasoning', 'value', 'score'):
        df2[metric + '_' + field] = [verdict[field] for verdict in verdicts]

100%|███████████████████████████████████████████| 10/10 [00:55<00:00,  5.54s/it]
100%|███████████████████████████████████████████| 10/10 [00:56<00:00,  5.61s/it]
100%|███████████████████████████████████████████| 10/10 [00:34<00:00,  3.47s/it]
100%|███████████████████████████████████████████| 10/10 [00:37<00:00,  3.74s/it]


In [268]:
# Average each criterion score and the latency over all questions in `df2`.
# Series.mean() skips missing values, so rows whose score is missing are left
# out of that criterion's average.
df2[['correctness_score','relevance_score','coherence_score','conciseness_score','latency']].mean()

correctness_score    1.000000
relevance_score      1.000000
coherence_score      1.000000
conciseness_score    1.000000
latency              6.461154
dtype: float64

# Query relevance

In [314]:
def new_search_faiss(query, k=3, threshold=0.5):
    """Return the stored sections near ``query``, dropping distant hits.

    Works like a plain FAISS lookup through the module-level ``model``,
    ``index`` and ``sections_data``, but keeps a hit only when its reported
    distance is strictly below ``threshold``.

    Args:
        query: Text to search for.
        k: Maximum number of neighbours requested from the index.
        threshold: Exclusive upper bound on the distance of a kept hit.
            NOTE(review): assumes a smaller distance means a closer match
            (e.g. an L2 index) -- confirm against how the index was built.

    Returns:
        A list of at most ``k`` dicts with the keys ``'distance'``,
        ``'content'`` and ``'metadata'``; empty when no hit is close enough.
    """
    query_vector = model.encode([query])[0].astype('float32')
    # index.search takes a batch of vectors, so add a leading axis of size 1.
    query_vector = np.expand_dims(query_vector, axis=0)
    distances, indices = index.search(query_vector, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS fills unused result slots with label -1 when it finds fewer
        # than k neighbours; -1 is not a real position in sections_data.
        if idx < 0:
            continue
        if dist < threshold:  # Only include results within the threshold distance
            section = sections_data[idx]
            results.append({
                'distance': dist,
                'content': section['content'],
                'metadata': section['metadata']
            })

    return results

In [315]:
# Concise-answer prompt for the relevance-filtered pipeline; the retrieved
# section texts replace {context} and the user's query replaces {question}.
new_prompt_template = """
You are an AI assistant specialized in dietary guidelines. 
Use the following pieces of context to answer the question concisely. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:
{context}

Question: {question}

Answer:"""

# Build the prompt from the template defined in this cell.
prompt = PromptTemplate(template=new_prompt_template, input_variables=["context", "question"])

llm = Ollama(
    model="llama3"
)

# Create the chain. This rebinds the global `chain`, which the answer
# functions read at call time.
chain = LLMChain(llm=llm, prompt=prompt)

def new_answer_question(query):
    """Answer ``query``, declining early when nothing relevant is retrieved.

    Sections are retrieved with ``new_search_faiss``. When it returns no hits
    the function answers "I don't know" without calling the LLM; otherwise the
    section texts are joined with blank lines and passed with the query to the
    module-level ``chain``.

    Args:
        query: The question to answer.

    Returns:
        The literal string "I don't know" when no section passes the
        retrieval threshold, otherwise the text produced by ``chain.run``.
    """
    # Search for relevant context
    search_results = new_search_faiss(query)

    # Test the list that was just retrieved; an empty list means no section
    # was close enough to the query.
    if not search_results:
        return "I don't know"

    context = "\n\n".join(result['content'] for result in search_results)
    return chain.run(context=context, question=query)

In [316]:
# Work on a copy so the cells below can overwrite the response, latency and
# score columns in `irr_q2` without touching the earlier results in `irr_q`.
irr_q2=irr_q.copy()

In [317]:
# Run every question in `irr_q2` through the relevance-filtered pipeline,
# keeping the answer and the elapsed time in seconds for each one.
time_list = []
response_list = []
# Read the questions from the same frame the results are written back to.
for query in tqdm(irr_q2['Questions'].values):
    # perf_counter is monotonic and high-resolution, which matters here
    # because a declined question returns in a few milliseconds.
    start = time.perf_counter()
    response = new_answer_question(query)
    time_list.append(time.perf_counter() - start)
    response_list.append(response)

100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 62.71it/s]


In [326]:
# Replace the copied response and latency columns with the values collected
# for this run.
irr_q2['response']=response_list
irr_q2['latency']=time_list

In [327]:
# Mark the responses in which the assistant declined to answer. The phrase is
# matched literally (regex=False), and a missing response counts as "did not
# decline" (na=False) instead of NaN, which mean() would skip silently.
irr_q2['irrelevant_score'] = irr_q2['response'].str.contains("I don't know", regex=False, na=False)

In [328]:
# Fraction of questions whose response contains "I don't know", and the mean
# latency in seconds, for the relevance-filtered run.
irr_q2[['irrelevant_score','latency']].mean()

irrelevant_score    1.000000
latency             0.015869
dtype: float64