In [1]:
import os

# Work from the project root so that `src` imports and the relative data paths
# used below resolve. The guard makes the cell idempotent: once the working
# directory already contains `src`, re-running it no longer climbs a level higher.
if not os.path.isdir('src'):
    os.chdir('../')

In [86]:
import os
import time
import json
import uuid
from urllib.parse import urlparse
from pathlib import Path

import pg8000
import mlflow
import dagshub
import nest_asyncio
import pandas as pd
import sqlalchemy
from dotenv import load_dotenv
from google.cloud.sql.connector import Connector, IPTypes
from sqlalchemy import text
from datasets import Dataset 

from langgraph.graph.state import CompiledStateGraph
from langchain_community.vectorstores import FAISS
from langfuse.callback import CallbackHandler
from langchain_groq import ChatGroq
from langchain.document_loaders import DataFrameLoader
from langchain_huggingface import HuggingFaceEmbeddings

from src.graph import create_graph
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

# Apply asyncio patch for compatibility: nest_asyncio lets an event loop be
# re-entered, which is needed when async code is driven from inside Jupyter's
# already-running loop.
nest_asyncio.apply()

In [84]:
# Root folder for evaluation artefacts and its sub-folders.
# os.path.join is used because the root already ends with "/"; plain string
# formatting produced doubled separators such as "evaluation//testset".
EVAL_PARENT_DIR = "evaluation/"
EVAL_TESTSET_DIR = os.path.join(EVAL_PARENT_DIR, "testset")
EVAL_RESULTS_DIR = os.path.join(EVAL_PARENT_DIR, "results")
EVAL_METRICS_DIR = os.path.join(EVAL_PARENT_DIR, "metrics")

In [4]:
# Local cache locations: FAISS indexes and product metadata CSVs
cache_dir = Path("cache")
faiss_dir = cache_dir / "faiss"
meta_dir = cache_dir / "meta"

# Create the folders up front: DataFrame.to_csv does not create missing parent
# directories, so writing a metadata CSV into a fresh checkout would fail.
for _cache_path in (faiss_dir, meta_dir):
    _cache_path.mkdir(parents=True, exist_ok=True)

In [29]:
# Read the .env file into the process environment before the keys are looked up below
load_dotenv()

True

In [85]:
## Load the API keys and service credentials
# Every name listed here must already be present in the environment (normally
# supplied through .env). Checking up front yields one readable error naming
# the missing variables instead of the opaque
# "TypeError: str expected, not NoneType" raised by `os.environ[k] = None`.
REQUIRED_ENV_VARS = (
    "HF_TOKEN",
    "OPENAI_API_KEY",
    "GROQ_API_KEY",
    # Langfuse
    "LANGFUSE_PUBLIC_KEY",
    "LANGFUSE_SECRET_KEY",
    "LANGFUSE_HOST",
    # MLflow
    "MLFLOW_TRACKING_URI",
    "MLFLOW_TRACKING_USERNAME",
    "MLFLOW_TRACKING_PASSWORD",
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )
# No re-assignment is needed: os.getenv reads os.environ, so every required
# value is already exported to libraries that look it up there.

os.environ["TOKENIZERS_PARALLELISM"] = "false"

## Postgres DB
# These may be None here; they are only consumed when a connection is opened.
instance_connection_name = os.getenv("INSTANCE_CONNECTION_NAME")
db_user = os.getenv("DB_USER")
db_pass = os.getenv("DB_PASS")
db_name = os.getenv("DB_NAME")

## Import RAG Test Questions

In [8]:
# List all Parquet files in the test-set directory.
# Sorted because os.listdir returns entries in arbitrary order, which would make
# the row order (and index) of the combined frame differ between runs/machines.
parquet_files = sorted(
    name for name in os.listdir(EVAL_TESTSET_DIR) if name.endswith('.parquet')
)

# Mapping stored in artifact/product_uuids.json (ASIN -> UUID, going by the name)
with open('artifact/product_uuids.json', 'r', encoding='utf-8') as json_file:
    asin_uuid_map = json.load(json_file)

# Invert it for reverse lookup (UUID as key, ASIN as value)
uuid_asin_map = {uuid_value: asin for asin, uuid_value in asin_uuid_map.items()}

# Read one test-set file and tag its rows with their origin
def process_parquet(f):
    """Load a single test-set Parquet file and annotate it with product identifiers.

    Args:
        f: File name (not a full path) located inside ``EVAL_TESTSET_DIR``.

    Returns:
        The loaded DataFrame with two extra columns: ``file_hash`` (the file
        name up to its first ``.``) and ``parent_asin`` (``file_hash`` looked
        up in ``uuid_asin_map``; NaN when the hash has no entry).
    """
    parquet_path = os.path.join(EVAL_TESTSET_DIR, f)
    file_hash, _, _ = f.partition('.')

    frame = pd.read_parquet(parquet_path)
    frame['file_hash'] = file_hash
    frame['parent_asin'] = frame['file_hash'].map(uuid_asin_map)
    return frame

# Load every test-set file, then stack the rows into one frame with a fresh index
dfs = list(map(process_parquet, parquet_files))
test_df = pd.concat(dfs, ignore_index=True)

In [10]:
# evolution_type coverage: share of each question type in a test set
def get_percent_coverage(df):
    """Summarise how often each ``evolution_type`` occurs in ``df``.

    Args:
        df: DataFrame with an ``evolution_type`` column.

    Returns:
        DataFrame with one row per distinct type, most frequent first, and the
        columns ``evolution_type``, ``count``, ``total`` (number of rows in
        ``df``) and ``percent`` (``100 * count / total``).
    """
    counts = df['evolution_type'].value_counts()
    # Name the columns explicitly rather than relying on the labels that
    # value_counts() assigns, which differ between pandas major versions.
    etype_count_df = counts.rename_axis('evolution_type').reset_index(name='count')
    etype_count_df['total'] = df.shape[0]
    etype_count_df['percent'] = (100 * etype_count_df['count']) / etype_count_df['total']
    return etype_count_df

# Coverage of each evolution_type in the combined test set
etype_count_df_test = get_percent_coverage(test_df)
etype_count_df_test

Unnamed: 0,evolution_type,count,total,percent
0,simple,55,83,66.26506
1,reasoning,25,83,30.120482
2,multi_context,3,83,3.614458


## Import the Graph

In [11]:
def connect_with_db() -> sqlalchemy.engine.base.Engine:
    """Create a SQLAlchemy engine whose connections come from the Cloud SQL connector.

    The connection settings are read from the module-level
    ``instance_connection_name``, ``db_user``, ``db_pass`` and ``db_name``.
    A private IP is requested when the ``PRIVATE_IP`` environment variable is
    set to a non-empty value; otherwise the public IP is used.

    Returns:
        An engine for the ``postgresql+pg8000`` dialect that opens each new
        connection through the connector.
    """
    if os.getenv("PRIVATE_IP"):
        ip_type = IPTypes.PRIVATE
    else:
        ip_type = IPTypes.PUBLIC

    connector = Connector()

    def getconn() -> pg8000.dbapi.Connection:
        # Invoked by SQLAlchemy each time it needs a fresh DB-API connection
        return connector.connect(
            instance_connection_name,
            "pg8000",
            user=db_user,
            password=db_pass,
            db=db_name,
            ip_type=ip_type,
        )

    return sqlalchemy.create_engine("postgresql+pg8000://", creator=getconn)

In [12]:
# Shared engine used by load_product_data for all queries
engine = connect_with_db()

In [13]:
def load_product_data(asin: str):
    """Fetch the review rows and the metadata rows of one product.

    Args:
        asin: Parent ASIN to match against ``userreviews.parent_asin`` and
            ``metadata.parent_asin``.

    Returns:
        ``(review_df, meta_df)``: DataFrames holding the selected columns of
        the matching rows (empty frames when nothing matches).

    Raises:
        Exception: Any error raised while querying is printed and then
            re-raised, so a failed lookup is reported at its source instead of
            surfacing as an ``UnboundLocalError`` on the return statement.
    """
    # The ASIN is passed as a bound parameter rather than interpolated into the
    # SQL text, so quoting and escaping are handled by the driver.
    review_query = text("""
        SELECT parent_asin, asin, helpful_vote, timestamp, verified_purchase, title, text
        FROM userreviews ur
        WHERE ur.parent_asin = :asin;
    """)
    meta_query = text("""
        SELECT parent_asin, main_category, title, average_rating, rating_number, features, description, price, store, categories, details
        FROM metadata md
        WHERE md.parent_asin = :asin;
    """)
    params = {"asin": asin}

    with engine.begin() as connection:
        try:
            # Fetch reviews
            review_result = connection.execute(review_query, params)
            review_df = pd.DataFrame(review_result.fetchall(), columns=review_result.keys())

            # Fetch metadata
            meta_result = connection.execute(meta_query, params)
            meta_df = pd.DataFrame(meta_result.fetchall(), columns=meta_result.keys())
        except Exception as e:
            print("Exception: {}".format(e))
            raise

    return review_df, meta_df

In [14]:
# Embedding models are expensive to load; keep one instance per model name so
# building a store for every product does not reload the same weights.
_EMBEDDING_MODELS = {}


def create_vector_store(review_df, model_name="all-MiniLM-L6-v2"):
    """Build a FAISS vector store from the reviews of one product.

    Args:
        review_df: DataFrame of reviews with a ``text`` column; rows whose
            ``text`` is missing are dropped before loading.
        model_name: Name of the HuggingFace embedding model. The default is the
            model this notebook has always used.

    Returns:
        The FAISS vector store built from the remaining review documents.
    """
    reviews_with_text = review_df[review_df['text'].notna()]
    review_docs = DataFrameLoader(reviews_with_text).load()

    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = HuggingFaceEmbeddings(model_name=model_name)
    embeddings = _EMBEDDING_MODELS[model_name]

    return FAISS.from_documents(documents=review_docs, embedding=embeddings)

In [15]:
# Compiled graph under evaluation, built with isMemory=False
# (presumably disables conversation memory — confirm in src/graph.py)
agent: CompiledStateGraph = create_graph(isMemory=False)

## Generate Results

In [17]:
# Cache keys of products whose FAISS index and metadata CSV were already written this session
vector_store_cache = []
# NOTE(review): `results` is not read by any cell visible in this notebook
results = []

In [18]:
# Start every row with an empty answer; the generation loop fills in the responses
test_df['answer'] = ''

In [33]:
# Ask the agent every test question and store its answer in test_df['answer'].
for index, row in test_df.iterrows():
    cache_key = f"{row['file_hash']}-{row['parent_asin']}"

    # On-disk locations handed to the agent. They get their own names so the
    # metadata DataFrame is not shadowed by a path.
    retriever = faiss_dir / cache_key
    meta_path = meta_dir / f"{cache_key}.csv"

    # Build the product's FAISS index and metadata CSV once per session
    if cache_key not in vector_store_cache:
        review_df, meta_df = load_product_data(row['parent_asin'])
        vector_db = create_vector_store(review_df)
        vector_db.save_local(str(retriever))
        meta_df.to_csv(meta_path, index=False)
        vector_store_cache.append(cache_key)

    # Fresh run id and Langfuse handler per question; traces are grouped by product
    run_id = str(uuid.uuid4())
    langfuse_handler = CallbackHandler(
        user_id="Model-Evaluation-1",
        session_id=cache_key
    )
    config = {"callbacks": [langfuse_handler], "run_id": run_id}

    try:
        response = agent.invoke({
            "question": row['question'],
            "meta_data": str(meta_path),
            "retriever": str(retriever)
        }, config=config)
        print(f"Response generated for query: {row['question']}")
        print(f"Response generated for id: {index}")

        test_df.at[index, 'answer'] = response['answer'].content
        time.sleep(2)
    except Exception as e:
        # Write through test_df: `row` is a copy produced by iterrows(), so
        # assigning to it would leave a stale answer from an earlier run in place.
        test_df.at[index, 'answer'] = ''
        print(f"Error invoking agent for Index: {index} - {e}")

[2024-11-15 14:20:09,349: INFO: SentenceTransformer: Use pytorch device_name: mps]
[2024-11-15 14:20:09,349: INFO: SentenceTransformer: Load pretrained SentenceTransformer: all-MiniLM-L6-v2]
Response generated for query: What has been your experience with the Dirt Devil over the two years of use?
Response generated for id: 0
Response generated for query: What type of game is suitable for a family gathering?
Response generated for id: 1
[2024-11-15 14:20:21,286: INFO: SentenceTransformer: Use pytorch device_name: mps]
[2024-11-15 14:20:21,287: INFO: SentenceTransformer: Load pretrained SentenceTransformer: all-MiniLM-L6-v2]
Response generated for query: What type of game is mentioned as being similar to Gin Rummy?
Response generated for id: 2
Response generated for query: Any game recommendations for Gin Rummy fans?
Response generated for id: 3
Response generated for query: What type of game can be played with grand-kids that is easy for youngsters but challenging for all?
Response gene

In [64]:
test_df.head()

Unnamed: 0,question,contexts,ground_truth,evolution_type,file_hash,parent_asin,answer
0,What has been your experience with the Dirt De...,[We just retired our dirt devil after almost 2...,My experience with the Dirt Devil over the two...,simple,36a3c3df-563c-412a-b2a2-a317e1f96f3f,B000050B3H,"Based on the reviews provided, I found one use..."
1,What type of game is suitable for a family gat...,[This is a fun game whether with your family o...,The game described is suitable for a family ga...,simple,5353bd53-9b26-4e21-8a59-ef55cd8a09d2,B00000IV35,"Based on the context provided (none), I don't ..."
2,What type of game is mentioned as being simila...,[If you like playing Gin Rummy you’ll love thi...,The answer to given question is not present in...,simple,5353bd53-9b26-4e21-8a59-ef55cd8a09d2,B00000IV35,The game mentioned as being similar to Gin Rum...
3,Any game recommendations for Gin Rummy fans?,[If you like playing Gin Rummy you’ll love thi...,The answer to given question is not present in...,reasoning,5353bd53-9b26-4e21-8a59-ef55cd8a09d2,B00000IV35,"Based on the product information available, I ..."
4,What type of game can be played with grand-kid...,[Fun game for even or odd number of players. ...,The game that can be played with grand-kids th...,simple,fe4ac26f-2ed9-4911-9134-77338a775a02,1933054395,"Since the context is empty, I'll provide a gen..."


In [None]:
# Wrap each row's `contexts` value in a one-element list, then build the dataset.
# NOTE(review): this rewrites test_df in place, so re-running the cell nests the
# value one level deeper each time — run it once per freshly built test_df.
test_df['contexts'] = test_df['contexts'].apply(lambda x: [x])
test_dataset = Dataset.from_pandas(test_df)

In [66]:
# Metrics scored for every test question, in the order they are passed to ragas
ragas_metrics = [context_precision, faithfulness, answer_relevancy, context_recall]

result = evaluate(test_dataset, metrics=ragas_metrics)

Evaluating:   1%|          | 2/332 [00:04<11:55,  2.17s/it]



Evaluating:  36%|███▌      | 118/332 [00:47<01:47,  1.98it/s]



Evaluating:  43%|████▎     | 144/332 [00:57<00:55,  3.36it/s]



Evaluating:  46%|████▌     | 152/332 [01:00<01:01,  2.91it/s]



Evaluating:  52%|█████▏    | 171/332 [01:05<00:54,  2.96it/s]



Evaluating:  82%|████████▏ | 271/332 [01:38<00:10,  6.04it/s]



Evaluating: 100%|██████████| 332/332 [02:21<00:00,  2.35it/s]


In [67]:
# Per-question scores as a DataFrame (the input columns plus one column per metric)
results_df = result.to_pandas()
results_df.head()

Unnamed: 0,question,contexts,ground_truth,evolution_type,file_hash,parent_asin,answer,context_precision,faithfulness,answer_relevancy,context_recall
0,What has been your experience with the Dirt De...,[We just retired our dirt devil after almost 2...,My experience with the Dirt Devil over the two...,simple,36a3c3df-563c-412a-b2a2-a317e1f96f3f,B000050B3H,"Based on the reviews provided, I found one use...",1.0,0.666667,0.0,1.0
1,What type of game is suitable for a family gat...,[This is a fun game whether with your family o...,The game described is suitable for a family ga...,simple,5353bd53-9b26-4e21-8a59-ef55cd8a09d2,B00000IV35,"Based on the context provided (none), I don't ...",1.0,0.0,0.0,1.0
2,What type of game is mentioned as being simila...,[If you like playing Gin Rummy you’ll love thi...,The answer to given question is not present in...,simple,5353bd53-9b26-4e21-8a59-ef55cd8a09d2,B00000IV35,The game mentioned as being similar to Gin Rum...,0.0,0.333333,0.992131,0.0
3,Any game recommendations for Gin Rummy fans?,[If you like playing Gin Rummy you’ll love thi...,The answer to given question is not present in...,reasoning,5353bd53-9b26-4e21-8a59-ef55cd8a09d2,B00000IV35,"Based on the product information available, I ...",0.0,0.166667,0.0,0.0
4,What type of game can be played with grand-kid...,[Fun game for even or odd number of players. ...,The game that can be played with grand-kids th...,simple,fe4ac26f-2ed9-4911-9134-77338a775a02,1933054395,"Since the context is empty, I'll provide a gen...",1.0,0.052632,0.0,1.0


In [68]:
# Score columns added by the evaluation
columns_of_interest = [
    'context_precision',
    'faithfulness',
    'answer_relevancy',
    'context_recall',
]

# Min, max, median and mean of every score column
summary_functions = ['min', 'max', 'median', 'mean']
statistics = results_df.loc[:, columns_of_interest].agg(summary_functions)
print(statistics)


        context_precision  faithfulness  answer_relevancy  context_recall
min              0.000000      0.000000          0.000000        0.000000
max              1.000000      1.000000          1.000000        1.000000
median           1.000000      0.333333          0.936512        1.000000
mean             0.746988      0.393843          0.689093        0.808434


In [None]:
# Mean of each score column within every evolution_type
grouped_means = results_df.groupby('evolution_type')[columns_of_interest].mean()
grouped_means

Unnamed: 0_level_0,context_precision,faithfulness,answer_relevancy,context_recall
evolution_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
multi_context,0.666667,0.399912,0.951992,0.766667
reasoning,0.72,0.414901,0.752246,0.826667
simple,0.763636,0.384584,0.646047,0.802424


### Saving the results

In [72]:
# Make sure the results folder exists; to_parquet does not create missing directories
os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)
test_df.to_parquet(f'{EVAL_RESULTS_DIR}/version0.parquet')

In [73]:
# Persist the per-question scores next to the generated answers
results_df.to_parquet(f'{EVAL_RESULTS_DIR}/version0-results.parquet')

In [80]:
# Per-type metric means as {metric: {evolution_type: value}}
metrics_dict = grouped_means.to_dict()

# Save the dictionary to a JSON file in the metrics folder (created on demand)
os.makedirs(EVAL_METRICS_DIR, exist_ok=True)
with open(f"{EVAL_METRICS_DIR}/version0-metrics.json", "w") as json_file:
    json.dump(metrics_dict, json_file, indent=4)

## Log to MLflow

In [77]:
# Declare the compiled graph as the model object for MLflow
# (presumably needed because the model is logged from the src/graph.py file — confirm)
mlflow.models.set_model(agent)

In [None]:
# Point MLflow at the DagsHub-hosted tracking server and model registry
dagshub.init(repo_owner='eCom-dev5', repo_name='eCom-Chatbot', mlflow=True)
mlflow.set_registry_uri("https://dagshub.com/eCom-dev5/eCom-Chatbot.mlflow")
# Scheme of the tracking URI (e.g. "https" or "file"); kept available for inspection
tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

with mlflow.start_run():
  # The model is logged and registered the same way for every tracking backend,
  # so a single call replaces the two identical branches.
  mlflow.langchain.log_model(
    lc_model="src/graph.py", # Path to our model Python file
    artifact_path="version0",
    pip_requirements="requirements.txt",
    registered_model_name="verta-gpt40m-llama3.18b-llama3.170b-version0"
  )

  # Prompts are recorded as run parameters. The literals start with exactly
  # three quotes, so no stray apostrophe ends up in the logged text.
  mlflow.log_param(
    'Supervisor Prompt',
    '''
    You are an efficient supervisor responsible for overseeing a conversation between the following agents: {members}. 

    If you got response from the Agent (response given below as "Generated Answer from the Agents:"), respond with 'FINISH' to move on to next step. 
    
    Based on the user's request, decide which agent should respond next. Each agent will complete a task and return their result. 
    
    There are two agents working alongside you:
        - Metadata: This agent has all metadata information about that product. 
        - Review-Vectorstore: This is a FAISS Vectorstore db containing documents related to all the user reviews for that product.
    
    If you got unsatisfied response from the Agents (Agent Throwing Errors like: "Metadata: Unable to generate result") ONLY THEN Call an Agent a **MAXIMUM of TWO TIMES** before responding with 'FINISH'.
    Once sufficient information is obtained from the Agents, respond with 'FINISH', after which Alpha, the final assistant, will provide the concluding guidance to the user.
    If the query is generic (Hello, How are you, etc) then route it to Alpha and respond with 'FINISH.' 

    If you got satisfactory response from the Agent (response given above), respond with 'FINISH' to move on to next step. 
    '''
  )

  mlflow.log_param(
    'Final Node Prompt',
    '''
    You are Alpha, a highly knowledgeable and efficient chatbot assistant designed to help users with questions related to products.
    Your primary role is to assist users by providing concise, accurate, and insightful responses based on the product information and reviews available to you.
    If you don’t have the necessary information to answer the question, simply say that you don’t know.

    There are two agents working alongside you:
    - Metadata: This agent provides answers related to a product. It has all the information about that product.
    - Review-Vectorstore: This is a FAISS Vectorstore db containing documents related to all the user reviews for one product.
    
    When a User (Shopper) comes to you for help, the question might have first been routed through either the Metadata or the Review-Vectorstore. 

    Your primary objective is to offer clear, concise, and helpful advice to the teacher, ensuring that they receive the most accurate and useful information to support their shopping needs.

    Instructions:
    - Analyze the product information and/or reviews provided.
    - Provide brief, clear, and helpful answers to user queries about the product.
    - Focus on delivering concise and actionable insights to help users make informed decisions.

    The responses from those agents are available to you, and if their answers were incomplete or unsatisfactory, you will find this reflected in the context field. 
    Your job is to analyze their responses, determine if they are adequate, and provide additional guidance or clarification where needed.
    Below is the context from one of the agents:
    '''
  )

  mlflow.log_param(
    'Follow-up Node Prompt',
    '''
    Given the following:
    User Question: {question}
    Answer: {answer}
    Context: {context}
    Please generate three possible follow-up questions that the user might ask, each on a new line, without any numbering or bullet points. Do not include any explanations—just list the follow-up questions.
    Format them like this:
    question1\nquestion2\nquestion3
    '''
  )

  # One metric per (evolution_type, metric) pair, named "<evolution_type>_<metric>"
  for evolution_type, row in grouped_means.iterrows():
    for metric_name, metric_value in row.items():
      metric_name_with_type = f"{evolution_type}_{metric_name}"
      mlflow.log_metric(metric_name_with_type, metric_value)



[2024-11-15 16:01:51,425: INFO: helpers: Initialized MLflow to track repo "eCom-dev5/eCom-Chatbot"]


[2024-11-15 16:01:51,426: INFO: helpers: Repository eCom-dev5/eCom-Chatbot initialized!]
https


Successfully registered model 'verta-gpt40m-llama3.18b-llama3.170b-version0'.
2024/11/15 16:01:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: verta-gpt40m-llama3.18b-llama3.170b-version0, version 1
Created version '1' of model 'verta-gpt40m-llama3.18b-llama3.170b-version0'.
2024/11/15 16:02:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run skittish-midge-312 at: https://dagshub.com/eCom-dev5/eCom-Chatbot.mlflow/#/experiments/0/runs/8e1e8126ed744d77bced6ee8faf2fdc2.
2024/11/15 16:02:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/eCom-dev5/eCom-Chatbot.mlflow/#/experiments/0.
