In [1]:
import pandas as pd
from datasets import load_dataset
from langchain.chat_models import init_chat_model
from qdrant_client import QdrantClient, models
from ragas import EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

from app.core.config import settings as config
from app.ingestion.ingest import split_documents, generate_embeddings, store_documents
from app.ingestion.pdf_loader.pdf_to_text import extract_text_from_pdf
from app.models.models import State
from app.workflow.rag_workflow import RAGWorkflow

In [None]:
# Load the first 10,000 training rows of the `jamescalam/ai-arxiv2-chunks`
# dataset; later cells embed these rows and index them in Qdrant.
dataset = load_dataset(
    path="jamescalam/ai-arxiv2-chunks",
    split="train[:10000]",
)
dataset  # bare last expression so the notebook displays the dataset's repr

In [None]:
# Peek at a single record (index 1) to see which fields each row carries.
dataset[1]

In [None]:
# Connect to the Qdrant instance named in the project settings.
client = QdrantClient(config.qdrant_url, api_key=config.qdrant_api_key)

# Create the evaluation collection only when it is missing. An unconditional
# `create_collection` fails if the collection already exists on the server,
# which breaks re-running this cell (or Restart & Run All) after a first run.
if not client.collection_exists(collection_name="ragas-test"):
    client.create_collection(
        collection_name="ragas-test",
        vectors_config={
            # Named dense vector, sized from settings, compared by cosine distance.
            "dense": models.VectorParams(
                size=config.embeddings_dim, distance=models.Distance.COSINE
            )
        },
        sparse_vectors_config={
            # Named sparse vector slot with the IDF modifier enabled.
            "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF)
        },
    )

In [None]:
from tqdm.auto import tqdm
from qdrant_client import models
import uuid
from openai import OpenAI
import time

# OpenAI-compatible client used to request embeddings for several texts per call.
embeddings_client = OpenAI(
    api_key=config.embeddings_api_key, base_url=config.embeddings_base_url
)

# Convert dataset to pandas dataframe for easier manipulation
data = dataset.to_pandas()

batch_size = 50  # rows per Qdrant upsert
embedding_batch_size = 10  # texts per embeddings request; kept small to avoid 413 errors

print(f"Starting to upsert {len(data)} documents to Qdrant...")

upserted_count = 0  # points confirmed written to Qdrant
skipped_count = 0  # rows dropped because embedding or upsert failed

for i in tqdm(range(0, len(data), batch_size)):
    # iloc slicing clamps at the end of the frame, so the last batch may be short.
    batch = data.iloc[i : i + batch_size]
    texts = batch["chunk"].tolist()

    # One slot per row; a slot stays None when its embedding request failed.
    embeddings = [None] * len(texts)
    for j in range(0, len(texts), embedding_batch_size):
        sub_texts = texts[j : j + embedding_batch_size]

        try:
            response = embeddings_client.embeddings.create(
                input=sub_texts, model=config.embeddings_model
            )
            sub_embeddings = [emb.embedding for emb in response.data]
            if len(sub_embeddings) != len(sub_texts):
                raise ValueError(
                    f"expected {len(sub_texts)} embeddings, got {len(sub_embeddings)}"
                )
        except Exception as e:
            # Best effort: report and skip these rows. Storing all-zero vectors
            # instead would index points that carry no direction, so they could
            # never be ranked meaningfully under cosine distance and would
            # silently distort the retrieval evaluation.
            print(
                f"Error getting embeddings for rows {i + j}-{i + j + len(sub_texts) - 1}: {e}"
            )
            continue

        embeddings[j : j + len(sub_texts)] = sub_embeddings

        # Small delay to avoid rate limiting
        time.sleep(0.1)

    # Build points only for rows that actually received an embedding.
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),  # fresh random id per point
            payload={
                "text": row["chunk"],
                "source": row["source"],
                "title": row["title"],
                "metadata": {"source": row["source"], "title": row["title"]},
            },
            vector={"dense": embedding},  # embedding is already a list from the API
        )
        for (_, row), embedding in zip(batch.iterrows(), embeddings)
        if embedding is not None
    ]
    skipped_count += len(texts) - len(points)

    if not points:
        continue

    try:
        client.upsert(collection_name="ragas-test", points=points)
    except Exception as e:
        print(f"Error upserting batch {i}: {e}")
        skipped_count += len(points)
    else:
        upserted_count += len(points)

# Report what really happened instead of assuming every row was stored.
print(
    f"Upserted {upserted_count} of {len(data)} documents to Qdrant "
    f"({skipped_count} skipped due to errors)."
)

In [None]:
# Smoke test: embed one question and fetch its three nearest dense neighbours.
query = (
    "What is the impact of encoding the input prompt "
    "on inference speed in generative inference?"
)

# The embeddings endpoint returns a response object; the vector itself lives
# on the first (and only) entry of `data`.
query_embedding_response = embeddings_client.embeddings.create(
    model=config.embeddings_model,
    input=query,
)
query_vector = query_embedding_response.data[0].embedding

results = client.query_points(
    collection_name="ragas-test",
    using="dense",
    query=query_vector,
    limit=3,
)
print(results)

In [None]:
# Same retrieval smoke test, but with a readable per-hit report.
query = (
    "What is the impact of encoding the input prompt "
    "on inference speed in generative inference?"
)

# Embed the question with the same model that was used for the stored chunks.
query_embedding_response = embeddings_client.embeddings.create(
    model=config.embeddings_model,
    input=query,
)

# Pull the raw vector out of the response; Qdrant needs the vector, not the
# response wrapper.
query_vector = query_embedding_response.data[0].embedding

print(f"Query vector type: {type(query_vector)}")
print(f"Query vector length: {len(query_vector)}")

# Search the named dense vector of the evaluation collection.
results = client.query_points(
    collection_name="ragas-test",
    using="dense",
    query=query_vector,
    limit=3,
)

print(f"\nFound {len(results.points)} results:")
for rank, hit in enumerate(results.points, start=1):
    payload = hit.payload
    print(f"\nResult {rank} (Score: {hit.score:.4f}):")
    print(f"Text: {payload['text'][:200]}...")
    print(f"Source: {payload['source']}")
    print(f"Title: {payload['title']}")

In [None]:
# Load the training split of `aurelio-ai/ai-arxiv2-ragas-mixtral`; later cells
# read its "question" and "ground_truth" fields to drive the evaluation.
ragas_data = load_dataset(
    path="aurelio-ai/ai-arxiv2-ragas-mixtral",
    split="train",
)
ragas_data  # bare last expression so the notebook displays the dataset's repr

In [None]:
# Inspect the first evaluation record to see which fields are available.
ragas_data[0]

In [None]:
# Run the RAG workflow against the "ragas-test" collection and collect one
# record (question, contexts, answer, ground truth) per evaluated question.
print("=== CORRECTED RAG EVALUATION ===")

# Import needed classes
from app.db.vector_db import VectorDB

# Create a temporary config with the correct collection name
import copy

# Work on a deep copy so the shared settings object is left untouched.
temp_config = copy.deepcopy(config)
temp_config.qdrant_collection_name = "ragas-test"

# Initialize the workflow, then point it at the copied config and a vector DB
# built from that config so retrieval hits the evaluation collection.
corrected_workflow = RAGWorkflow()
corrected_workflow.config = temp_config
corrected_workflow.vector_db = VectorDB(temp_config)

corrected_graph = corrected_workflow.build()

limit = 3  # Start even smaller for testing

print(f"Starting corrected evaluation of {limit} questions...")

# Collect plain dict records and build the DataFrame once at the end. Growing
# a frame with pd.concat inside the loop copies it on every iteration and, when
# seeded from an empty frame, starts from float-typed empty columns.
eval_records = []

for i, row in tqdm(enumerate(ragas_data), total=limit):
    if i >= limit:
        break

    question = row["question"]
    ground_truth = row["ground_truth"]

    try:
        # Each question gets its own thread id so runs do not share state.
        config_dict = {"configurable": {"thread_id": f"corrected_eval_{i}"}}
        result = corrected_graph.invoke({"question": question}, config=config_dict)

        answer = result["answer"]

        # Retrieved documents expose their text via `.text` when available;
        # anything else is stringified. A missing/empty context yields [].
        contexts = [
            doc.text if hasattr(doc, "text") else str(doc)
            for doc in (result["context"] or [])
        ]

        print(f"\nQ{i+1}: {question[:80]}...")
        print(f"Found {len(contexts)} contexts")
        print(f"A{i+1}: {answer[:80]}...")

    except Exception as e:
        # Best effort: keep the row with a sentinel answer so the failure is
        # visible in the results table instead of aborting the whole run.
        print(f"Error processing question {i+1}: {e}")
        answer = "ERROR"
        contexts = []

    eval_records.append(
        {
            "question": question,
            "contexts": contexts,
            "answer": answer,
            "ground_truth": ground_truth,
        }
    )

# Explicit column list keeps the column order stable and still yields the
# expected columns when no question was processed.
df_corrected = pd.DataFrame(
    eval_records, columns=["question", "contexts", "answer", "ground_truth"]
)

questions_with_contexts = sum(1 for c in df_corrected["contexts"] if len(c) > 0)

print("\nCorrected evaluation complete!")
print(f"Questions with contexts found: {questions_with_contexts}")
print(f"Questions with no contexts: {len(df_corrected) - questions_with_contexts}")

df_corrected

In [None]:
# Score the collected RAG answers with RAGAS metrics.
import math

print("=== RAGAS EVALUATION ===")


def _mean_metric_scores(per_sample_scores):
    """Average each metric over all samples.

    Args:
        per_sample_scores: iterable of ``{metric_name: score}`` mappings, one
            per evaluated sample (a single mapping is also accepted).

    Returns:
        dict mapping metric name to the mean of its numeric, non-NaN scores.
        Metrics with no usable score are omitted.
    """
    if isinstance(per_sample_scores, dict):
        per_sample_scores = [per_sample_scores]

    collected = {}
    for sample_scores in per_sample_scores:
        for metric, value in sample_scores.items():
            try:
                value = float(value)
            except (TypeError, ValueError):
                continue  # non-numeric entry; nothing to average
            if math.isnan(value):
                continue  # failed metric computations surface as NaN
            collected.setdefault(metric, []).append(value)

    return {metric: sum(vals) / len(vals) for metric, vals in collected.items()}


# Convert the results DataFrame into the record format RAGAS expects.
evaluation_data = []

for _, row in df_corrected.iterrows():
    ground_truth = row["ground_truth"]
    evaluation_data.append(
        {
            "user_input": row["question"],
            "retrieved_contexts": row["contexts"],  # List of context strings
            "response": row["answer"],
            # RAGAS takes a single reference string; use the first entry when
            # the ground truth is stored as a list.
            "reference": (
                ground_truth[0] if isinstance(ground_truth, list) else ground_truth
            ),
        }
    )

print(f"Prepared {len(evaluation_data)} samples for RAGAS evaluation")

# Create RAGAS evaluation dataset
evaluation_dataset = EvaluationDataset.from_list(evaluation_data)

# Initialize LLM for evaluation
llm = init_chat_model(
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", model_provider="together"
)
evaluator_llm = LangchainLLMWrapper(llm)

print(
    "Starting RAGAS evaluation with metrics: LLMContextRecall, Faithfulness, FactualCorrectness"
)

# Only the evaluate() call is guarded: reporting happens in the `else` branch
# so a formatting problem is never misreported as an evaluation/API failure.
try:
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],
        llm=evaluator_llm,
    )
except Exception as e:
    print(f"Error during RAGAS evaluation: {e}")
    print("This might be due to API rate limits or configuration issues.")

    # Show what we have so far
    print("\n=== EVALUATION DATA PREVIEW ===")
    for i, item in enumerate(evaluation_data):
        print(f"\nSample {i+1}:")
        print(f"Question: {item['user_input'][:100]}...")
        print(f"Response: {item['response'][:100]}...")
        print(f"Contexts: {len(item['retrieved_contexts'])} documents")
        print(f"Reference: {item['reference'][:100]}...")
else:
    print("\n=== RAGAS EVALUATION RESULTS ===")
    print(result)

    # `scores` holds one {metric: score} dict per sample, so it has no
    # `.items()`; average per metric to get a summary.
    per_sample_scores = getattr(result, "scores", None)
    if per_sample_scores:
        print("\n=== METRICS SUMMARY ===")
        for metric, score in _mean_metric_scores(per_sample_scores).items():
            print(f"{metric}: {score:.4f}")