# Golden Datasets

## 1. Setup

### 1.1 Install & Import

In [None]:
!pip install -r requirements.txt

In [32]:
import chromadb
import pandas as pd
import os
from dotenv import load_dotenv
from tqdm import tqdm
import json
import matplotlib.pyplot as plt
import datasets
from sentence_transformers import SentenceTransformer, util
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
import threading
from typing import List
import numpy as np
import requests
import tiktoken
from voyageai import Client as VoyageClient
from openai import OpenAI as OpenAIClient
import torch
from anthropic import Anthropic as AnthropicClient
from llm_calls import generate_query, generate_query_with_example
from utils import *
from embedding_funcs import *
from chroma_funcs import *
from evaluation_funcs import *

### 1.2 Load API Keys

In [None]:
# Credentials are read from environment variables so that real keys never have
# to be saved inside the notebook. The placeholder strings are only fallbacks
# used when the corresponding variable is not set.

# Chroma Cloud
CHROMA_TENANT = os.environ.get("CHROMA_TENANT", "YOUR CHROMA TENANT ID")
X_CHROMA_TOKEN = os.environ.get("X_CHROMA_TOKEN", "YOUR CHROMA API KEY")

# Embedding Model
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR OPENAI API KEY")

# LLM
CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY", "YOUR CLAUDE API KEY")

### 1.3 Set Clients

In [None]:
# Chroma Cloud client: authenticated through the token header and pointed at
# the database used by this notebook.
chroma_auth_headers = {'x-chroma-token': X_CHROMA_TOKEN}
chroma_client = chromadb.HttpClient(
    host='api.trychroma.com',
    ssl=True,
    tenant=CHROMA_TENANT,
    database='golden-datasets-replication',
    headers=chroma_auth_headers,
)

# One client per provider: OpenAI for embeddings, Anthropic (Claude) as the LLM.
openai_client = OpenAIClient(api_key=OPENAI_API_KEY)
claude_client = AnthropicClient(api_key=CLAUDE_API_KEY)

### 1.4 Load Data

In [21]:
def _load_english_test_split(dataset_name):
    """Load the "en" configuration of a dataset and return its "test" split as a DataFrame."""
    return datasets.load_dataset(dataset_name, "en")["test"].to_pandas()


# The benchmark is published as three datasets: queries, corpus, and qrels.
wiki_queries = _load_english_test_split("ellamind/wikipedia-2023-11-retrieval-multilingual-queries")
wiki_corpus = _load_english_test_split("ellamind/wikipedia-2023-11-retrieval-multilingual-corpus")
wiki_qrels = _load_english_test_split("ellamind/wikipedia-2023-11-retrieval-multilingual-qrels")

In [22]:
# Keep only the qrels rows whose score is exactly 1.0.
has_full_score = wiki_qrels["score"] == 1.0
wiki_qrels = wiki_qrels[has_full_score]

# Combine queries, corpus, and the filtered qrels into one frame; later cells
# read the `query-text` and `corpus-text` columns from the result.
wiki_qrels = combined_datasets_dataframes(
    wiki_queries,
    wiki_corpus,
    wiki_qrels,
)

## 2. Embed Corpus & Store in Chroma

In [25]:
# Plain Python lists of the corpus ids and texts, in the frame's row order.
wiki_corpus_ids, wiki_corpus_texts = (
    wiki_corpus[column].tolist() for column in ("_id", "text")
)

In [None]:
# Collection for the embedded corpus, created with `hnsw:space` set to cosine.
wiki_collection = chroma_client.get_or_create_collection(
    name="wiki-text-embedding-3-small",
    metadata={"hnsw:space": "cosine"},
)

# Embed every corpus text, then store ids, texts, and embeddings in the collection.
wiki_corpus_embeddings = openai_embed_in_batches(
    openai_client,
    wiki_corpus_texts,
    "text-embedding-3-small",
)
collection_add_in_batches(
    wiki_collection,
    wiki_corpus_ids,
    wiki_corpus_texts,
    wiki_corpus_embeddings,
)

# In-memory lookup: corpus id -> {"text": ..., "embedding": ...}, paired by position.
wiki_corpus_lookup = {
    corpus_id: {"text": corpus_text, "embedding": corpus_embedding}
    for corpus_id, corpus_text, corpus_embedding in zip(
        wiki_corpus_ids, wiki_corpus_texts, wiki_corpus_embeddings
    )
}

## 3. Simple Query Generation

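Demonstrates that models have memorized public benchmarks: a query is generated from each corpus passage alone and then compared with the benchmark's original query using embedding cosine similarity.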

In [None]:
# Generate one query per qrels row, using only that row's corpus text.
wiki_simple_generated_queries = []

corpus_passages = tqdm(
    wiki_qrels['corpus-text'],
    total=len(wiki_qrels),
    desc="Generating simple queries...",
)
for passage in corpus_passages:
    wiki_simple_generated_queries.append(generate_query(claude_client, passage))

# Store the generated queries next to the originals, row for row.
wiki_qrels["simple-generated-query-text"] = wiki_simple_generated_queries

In [None]:
def _lookup_by_corpus_id(embeddings):
    """Map each corpus id to {"text": corpus text, "embedding": embedding}, paired by position."""
    return {
        corpus_id: {"text": corpus_text, "embedding": embedding}
        for corpus_id, corpus_text, embedding in zip(wiki_corpus_ids, wiki_corpus_texts, embeddings)
    }


wiki_original_queries = wiki_qrels["query-text"].tolist()

# Embed the benchmark's original queries and the simple generated ones with the same model.
wiki_original_query_embeddings = openai_embed_in_batches(
    openai_client,
    wiki_original_queries,
    "text-embedding-3-small",
)
wiki_simple_generated_query_embeddings = openai_embed_in_batches(
    openai_client,
    wiki_simple_generated_queries,
    "text-embedding-3-small",
)

# NOTE(review): both lookups are keyed by *corpus* ids and store *corpus* texts,
# while the embeddings are *query* embeddings in qrels row order; zip() also
# stops at the shortest input. Confirm this is the keying score_query_query
# expects (query ids / query texts may have been intended).
wiki_original_query_embeddings_lookup = _lookup_by_corpus_id(wiki_original_query_embeddings)
wiki_simple_generated_query_embeddings_lookup = _lookup_by_corpus_id(wiki_simple_generated_query_embeddings)

In [None]:
# Score the original queries against the simple generated ones; the resulting
# frame carries a `query-query-score` column that the next cells plot and rank.
wiki_query_query_scores = score_query_query(
    wiki_qrels,
    wiki_original_query_embeddings_lookup,
    wiki_simple_generated_query_embeddings_lookup,
)
wiki_query_query_scores.head(5)

In [None]:
# Histogram of the original-vs-generated query scores, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))

ax.hist(
    wiki_query_query_scores["query-query-score"],
    bins=30,
    range=(0, 1),
    density=True,  # normalised so the bars integrate to 1
    alpha=0.5,
    edgecolor='black',
    label="Score",
)

ax.set_xlabel("Cosine Similarity")
ax.set_ylabel("Normalized Frequency")
ax.set_title("text-embedding-3-small")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Order the frame in place from highest to lowest score.
wiki_query_query_scores.sort_values(by="query-query-score", ascending=False, inplace=True)

# Print the ten highest-scoring original/generated query pairs.
separator = "-" * 80
for i, scored_pair in wiki_query_query_scores.head(10).iterrows():
    print(
        f"Score: {scored_pair['query-query-score']:.4f}",
        f"Original Query: {scored_pair['query-text']}",
        f"Generated Query: {scored_pair['simple-generated-query-text']}",
        separator,
        sep="\n",
    )

## 4. Distinct Query Generation

In [None]:
# Generate one query per qrels row, giving the LLM both the row's original
# query (as the example) and its corpus text.
wiki_distinct_generated_queries = []

for _, row in tqdm(wiki_qrels.iterrows(), total=len(wiki_qrels), desc="Generating distinct queries..."):
    # The frame's columns are hyphenated (`query-text`, `corpus-text`), as used
    # by every other cell; the underscore spellings raise KeyError.
    query = row['query-text']
    corpus = row['corpus-text']
    generated_query = generate_query_with_example(claude_client, query, corpus)
    wiki_distinct_generated_queries.append(generated_query)

# Store the generated queries next to the originals, row for row.
wiki_qrels["distinct-generated-query-text"] = wiki_distinct_generated_queries

In [None]:
# Embed the distinct generated queries with the same model as the corpus.
wiki_distinct_generated_query_embeddings = openai_embed_in_batches(
    openai_client,
    wiki_distinct_generated_queries,
    "text-embedding-3-small",
)

# NOTE(review): keyed by *corpus* ids and storing *corpus* texts, paired by
# position with *query* embeddings (zip() stops at the shortest input).
# Confirm this keying is intended.
wiki_distinct_generated_query_embeddings_lookup = {
    corpus_id: {"text": corpus_text, "embedding": query_embedding}
    for corpus_id, corpus_text, query_embedding in zip(
        wiki_corpus_ids,
        wiki_corpus_texts,
        wiki_distinct_generated_query_embeddings,
    )
}

In [None]:
# Cut-offs at which the retrieval metrics are evaluated.
k_values = [1, 3, 5, 10]

# NOTE(review): both get_results calls receive `wiki_corpus_ids` as the ids
# argument although the queries come from the qrels rows. Confirm whether
# query ids are expected there.

# Distinct generated queries
wiki_distinct_gen_results = get_results(
    wiki_collection,
    wiki_distinct_generated_queries,
    wiki_corpus_ids,
    wiki_distinct_generated_query_embeddings,
)
wiki_distinct_gen_metrics = evaluate(k_values, wiki_qrels, wiki_distinct_gen_results)

# Original benchmark queries
wiki_original_results = get_results(
    wiki_collection,
    wiki_original_queries,
    wiki_corpus_ids,
    wiki_original_query_embeddings,
)
wiki_original_metrics = evaluate(k_values, wiki_qrels, wiki_original_results)

def _metrics_to_row(query_type, metrics):
    """Flatten one nested metrics mapping into a single table row labelled with `query_type`."""
    flat_row = {"Query Type": query_type}
    flat_row.update({
        # Column name is "<metric name>@<k>", where <k> is the piece after the
        # first "@" of the inner key.
        f"{metric_name}@{k_metric.split('@')[1]}": value
        for metric_name, metric_values in metrics.items()
        for k_metric, value in metric_values.items()
    })
    return flat_row


query_types = ["Generated", "Original"]
metrics_data = [wiki_distinct_gen_metrics, wiki_original_metrics]

# One row per query type, one column per metric and k.
rows = [
    _metrics_to_row(query_type, metrics)
    for query_type, metrics in zip(query_types, metrics_data)
]

comparison_df = pd.DataFrame(rows)

comparison_df