# Evaluate a ranking algorithm
Compare the results returned by querying an index against the labeled results, using Normalized Discounted Cumulative Gain (NDCG).

In [None]:
# autoreload extension: mode 2 asks IPython to re-import modules before running
# each cell, so edits to local modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# dotenv extension: loads variables from a .env file into the environment
# (presumably the source of PINECONE_API_KEY / PINECONE_ENV read below; confirm)
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import hashlib
import json
import os
import re

import pandas as pd
import pinecone
from tqdm.autonotebook import tqdm
import wandb

from models.rank_eval import get_embeddings_fn, get_ndcg

In [None]:
# configure
# JSON file of labeled queries and results that the index output is scored against
labels_path = "../data/rank_eval/labeled_results/2023-04-25.json"

# directory where the raw results returned by the index are written as CSV
results_dir = "../data/rank_eval/unlabeled_results/"

# pinecone
# credentials are read from the environment; a missing variable raises KeyError here
pinecone_key = os.environ["PINECONE_API_KEY"]
pinecone_region = os.environ["PINECONE_ENV"]
# name of the pinecone index that is queried
pinecone_index = "conf-ada-002"

# query embedder
# exactly one embedding_model line should be active; the other model names are
# kept commented out so the model can be switched by moving the comment marker
embedding_model = "text-embedding-ada-002"
# embedding_model = "all-mpnet-base-v2"
# embedding_model = "multi-qa-mpnet-base-dot-v1"
# embedding_model = "multi-qa-MiniLM-L6-cos-v1"
# embedding_model = "multi-qa-distilbert-cos-v1"
# embedding_model = "embed-english-v2.0"

# embedding function for the selected model; called later as get_embeddings([query])[0]
get_embeddings = get_embeddings_fn(embedding_model)

In [None]:
# init Weights and Biases
# start a run so the NDCG metrics computed later can be stored in its summary;
# the config records which labels file and embedding model this run used
wandb.init(
    project="iloveconference",
    name="39_rank_eval_index",
    notes="",
    config={
        "labels_path": labels_path,
        "embedding_model": embedding_model,
    }
)

In [None]:
# init pinecone
# configure the client with the API key and environment read from os.environ
pinecone.init(
    api_key=pinecone_key,
    environment=pinecone_region,
)
# connect to index
# `index` is the handle that the evaluation loop queries
index = pinecone.Index(pinecone_index)

## Read labels

In [None]:
# Load the labeled queries. JSON interchange files are UTF-8, so decode
# explicitly rather than relying on the platform's default encoding (which
# can mis-decode non-ASCII result text, e.g. on Windows).
with open(labels_path, encoding="utf-8") as f:
    labels = json.load(f)
# show how many labeled entries were loaded
print(len(labels))

### Make each result id a hash of the result's contents so that prod and dev ids can be compared

In [None]:
def hash_result(text):
    """Return an MD5 hex digest of *text* that ignores whitespace and case.

    All whitespace is removed and the remainder is lower-cased before
    hashing, so two strings that differ only in spacing, line breaks or
    letter case produce the same id.
    """
    # splitting on whitespace runs and re-joining drops every whitespace char
    squeezed = "".join(text.split())
    normalized = squeezed.lower()
    digest = hashlib.md5(normalized.encode())
    return digest.hexdigest()

In [None]:
# Rebuild every label so that each result's id is derived from its text
# (via hash_result) rather than from whatever id the source system assigned.
# Only the query, and each result's id/score/text, are kept.
labels = [
    {
        "query": entry["query"],
        "results": [
            {
                "id": hash_result(hit["text"]),
                "score": hit["score"],
                "text": hit["text"],
            }
            for hit in entry["results"]
        ],
    }
    for entry in labels
]

In [None]:
# display the first label to check the re-keyed structure
labels[0]

## Compute Average Normalized Discounted Cumulative Gain

In [None]:
# Fail early with a clear message; averaging over zero labels would otherwise
# end in a ZeroDivisionError.
if not labels:
    raise ValueError("labels is empty; nothing to evaluate")

# Embed and query the index ONCE per label and reuse the matches for every
# NDCG cutoff. Retrieval does not depend on the cutoff, so repeating it per
# cutoff tripled the embedding/index calls and allowed the cutoffs to be
# scored on different retrievals than the one saved to CSV.
query_results = []  # flat rows (query, text, result id) for the CSV dump
retrieved = []      # (query, labeled results, index matches) per label
for label in labels:
    query = label["query"]
    query_embedding = get_embeddings([query])[0]
    res = index.query([query_embedding], top_k=20, include_metadata=True)
    matches = res["matches"]
    query_results.extend(
        {
            "query": query,
            # newlines are flattened so each result stays on one CSV line
            "text": match["metadata"]["text"].replace("\n", " "),
            "result": match["id"],
        }
        for match in matches
    )
    retrieved.append((query, label["results"], matches))

# save query results
os.makedirs(results_dir, exist_ok=True)  # to_csv does not create missing directories
today = datetime.today().strftime("%Y-%m-%d")
filename = os.path.join(results_dir, f"{embedding_model}-{today}.csv")
pd.DataFrame(query_results).to_csv(filename, index=False)

for ndcg_at in [10, 5, 3]:
    print(f"\nNDCG@{ndcg_at}")
    total_ndcg = 0.0
    for query, labeled_results, matches in retrieved:
        # Rebuilt for every cutoff so get_ndcg always receives a fresh list.
        # Ids are content hashes so they line up with the re-keyed labels.
        results = [{
            "id": hash_result(match["metadata"]["text"]),
            "score": match["score"],
            "text": match["metadata"]["text"],
        } for match in matches]
        ndcg = get_ndcg(labeled_results, results, k=ndcg_at)
        print(query, ndcg)
        total_ndcg += ndcg

    # compute metric
    metric = total_ndcg / len(retrieved)
    metric_label = f"average NDCG@{ndcg_at}"
    print(metric_label, metric)

    # save metric to wandb
    wandb.run.summary[metric_label] = metric

In [None]:
# close the wandb run started by wandb.init above
wandb.finish()