In [37]:
%%capture
!pip install llama-index
!pip install llama-index-embeddings-huggingface

In [7]:
import json
import os
import matplotlib.pyplot as plt
import glob
import requests
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import tqdm

import nest_asyncio
nest_asyncio.apply()

from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.llms.openai import OpenAI

import os
import pandas as pd

In [12]:
from typing import List

In [8]:
# Paste a key here only for local, throwaway runs; never commit a real key.
# When this is left empty, any OPENAI_API_KEY already present in the
# environment is kept instead of being overwritten with an empty string.
open_api_key = ""
if open_api_key:
    os.environ['OPENAI_API_KEY'] = open_api_key

## Load data

In [38]:
# Load the test questions and their ground-truth labels. The context managers
# close each file handle as soon as parsing finishes instead of leaking it.
with open('test_set.json') as data_file:
    data = json.load(data_file)
with open('test_ground_truth.json') as labels_file:
    labels = json.load(labels_file)

# Displayed as the cell output so the two sizes can be compared at a glance.
len(data), len(labels)

(500, 500)

In [39]:
from llama_index.core import Document

# One Document per test-set entry; its text is the string form of the entry's
# 'CONTEXTS' field. tqdm wraps the iteration to show progress.
documents = [
    Document(text=str(entry['CONTEXTS']))
    for entry in tqdm.tqdm(data.values())
]

print(len(documents))

100%|██████████| 500/500 [00:00<00:00, 11714.95it/s]

500





In [42]:
import collections  # no longer used by this cell; left in place for any other cell that relies on it

# Map each question string to the string form of its contexts. A plain dict is
# used because the previous `collections.defaultdict()` had no default factory
# and therefore behaved exactly like a regular dict.
# NOTE: entries whose question strings are identical collapse into a single
# key (the last one wins).
data_dict = {
    str(entry['QUESTION']): str(entry['CONTEXTS'])
    for entry in tqdm.tqdm(data.values())
}

100%|██████████| 500/500 [00:00<00:00, 37842.43it/s]


## Build the vector index

In [40]:
# Instantiate the llama-index OpenAI wrapper for the "gpt-4" model.
# NOTE(review): this statement only binds the name `llm`; confirm that it is
# passed to (or configured as the default for) whatever is expected to use gpt-4.
llm = OpenAI(model="gpt-4")

In [41]:
from llama_index.core.embeddings import resolve_embed_model

# Node parser configured with chunk_size=512. It is only constructed here; the
# index below is built straight from `documents` and does not receive it.
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)

# Embedding model resolved from the "local:" BAAI/bge-small-en specifier.
embed_model = resolve_embed_model("local:BAAI/bge-small-en")

# Embed every document and build the vector index over them.
vector_index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Running example

In [53]:
# Answer one sample question end-to-end: retrieve from the index and let the
# gpt-4 `llm` created earlier synthesize the answer. Passing `llm=llm` is what
# makes the query engine use that client; previously it was created but never
# handed to the engine.
query_engine = vector_index.as_query_engine(llm=llm)
response_vector = query_engine.query("Is there a connection between sublingual varices and hypertension?")
response_vector.response

'Yes, there is a connection between sublingual varices and hypertension.'

## Evaluation

In [62]:
def evaluate_metrics(d, vector_index, top_k):
    """Compute retrieval hit rate and mean reciprocal rank (MRR).

    Args:
        d: Mapping from question text to the context string that should be
            retrieved for that question.
        vector_index: Object exposing ``as_retriever(similarity_top_k=...)``;
            the returned retriever must provide ``retrieve(question)`` yielding
            items with a ``text`` attribute.
        top_k: Number of results to retrieve per question.

    Returns:
        Tuple ``(hit_rate, mrr)``. ``hit_rate`` is the fraction of questions
        whose expected context is a substring of at least one retrieved item's
        text. ``mrr`` is the mean over all questions of ``1 / rank`` of the
        first such item, where questions without a hit contribute 0. Both
        values are ``0.0`` when ``d`` is empty.
    """
    total = len(d)
    if total == 0:
        # Nothing to evaluate; return zeros rather than dividing by zero.
        return 0.0, 0.0

    retriever = vector_index.as_retriever(similarity_top_k=top_k)

    hits = 0
    reciprocal_rank_sum = 0.0
    for question, true_context in d.items():
        retrieved_items = retriever.retrieve(question)

        # Only the best-ranked match counts, so stop at the first one.
        first_hit_rank = next(
            (
                rank
                for rank, item in enumerate(retrieved_items, start=1)
                if true_context in item.text
            ),
            None,
        )
        if first_hit_rank is not None:
            hits += 1
            reciprocal_rank_sum += 1 / first_hit_rank

    return hits / total, reciprocal_rank_sum / total

# Score the retriever at several cut-offs and collect the metrics per k.
top_k_values = [1, 2, 3]
results = {
    k: dict(zip(('Hit Rate', 'MRR'), evaluate_metrics(data_dict, vector_index, k)))
    for k in top_k_values
}

# Report one line per cut-off.
for k, metrics in results.items():
    print(f"Top {k}: Hit Rate = {metrics['Hit Rate']:.2f}, MRR = {metrics['MRR']:.4f}")

Top 1: Hit Rate = 0.98, MRR = 0.9840
Top 2: Hit Rate = 0.99, MRR = 0.9870
Top 3: Hit Rate = 0.99, MRR = 0.9870
