# Evaluate bo767 retrieval recall accuracy with NV-Ingest and Milvus

In this notebook, we'll use NV-ingest and LlamaIndex to get the end-to-end recall accuracy of a retrieval pipeline made up of NV-Ingest's extraction and embedding tasks and a Milvus vector database (VDB).

In [62]:
from pymilvus import MilvusClient

milvus_client = MilvusClient("http://localhost:19530")
milvus_client.drop_collection(collection_name='bo767')

## Ingestion

In [None]:
from nv_ingest_client.client import Ingestor

ingestor = (
    Ingestor()
    .files("../data/nv-ingest/bo767/*.pdf")
    .extract(
        extract_text=True,
        extract_tables=True,
        extract_charts=True,
        extract_images=False,
        text_depth="page",
    ).embed()
    .vdb_upload(
        collection_name="bo767",
        stream=False,
    )
)

results = ingestor.ingest(show_progress=True)

In [37]:
len(results)

767

In [3]:
# Optional: save results
import pickle

with open('bo767_results.pkl', 'wb') as f:
    pickle.dump(results, f)

In [26]:
# Optional: load results
import pickle

with open('bo767_results.pkl', 'rb') as f:
    results = pickle.load(f)

## Recall

In [23]:
from nv_ingest_client.util.milvus import nvingest_retrieval
from collections import defaultdict
import os
import numpy as np

def get_recall_scores(query_df, collection_name):
    hits = defaultdict(list)
    all_answers = nvingest_retrieval(
        query_df["query"].to_list(),
        collection_name,
        hybrid=False,
        milvus_uri="http://localhost:19530",
        embedding_endpoint="http://localhost:8012/v1",
        model_name="nvidia/llama-3.2-nv-embedqa-1b-v2",
        top_k=10,
        gpu_search=False,
        
    )

    for i in range(len(query_df)):
        expected_pdf_page = query_df['pdf_page'][i]
        retrieved_answers = all_answers[i]
        retrieved_pdfs = [os.path.basename(result['entity']['source']['source_id']).split('.')[0] for result in retrieved_answers]
        retrieved_pages = [str(result['entity']['content_metadata']['page_number']) for result in retrieved_answers]
        retrieved_pdf_pages = [f"{pdf}_{page}" for pdf, page in zip(retrieved_pdfs, retrieved_pages)]    

        for k in [1, 3, 5, 10]:
            hits[k].append(expected_pdf_page in retrieved_pdf_pages[:k])
    
    for k in hits:
        print(f'  - Recall @{k}: {np.mean(hits[k]) :.3f}')

In [61]:
df_query = pd.read_csv('../data/bo767_query_gt.csv')
df_query

Unnamed: 0,query,pdf,page,modality,pdf_page
0,How much was the ARtillery Intelligence projec...,1102434,19,text,1102434_20
1,How much revenue of AR advertising is expected...,1102434,3,text,1102434_4
2,What types of statistics were utilized by Rein...,1096078,3,text,1096078_4
3,What was the maximum amount requested for cond...,1054125,1,text,1054125_2
4,What is the median household income for the Ci...,1246906,7,text,1246906_8
...,...,...,...,...,...
986,"After the 2008 recession, what percentage of p...",2384395,6,chart,2384395_7
987,what were the top 3 major religious groups in ...,2392676,5,chart,2392676_6
988,What percentage of people in the world identif...,2392676,5,chart,2392676_6
989,"Between 2003 and 2019, has the household mortg...",2410699,189,chart,2410699_190


In [44]:
get_recall_scores(df_query, "bo767")

  - Recall @1: 0.582
  - Recall @3: 0.796
  - Recall @5: 0.854
  - Recall @10: 0.903


In [58]:
for modality in df_query.modality.unique():
    print(modality)
    get_recall_scores(df_query.query(f"modality=='{modality}'").reset_index(drop=True), "multimodal_nr_2")

text
  - Recall @1: 0.611
  - Recall @3: 0.816
  - Recall @5: 0.865
  - Recall @10: 0.916
table
  - Recall @1: 0.455
  - Recall @3: 0.719
  - Recall @5: 0.796
  - Recall @10: 0.860
chart
  - Recall @1: 0.642
  - Recall @3: 0.828
  - Recall @5: 0.884
  - Recall @10: 0.918
