In [1]:
from core.vsm_visualizer import VectorSpaceModel
from core.query import QueryHandler
from core.embeddings import EmbeddingSearcher
import ir_datasets
import ir_measures
from ir_measures import nDCG, P, Judged, R, AP, RR,AP_IA
import pandas as pd
import json

  from tqdm.autonotebook import tqdm, trange


## Initializing Datasets and Variables

In [2]:
# File names of the previously saved model states, resolved relative to the
# notebook's working directory.
# NOTE(review): the .pkl extension suggests pickle files — only load trusted ones.
VSM_STATE_PATH = "antique_state.pkl"
EMBEDDINGS_STATE_PATH = "antique_embed.pkl"

# Collection whose queries and qrels are iterated in the cells below.
dataset = ir_datasets.load("antique/test/non-offensive")

# Restore both searchers from disk, then wrap the vector space model in the
# query handler used for the non-embedding run.
vsm = VectorSpaceModel.load(VSM_STATE_PATH)
embed = EmbeddingSearcher.load(EMBEDDINGS_STATE_PATH)
query_handler = QueryHandler(vsm)

### Load the qrels into a DataFrame

In [3]:
# Flatten the dataset's relevance judgements into a DataFrame with one row per
# judgement, keeping only the query id, document id and relevance label.
qrels_columns = ['query_id', 'doc_id', 'relevance']

qrels_rows = [
    {'query_id': judgement.query_id,
     'doc_id': judgement.doc_id,
     'relevance': judgement.relevance}
    for judgement in dataset.qrels_iter()
]

qrels_table_results = pd.DataFrame(qrels_rows, columns=qrels_columns)

### Initializing Queries without embedding

In [4]:
# Run every query of the dataset through the vector-space-model search and
# collect one (query_id, doc_id, score) row per returned document. The
# resulting DataFrame is the "run" later passed to ir_measures.
queries = dataset.queries_iter()

query_rows = []
for query_id, query_text in queries:
    # NOTE(review): 0.001 is forwarded as `similarity_threshold`; presumably
    # hits scoring below it are dropped by QueryHandler.search — confirm.
    search_results = query_handler.search(query_text, similarity_threshold=0.001)
    # Each hit is a (document, similarity) pair; document[0] is used as the doc id.
    query_rows.extend(
        {'query_id': query_id, 'doc_id': document[0], 'score': similarity}
        for document, similarity in search_results
    )

query_table_results = pd.DataFrame(
    query_rows, columns=['query_id', 'doc_id', 'score'])



### Initializing Queries with embedding

In [5]:
# Run every query of the dataset through the embedding-based search and
# collect one (query_id, doc_id, score) row per returned document. The
# resulting DataFrame is the "run" later passed to ir_measures.
# A fresh iterator is requested because the loop below consumes it.
queries = dataset.queries_iter()

query_embed_rows = []
for query_id, query_text in queries:
    # NOTE(review): 0.25 is forwarded as `similarity_threshold`; presumably
    # hits scoring below it are dropped by EmbeddingSearcher.search — confirm.
    search_results = embed.search(query_text, similarity_threshold=0.25)
    # Each hit is a (document, similarity) pair; document[0] is used as the doc id.
    query_embed_rows.extend(
        {'query_id': query_id, 'doc_id': document[0], 'score': similarity}
        for document, similarity in search_results
    )

query_embed_results = pd.DataFrame(
    query_embed_rows, columns=['query_id', 'doc_id', 'score'])

### Calculate the Evaluation Results

#### Results with embeddings 

In [6]:
# Measures aggregated over all queries for the embedding-based run.
# NOTE(review): AP and RR use rel=3 here, whereas the "without embeddings"
# evaluation uses rel=1 for both, so those two measures are not directly
# comparable between the runs — confirm which relevance threshold is intended.
embedding_measures = [
    AP(rel=3),
    P(rel=1)@10,
    R(rel=1)@10,
    RR(rel=3),
]

# Arguments: measures, qrels DataFrame, run DataFrame.
ir_measures.calc_aggregate(embedding_measures, qrels_table_results, query_embed_results)

{AP(rel=3): 0.2934999571497649,
 R@10: 0.15516703579285385,
 P@10: 0.46477272727272717,
 RR(rel=3): 0.7089781417842417}

{AP(rel=3): 0.2934999571497649,
 R@10: 0.15516703579285385,
 P@10: 0.46477272727272717,
 RR(rel=3): 0.7089781417842417}

#### Results without embeddings

In [7]:
# Measures aggregated over all queries for the run without embeddings.
# NOTE(review): AP and RR use rel=1 here, whereas the "with embeddings"
# evaluation uses rel=3 for both, so those two measures are not directly
# comparable between the runs — confirm which relevance threshold is intended.
vsm_measures = [
    AP(rel=1),
    P(rel=1)@10,
    R(rel=1)@10,
    RR(rel=1),
]

# Arguments: measures, qrels DataFrame, run DataFrame.
ir_measures.calc_aggregate(vsm_measures, qrels_table_results, query_table_results)

{AP: 0.22596371811330124,
 RR: 0.7290534129990595,
 P@10: 0.3926136363636364,
 R@10: 0.13217329206111822}

{AP: 0.22596371811330124,
 RR: 0.7290534129990595,
 P@10: 0.3926136363636364,
 R@10: 0.13217329206111822}