# Imports

In [2]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
import time
import csv
import json
from sentence_transformers import SentenceTransformer, util

In [3]:
# Client for the Elasticsearch instance listening on localhost:9200.
es = Elasticsearch(hosts="http://localhost:9200")
# Show the server's info response as a plain dict to confirm the connection.
es.info().body

{'name': 'd588378d475d',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'BtCeb_f9ThK4DCX5TfYNEg',
 'version': {'number': '8.3.1',
  'build_type': 'docker',
  'build_hash': 'b9a6b2867996ba92ceac66cb5bafc6db25e7910e',
  'build_date': '2022-06-29T18:39:55.731992798Z',
  'build_snapshot': False,
  'lucene_version': '9.2.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

# Load the documents

In [4]:
# Document collection; the "id_right" and "text_right" columns are used below.
df = pd.read_csv(filepath_or_buffer="wikIR1k/documents.csv")

# Create index

In [6]:
# Explicit mapping: "text_right" is indexed as analysed full text so that it
# can be searched with a `match` query.
mappings = {
    "properties": {
        "text_right": {"type": "text"}
    }
}

# Creating an index that already exists raises an error, so only create it
# when it is missing. This keeps the cell safe to re-run.
if not es.indices.exists(index="documents"):
    es.indices.create(index="documents", mappings=mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents'})

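Delete the index only if something went wrong and it has to be rebuilt (skip this cell on a normal top-to-bottom run: it removes the index and everything indexed in it)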

In [5]:
# Manual reset only: drops the "documents" index together with everything
# indexed in it. `ignore_unavailable=True` turns the call into a no-op instead
# of an error when the index does not exist.
es.indices.delete(index="documents", ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

# Index the documents

In [7]:
# One bulk action per DataFrame row: the row's "id_right" becomes the
# Elasticsearch `_id`, and only the text is stored in `_source`.
bulk_data = [
    {
        "_index": "documents",
        "_id": row["id_right"],
        "_source": {"text_right": row["text_right"]},
    }
    for _, row in df.iterrows()
]

# Send every action to Elasticsearch; the result is displayed as the cell output.
bulk(es, bulk_data)

(369721, [])

# Read test queries

In [8]:
# Test queries; the "id_left" and "text_left" columns are used below.
qdf = pd.read_csv(filepath_or_buffer="wikIR1k/test/queries.csv")

# Run test queries for top-20

In [10]:
# For every test query keep its top-20 Elasticsearch hits as a list of
# quintuples: [query id, query text, document id, score, document text].
# Lists (not tuples) are used so that individual fields can be updated in place.
quintuples = []

for i, row in qdf.iterrows():
    # The query is passed through `query=` rather than the deprecated `body=`
    # argument of `es.search`; the query itself is unchanged.
    resp = es.search(
        index="documents",
        query={
            "bool": {
                "should": {
                    "match": {
                        "text_right": row["text_left"],
                    }
                }
            }
        },
        size=20,
    )

    # One entry per returned hit, in the order Elasticsearch ranked them.
    quintuple = [
        [
            row["id_left"],
            row["text_left"],
            hit["_id"],
            hit["_score"],
            hit["_source"]["text_right"],
        ]
        for hit in resp["hits"]["hits"]
    ]

    quintuples.append(quintuple)

  resp = es.search(


# Replace the Elasticsearch score with the dot-product score

In [11]:
# Sentence-embedding model used to re-score the retrieved passages.
model = SentenceTransformer(model_name_or_path='msmarco-distilbert-dot-v5')

In [12]:
# Overwrite the score slot (index 3) of every quintuple with the dot product
# between the query embedding and the passage embedding.
for query in quintuples:
    # A query without hits has nothing to re-score.
    if not query:
        continue

    # Every quintuple of a query carries the same query text at index 1, so
    # the query is encoded once per query instead of once per hit.
    query_embedding = model.encode(query[0][1])

    # Encode all passages of this query in a single batch (one row per passage).
    passage_embeddings = model.encode([quintuple[4] for quintuple in query])

    # dot_score yields a (1, number_of_passages) matrix; row 0 holds the
    # score of each passage for this query.
    scores = util.dot_score(query_embedding, passage_embeddings)[0]

    for quintuple, new_score in zip(query, scores):
        quintuple[3] = float(new_score)

# Sorting by the new score

In [13]:
def score(elem):
    """Sort key: return the value stored at index 3 (the score) of a result row."""
    row_score = elem[3]
    return row_score

# Re-rank each query's hits in place, highest score first (ties keep their
# previous relative order because the sort is stable).
for hits in quintuples:
    hits[:] = sorted(hits, key=score, reverse=True)

# Write the qrels and run files in TREC format

In [14]:
# Write one tab-separated judgement line per retrieved hit:
#   <query id> \t 0 \t <document id> \t <relevance>
# Relevance is "2" when the document id equals the query id, otherwise "1".
# NOTE(review): every retrieved document therefore gets a relevance of at
# least 1, whether or not it is actually relevant — confirm this is intended
# before using this file for evaluation.
with open("qrels", "w") as qf:
    for hits in quintuples:
        for entry in hits:
            query_id, doc_id = entry[0], entry[2]
            rel = "2" if query_id == int(doc_id) else "1"
            qf.write("\t".join([str(query_id), "0", str(doc_id), rel]) + "\n")

In [15]:
# Write one space-separated run line per hit:
#   <query id> Q0 <document id> <rank> <score> ES
# Ranks are 0-based and follow the current order of each query's hit list.
with open("cos.res", "w") as cf:
    for hits in quintuples:
        for rank, entry in enumerate(hits):
            fields = [str(entry[0]), "Q0", str(entry[2]), str(rank), str(entry[3]), "ES"]
            cf.write(" ".join(fields) + "\n")

# Evaluation

In [16]:
import ir_measures
from ir_measures import *

In [18]:
# Evaluate the re-ranked run ("cos.res") against the judgements stored in
# wikIR1k/test/qrels.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')
run = ir_measures.read_trec_run('cos.res')
measures = [P@10, P@20, MAP@20]
ir_measures.calc_aggregate(measures, qrels, run)

{P@10: 0.22899999999999995,
 P@20: 0.1480000000000001,
 AP@20: 0.17797525806652736}

In [19]:
# Evaluate the BM25 run stored in wikIR1k/test against the judgements stored
# in wikIR1k/test/qrels.
qrels = ir_measures.read_trec_qrels('wikIR1k/test/qrels')
run = ir_measures.read_trec_run('wikIR1k/test/BM25.res')
measures = [P@10, P@20, MAP@20]
ir_measures.calc_aggregate(measures, qrels, run)

{P@10: 0.1319999999999999,
 P@20: 0.09499999999999999,
 AP@20: 0.0974761709661997}

In [20]:
# Evaluate the BM25 run stored in wikIR1k/test against the local "qrels" file.
# NOTE(review): presumably this is the file written by this notebook, in which
# every retrieved document is labelled relevant — confirm these numbers are
# comparable with the ones computed from wikIR1k/test/qrels.
qrels = ir_measures.read_trec_qrels('qrels')
run = ir_measures.read_trec_run('wikIR1k/test/BM25.res')
measures = [P@10, P@20, MAP@20]
ir_measures.calc_aggregate(measures, qrels, run)

{P@10: 0.5545454545454546,
 P@20: 0.48232323232323215,
 AP@20: 0.46159871962333476}