# Imports

In [34]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
import time
import csv
import json

In [2]:
es = Elasticsearch("http://localhost:9200")
es.info().body

{'name': '46e14d42de6c',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'flpIrxYNSAS8AyXKm8HB5A',
 'version': {'number': '8.3.1',
  'build_type': 'docker',
  'build_hash': 'b9a6b2867996ba92ceac66cb5bafc6db25e7910e',
  'build_date': '2022-06-29T18:39:55.731992798Z',
  'build_snapshot': False,
  'lucene_version': '9.2.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

# Load the document

In [3]:
df = pd.read_csv("documents.csv")

# Create Index

In [4]:
mappings = {
    "properties": {
        "text_right": {"type": "text"}
    }
}

In [20]:
es.indices.create(index="documents", mappings=mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents'})

Delete Index if any problem

In [19]:
es.indices.delete(index="stem_documents")

ObjectApiResponse({'acknowledged': True})

# Index document

In [21]:
start_time = time.time()

bulk_data = []
for i,row in df.iterrows():
    bulk_data.append(
        {
            "_index" : "documents",
            "_id" : row["id_right"],
            "_source": {        
                "text_right": row["text_right"],
            }
        }
    )

bulk(es, bulk_data)

print("--- %s seconds ---" % (time.time() - start_time))

--- 53.17474389076233 seconds ---


# Create new index for stemming

In [7]:
stem_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter":[
                        "lowercase",
                        "english_stem"
                    ]
                }
            },
            "filter": {
                "english_stem": {
                    "type": "stemmer",
                    "language": "english"
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "text_right": {
                "type": "text",
                "analyzer": "my_analyzer"
            }
        }
    }
}

In [22]:
es.indices.create(index='stem_documents', body=stem_settings)

  es.indices.create(index='stem_documents', body=stem_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'stem_documents'})

# Index document to new index with stemming

In [23]:
start_time = time.time()

stem_bulk_data = []

for i,row in df.iterrows():
    stem_bulk_data.append(
        {
            "_index": "stem_documents",
            "_id": row["id_right"],
            "_source": {        
                "text_right": row["text_right"],
            }
        }
    )

bulk(es, stem_bulk_data)

print("--- %s seconds ---" % (time.time() - start_time))

--- 57.96376705169678 seconds ---


# Read test queries

In [10]:
qdf = pd.read_csv("queries.csv")

# Run queries on index with no stemming

In [38]:
triples = []
total_time = 0
number_of_queries = 0

for i,row in qdf.iterrows():
    triple = []
    resp = es.search(
        index="documents",
        body={
            "query": {
                "bool": {
                    "should": {
                        "match": {
                            "text_right": row["text_left"],
                        }
                    }
                }
            }
        },
        size=20
    )
    for hit in resp['hits']['hits']:
        triple.append((row["id_left"], hit['_id'], hit['_score']))
    total_time += resp['took']
    number_of_queries += 1
    
    triples.append(triple)
    
print("Average query time is: ")
print(total_time/number_of_queries)

  resp = es.search(


Average query time is: 
2.16


In [35]:
with open('triples_no_stemming.csv', 'w') as f:
    write = csv.writer(f)
    write.writerows(triples)

# Run queries on index with stemming

In [39]:
stem_triples = []
total_time = 0

for i,row in qdf.iterrows():
    stem_triple = []
    resp = es.search(
        index="stem_documents",
        body={
            "query": {
                "bool": {
                    "should": {
                        "match": {
                            "text_right": row["text_left"],
                        }
                    }
                }
            }
        },
        size=20
    )
    for hit in resp['hits']['hits']:
        stem_triple.append((row["id_left"], hit['_id'], hit['_score']))
    total_time += resp['took']
    
    stem_triples.append(stem_triple)
    
print("Average query time is: ")
print(total_time/number_of_queries)

  resp = es.search(


Average query time is: 
2.1


In [37]:
with open('triples_with_stemming.csv', 'w') as f:
    write = csv.writer(f)
    write.writerows(stem_triples)

# Formatting my runs in TREC format

In [43]:
with open("qrels_no_stemming", "w") as qnsf:
    for query in triples:
        for triple in query:
            if triple[0] == int(triple[1]):
                rel = "2"
            else:
                rel = "1"
            qnsf.write(str(triple[0]) + "\t0\t" + str(triple[1]) + "\t" + rel + "\n")

In [44]:
with open("qrels_with_stemming", "w") as qwsf:
    for query in stem_triples:
        for triple in query:
            if triple[0] == int(triple[1]):
                rel = "2"
            else:
                rel = "1"
            qwsf.write(str(triple[0]) + "\t0\t" + str(triple[1]) + "\t" + rel + "\n")

In [47]:
with open("es_no_stemming.res", "w") as esnsf:
    for query in triples:
        rank = 0
        for triple in query:
            esnsf.write(str(triple[0]) + " Q0 " + str(triple[1]) + " " + str(rank) + " " + str(triple[2]) + " ES\n")
            rank += 1

In [48]:
with open("es_with_stemming.res", "w") as eswsf:
    for query in stem_triples:
        rank = 0
        for triple in query:
            eswsf.write(str(triple[0]) + " Q0 " + str(triple[1]) + " " + str(rank) + " " + str(triple[2]) + " ES\n")
            rank += 1

In [49]:
import ir_measures
from ir_measures import *

In [50]:
qrels = ir_measures.read_trec_qrels('test/qrels')
run = ir_measures.read_trec_run('es_no_stemming.res')
ir_measures.calc_aggregate([P@10, P@20, MAP], qrels, run)

{AP: 0.14804628973499517, P@10: 0.20699999999999988, P@20: 0.14850000000000008}

In [51]:
qrels = ir_measures.read_trec_qrels('test/qrels')
run = ir_measures.read_trec_run('es_with_stemming.res')
ir_measures.calc_aggregate([P@10, P@20, MAP], qrels, run)

{AP: 0.1460596884041505, P@10: 0.20999999999999994, P@20: 0.142}

In [52]:
qrels = ir_measures.read_trec_qrels('test/qrels')
run = ir_measures.read_trec_run('test/BM25.res')
ir_measures.calc_aggregate([P@10, P@20, MAP], qrels, run)

{AP: 0.11196168401599797, P@10: 0.1319999999999999, P@20: 0.09499999999999999}

In [53]:
qrels = ir_measures.read_trec_qrels('qrels_no_stemming')
run = ir_measures.read_trec_run('test/BM25.res')
ir_measures.calc_aggregate([P@10, P@20, MAP], qrels, run)

{AP: 0.5194326254697494, P@10: 0.5545454545454546, P@20: 0.48282828282828266}

In [54]:
qrels = ir_measures.read_trec_qrels('qrels_with_stemming')
run = ir_measures.read_trec_run('test/BM25.res')
ir_measures.calc_aggregate([P@10, P@20, MAP], qrels, run)

{AP: 0.5471741948123588, P@10: 0.6111111111111109, P@20: 0.5055555555555554}