In [3]:
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

# Connect to the Elasticsearch instance on localhost:9200 and build a BM25
# retriever on top of that document store.
elastic_doc_store = ElasticsearchDocumentStore(
    hosts="http://localhost:9200",
)
elastic_bm25_retriever = ElasticsearchBM25Retriever(
    document_store=elastic_doc_store,
)

In [10]:
from haystack_integrations.components.retrievers.weaviate.bm25_retriever import WeaviateBM25Retriever
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore

# Connect to the Weaviate instance on localhost:8088 and build a BM25
# retriever on top of that document store.
weaviate_doc_store = WeaviateDocumentStore(
    url="http://localhost:8088",
)
weaviate_bm25_retriever = WeaviateBM25Retriever(
    document_store=weaviate_doc_store,
)

In [9]:
# Sample query used to sanity-check the Elasticsearch retriever.
question = "What is Chromogisaurus?"

# Ask for the single best BM25 match and display the raw result dict.
elastic_fetched = elastic_bm25_retriever.run(
    query=question,
    top_k=1,
)
elastic_fetched

{'documents': [Document(id=1a804fab-317c-4baf-b95c-fa864fe2a906, content: 'The Ischigualasto Formation (radiometrically dated at 231-230 million years old) has produced the ea...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 40}, score: 5.6333437)]}

In [12]:
# Full text of the top-ranked Elasticsearch document (the repr above truncates it).
econt = elastic_fetched["documents"][0].content
econt


"The Ischigualasto Formation (radiometrically dated at 231-230 million years old) has produced the early saurischian Eoraptor , originally considered a member of the Herrerasauridae but now considered to be an early sauropodomorph, along with the herrerasaurids Herrerasaurus and Sanjuansaurus , and the sauropodomorphs Chromogisaurus , Eodromaeus , and Panphagia . Eoraptor 's likely resemblance to the common ancestor of all dinosaurs suggests that the first dinosaurs would have been small, bipedal predators. The Santa Maria Formation (radiometrically dated to be older, at 233.23 million years old) has produced the herrerasaurids Gnathovorax and Staurikosaurus , along with the sauropodomorphs Bagualosaurus , Buriolestes , Guaibasaurus , Macrocollum , Nhandumirim , Pampadromaeus , Saturnalia , and Unaysaurus . The Pebbly Arkose Formation, which is of uncertain age but was likely comparable to the other two, has produced the sauropodomorph Mbiresaurus , along with an unnamed herrerasaurid.

In [14]:
# Relevance score that Elasticsearch assigned to its top-ranked document.
escore = elastic_fetched["documents"][0].score
escore

5.6333437

In [11]:
# Run the same query against Weaviate's BM25 retriever and display the raw result dict.
weaviate_fetched = weaviate_bm25_retriever.run(
    query=question,
    top_k=1,
)
weaviate_fetched


{'documents': [Document(id=a0ec45c3-c0cd-4604-a176-594456f11b36, content: 'Scholarly descriptions of what would now be recognized as dinosaur bones first appeared in the late ...', meta: {'split_id': 23.0, 'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'file_path': 'dinosaur-page.html'}, score: 1.9453471899032593, embedding: vector of size 1536)]}

In [17]:
# Repeat the sample query against both retrievers, keeping the text and score
# of each top hit.
question = "What is Chromogisaurus?"

elastic_fetched = elastic_bm25_retriever.run(
    query=question,
    top_k=1,
)
econt = elastic_fetched["documents"][0].content
escore = elastic_fetched["documents"][0].score

weaviate_fetched = weaviate_bm25_retriever.run(
    query=question,
    top_k=1,
)
wcont = weaviate_fetched["documents"][0].content
wscore = weaviate_fetched["documents"][0].score

# Evaluation set: each key is a query, each value is a snippet of the chunk
# that the query is expected to retrieve. Several queries deliberately share
# the same expected chunk.
term_chunk_dict = {
    "What is Chromogisaurus?": "The Ischigualasto Formation (radiometrically dated at 231-230 million years old) has produced the early saurischian Eoraptor",
    "Where are Neornithes?": "Under phylogenetic nomenclature, dinosaurs are usually defined as the group",
    "What is Passer domesticus?": "Research by Matthew G. Baron, David B. Norman, and Paul M. Barrett in 2017",
    "What is Dimetrodon?": "Using one of the above definitions, dinosaurs can be generally described as archosaurs",
    "Who was Gideon Mantell?": "Between 1815 and 1824, the Rev William Buckland, the first Reader of Geology",
    "What is Hylaeosaurus?": "soon became of great interest to European and American scientists, and in 1842 the English paleontologist Sir",
    "Who was Weishampel?": "Prior to the dinosaur renaissance, dinosaurs were mostly classified using the traditional",
    "What was Edmontosaurus annectens?": "Dinosaur fossils are not limited to bones, but also include imprints or mineralized remains",
    "What is Sinosauropteryx?": "Starting from the 1990s, major discoveries of exceptionally preserved fossils in deposits known",
    "Tell me something about Scipionyx": "Starting from the 1990s, major discoveries of exceptionally preserved fossils in deposits known",
    "What is immunohistochemical technique?": "Concurrently, a line of work led by Mary Higby Schweitzer, Jack Horner, and colleagues",
    "specimen of Hypacrosaurus": "Concurrently, a line of work led by Mary Higby Schweitzer, Jack Horner, and colleagues",
    "What is Jaklapallisaurus?": "Less well-preserved remains of the sauropodomorphs Jaklapallisaurus and Nambalia",
    "What is crocodylomorphs?": "When dinosaurs appeared, they were not the dominant terrestrial animals.",
    "What is geranoidids?": "The surviving lineages of neornithine birds, including the ancestors of modern ratites",
    "What is mihirungs?": "The surviving lineages of neornithine birds, including the ancestors of modern ratites",
}

In [18]:
def _top_content_and_score(fetched):
    """Return ``(content, score)`` of the first document in a retriever result.

    ``fetched`` is the dict returned by a retriever's ``run`` call; its
    ``"documents"`` list may be empty (a retriever can return fewer documents
    than ``top_k``), in which case ``("", None)`` is returned instead of
    raising ``IndexError``.
    """
    documents = fetched["documents"]
    if not documents:
        return "", None
    best = documents[0]
    # Fall back to "" so the substring check below never sees None
    # (NOTE(review): assumes `content` can be None for non-text documents).
    return best.content or "", best.score


# One record per query: whether each backend's top hit contains the expected
# snippet, plus the score each backend reported for that hit.
results = []

for question, expected_substring in term_chunk_dict.items():
    # Top-1 BM25 hit from Elasticsearch.
    elastic_fetched = elastic_bm25_retriever.run(query=question, top_k=1)
    econt, escore = _top_content_and_score(elastic_fetched)

    # Top-1 BM25 hit from Weaviate.
    weaviate_fetched = weaviate_bm25_retriever.run(query=question, top_k=1)
    wcont, wscore = _top_content_and_score(weaviate_fetched)

    # A fetch counts as correct when the expected snippet occurs verbatim in
    # the retrieved chunk; a query with no hits at all counts as incorrect.
    ecorrect = expected_substring in econt
    wcorrect = expected_substring in wcont

    results.append(
        {
            "question": question,
            "ecorrect": ecorrect,
            "wcorrect": wcorrect,
            "escore": escore,
            "wscore": wscore,
        }
    )

# Derive the totals from the recorded results so they cannot drift apart.
ecorrect_count = sum(result["ecorrect"] for result in results)
wcorrect_count = sum(result["wcorrect"] for result in results)

# Print the per-query results
for result in results:
    print(result)

# Print the counts of correct fetches
print(f"Elasticsearch correct fetches: {ecorrect_count}")
print(f"Weaviate correct fetches: {wcorrect_count}")

{'question': 'What is Chromogisaurus?', 'ecorrect': True, 'wcorrect': False, 'escore': 5.6333437, 'wscore': 1.9453471899032593}
{'question': 'Where are Neornithes?', 'ecorrect': True, 'wcorrect': True, 'escore': 6.0377126, 'wscore': 1.779759407043457}
{'question': 'What is Passer domesticus?', 'ecorrect': True, 'wcorrect': True, 'escore': 8.608093, 'wscore': 3.847595691680908}
{'question': 'What is Dimetrodon?', 'ecorrect': True, 'wcorrect': True, 'escore': 7.6543937, 'wscore': 2.8587589263916016}
{'question': 'Who was Gideon Mantell?', 'ecorrect': True, 'wcorrect': True, 'escore': 17.640362, 'wscore': 7.714737892150879}
{'question': 'What is Hylaeosaurus?', 'ecorrect': True, 'wcorrect': False, 'escore': 4.508199, 'wscore': 1.9453471899032593}
{'question': 'Who was Weishampel?', 'ecorrect': True, 'wcorrect': True, 'escore': 5.4487224, 'wscore': 1.7702919244766235}
{'question': 'What was Edmontosaurus annectens?', 'ecorrect': True, 'wcorrect': True, 'escore': 8.038213, 'wscore': 3.06295

- 16 queries considered where BM25 should do better than semantic search. 
- Elasticsearch and Weaviate both have the exact same chunks of documents. 
- Elasticsearch BM25 performs better than Weaviate BM25
    - Elasticsearch: 15/16 with high scores, often >5
    - Weaviate: 7/16 with lower, less confident scores (~2)
- Only one question where Elasticsearch was wrong: "Tell me something about Scipionyx"
    - Possibly, short, terse search queries built around unique words are easier, while long sentences in which the target term is hidden among common words are more difficult
    - Let's check whether Elasticsearch and Weaviate would at least have had the target chunk in the top 3

Result: **Elasticsearch BM25 haystack integration out-of-the-box is better than Weaviate BM25 haystack integration out-of-the-box.**

More experiments:

In [19]:
# Retry the one query Elasticsearch missed, this time asking both retrievers
# for their three best matches.
question = "Tell me something about Scipionyx"

elastic_fetched = elastic_bm25_retriever.run(
    query=question,
    top_k=3,
)
weaviate_fetched = weaviate_bm25_retriever.run(
    query=question,
    top_k=3,
)




----------------------


In [20]:
# Display the Elasticsearch result for the most recently run query.
elastic_fetched

{'documents': [Document(id=42752ffb-540e-4bd2-bc9c-0e39c858765f, content: 'The popular preoccupation with dinosaurs has ensured their appearance in literature, film, and other...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 127}, score: 4.8926344),
  Document(id=25c2e0c0-ca9f-4c1f-ae85-23f8dab8ee3e, content: 'The smallest dinosaur known is the bee hummingbird, with a length of only 5 centimeters (2.0 in) and...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 73}, score: 3.8518543),
  Document(id=20ad4c35-b545-440a-9d67-3e839bb7a68f, content: 'The crests and frills of some dinosaurs, like the marginocephalians, theropods and lambeosaurines, m...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 77}, score: 3.377768

In [21]:
# Display the Weaviate result for the most recently run query.
weaviate_fetched

{'documents': [Document(id=58389b00-00c1-4da6-9eb7-cc189cae832f, content: 'The smallest dinosaur known is the bee hummingbird, with a length of only 5 centimeters (2.0 in) and...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 73.0, 'file_path': 'dinosaur-page.html'}, score: 1.7288570404052734, embedding: vector of size 1536),
  Document(id=bda7c905-e2f1-48b5-a2c2-8804c3082bf5, content: 'When dinosaurs appeared, they were not the dominant terrestrial animals. The terrestrial habitats we...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 42.0, 'file_path': 'dinosaur-page.html'}, score: 1.4528007507324219, embedding: vector of size 1536),
  Document(id=1bf410cd-051a-4514-9af8-e5195f68c914, content: 'The popular preoccupation with dinosaurs has ensured their appearance in literature, film, and other...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5d

What if we only gave the word "Scipionyx"?

In [22]:
# Query with the bare keyword only, again requesting up to three matches
# from each retriever.
question = "Scipionyx"

elastic_fetched = elastic_bm25_retriever.run(
    query=question,
    top_k=3,
)
weaviate_fetched = weaviate_bm25_retriever.run(
    query=question,
    top_k=3,
)

In [23]:
# Display the Elasticsearch result for the most recently run query.
elastic_fetched

{'documents': [Document(id=8a774d6a-a007-4925-91ce-0e9724619862, content: 'Starting from the 1990s, major discoveries of exceptionally preserved fossils in deposits known as c...', meta: {'file_path': '../dinosaur-page.html', 'source_id': '397c39cd02b4ddec650906e433b995103ffd8164879c3fa3cc6a9956405e652f', 'split_id': 35}, score: 3.050006)]}

In [24]:
# Display the Weaviate result for the most recently run query.
weaviate_fetched

{'documents': [Document(id=dd898970-de03-4d55-9136-9ae2039056b4, content: 'Starting from the 1990s, major discoveries of exceptionally preserved fossils in deposits known as c...', meta: {'source_id': 'cccb7d931975c65babedb9ad3867015939fba680621923f60517da5e5dea3c32', 'split_id': 35.0, 'file_path': 'dinosaur-page.html'}, score: 1.398360013961792, embedding: vector of size 1536)]}

Both got it correct, although Elasticsearch had a higher score.

Maybe if common words are stripped off and only distinct keywords are provided as the query to the BM25 retriever, it would do a good job.

In this case, strip off the common words 'tell', 'me', 'something', 'about' and give only words such as 'Scipionyx' to BM25. Keep only low-frequency words for keyword search.