<a href="https://colab.research.google.com/github/jlonge4/gen_ai_utils/blob/main/dynamic_top_k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install farm-haystack[beir] beir fmeval

In [None]:
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
import numpy as np
import pandas as pd


class DynamicRetriever(BaseComponent):
    outgoing_edges = 1

    def __init__(self, weight):
        self.weight = weight
        self.docs_length = []

    def dynamic_top_k(self, weight, results,  min=5, max=100):
        #change top_k returned dynamically
        final = []
        # when next result score is less than the  previous score - one standard deviation, stop adding context
        # if len(results) < min:
        #     return results
        scores = [result.score for result in results]
        #if results len is greater than 20, calc std_dev on top 20 only
        if len(results) > 20:
            scores = scores[:20]
            std_dev = np.std(scores)
        else:
          std_dev = np.std(scores)
        # sort results on score
        results = sorted(results, key=lambda x: x.score, reverse=True)
        print(std_dev)
        final = []
        for i in range(len(results)):
            if len(final) >= max:
                break
            if i > 0 and (results[i].score + (std_dev*weight)) < results[i-1].score: #0.35
                break
            final.append(results[i])
        print(f'Dropping {len(results) - len(final)} docs: ')
        self.docs_length.append({"Starting num": len(results), "Final": len(final)})
        return final

    def run(self, documents) -> tuple[dict[str, list[Document]], str]:
        documents = self.dynamic_top_k(self.weight, documents)
        output = {
            "documents": documents,
        }
        return output, "output_1"

    def run_batch(self, documents) -> tuple[dict[str, list[Document]], str]:
        pass

def eval(weight):
    from haystack.pipelines import DocumentSearchPipeline, Pipeline
    from haystack.nodes import TextConverter, BM25Retriever, EmbeddingRetriever, JoinDocuments
    from haystack.document_stores import InMemoryDocumentStore

    text_converter = TextConverter()
    document_store = InMemoryDocumentStore(use_bm25=True)
    retriever = BM25Retriever(document_store=document_store, top_k=1000)
    dynamic_retrieve = DynamicRetriever(weight=weight)

    index_pipeline = Pipeline()
    index_pipeline.add_node(text_converter, name="TextConverter", inputs=["File"])
    # index_pipeline.add_node(dense_retriever, name="DenseRetriever", inputs=["TextConverter"])
    index_pipeline.add_node(document_store, name="DocumentStore", inputs=["TextConverter"])

    query_pipeline = Pipeline()
    # query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
    query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
    query_pipeline.add_node(component=dynamic_retrieve, name="ReRanker", inputs=["SparseRetriever"])

    ndcg_, _map_, recall_, precision_ = Pipeline.eval_beir(
        index_pipeline=index_pipeline, query_pipeline=query_pipeline, dataset="scifact"
    )
    recall_items = [v for k, v in recall_.items()]
    precision_items = [v for k, v in precision_.items()]
    #Put all results in a neat dataframe ie: top_k 1, 3, 5, 10, 100, 1000 will be an index, and the metrics will be columns, include weight as a column
    df_dynamic = pd.DataFrame([recall_items, precision_items], columns=[1, 3, 5, 10, 100, 1000], index=['recall', 'precision'])
    df_dynamic['weight'] = weight
    # print(rerank.docs_length)
    # make df from rerank.docs_length dict
    df_dynamic_docs = pd.DataFrame(dynamic_retrieve.docs_length)
    print(df_dynamic_docs)
    return df_dynamic.T, df_dynamic_docs
# iteratively run this eval passing in new weight values between 0.2 and 1.2 in .1 increments
results = []
results_1 = []
docs_dropped = []
for i in range(0, 10, 1):
    r, d = eval(float(f'0.{i}'))
    results.append(r)
    docs_dropped.append(d)
    # results_1.append(eval(float(f'1.{i}')))
    print(f'Completed {i}')

In [38]:
results[2]

Unnamed: 0,recall,precision
1,0.48083,0.49667
3,0.54278,0.18889
5,0.54806,0.11533
10,0.56083,0.06
100,0.56083,0.006
1000,0.56083,0.0006
weight,0.2,0.2


In [None]:
for i in range(0, 10, 1):
    r, d = eval(float(f'1.{i}'))
    results_1.append(r)
    docs_dropped.append(d)
    print(f'Completed {i}')

In [59]:
best_weight = None
best_score = 0
total = results + results_1
for i in results:
    score = i.drop(index=['weight', 1000]).precision.sum()
    if score > best_score:
        best_score = score
        best_weight = i
print(best_weight.drop(index=[1, 3]))

         recall  precision
5       0.67667    0.14467
10      0.71622    0.07833
100     0.76767    0.00860
1000    0.76767    0.00086
weight  0.90000    0.90000


In [28]:
def eval_normal():
    from haystack.pipelines import DocumentSearchPipeline, Pipeline
    from haystack.nodes import TextConverter, BM25Retriever, EmbeddingRetriever, JoinDocuments
    from haystack.document_stores import InMemoryDocumentStore

    text_converter = TextConverter()
    document_store = InMemoryDocumentStore(use_bm25=True)
    retriever = BM25Retriever(document_store=document_store, top_k=1000)
    # rerank = DynamicRetrieverFinal(weight=weight)

    index_pipeline = Pipeline()
    index_pipeline.add_node(text_converter, name="TextConverter", inputs=["File"])
    # index_pipeline.add_node(dense_retriever, name="DenseRetriever", inputs=["TextConverter"])
    index_pipeline.add_node(document_store, name="DocumentStore", inputs=["TextConverter"])

    query_pipeline = Pipeline()
    # query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
    query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
    # query_pipeline.add_node(component=rerank, name="ReRanker", inputs=["SparseRetriever"])
    ndcg_, _map_, recall_, precision_ = Pipeline.eval_beir(
          index_pipeline=index_pipeline, query_pipeline=query_pipeline, dataset="scifact"
      )
    recall_items = [v for k, v in recall_.items()]
    precision_items = [v for k, v in precision_.items()]
    #Put all results in a neat dataframe ie: top_k 1, 3, 5, 10, 100, 1000 will be an index, and the metrics will be columns, include weight as a column
    df_dynamic = pd.DataFrame([recall_items, precision_items], columns=[1, 3, 5, 10, 100, 1000], index=['recall', 'precision'])

    return df_dynamic.T
results_normal = eval_normal()
results_normal

  0%|          | 0/5183 [00:00<?, ?it/s]

Converting files: 100%|██████████| 5183/5183 [00:03<00:00, 1630.57it/s]
Updating BM25 representation...: 100%|██████████| 5183/5183 [00:00<00:00, 12482.80 docs/s]
100%|██████████| 300/300 [00:09<00:00, 30.04it/s]


Unnamed: 0,recall,precision
1,0.48083,0.49667
3,0.66167,0.23444
5,0.70778,0.152
10,0.76233,0.08367
100,0.86389,0.00973
1000,0.951,0.00108


In [61]:
from pprint import pprint
pprint(results_normal)
pprint('#' * 50)
pprint(best_weight.drop(index=['weight']))

       recall  precision
1     0.48083    0.49667
3     0.66167    0.23444
5     0.70778    0.15200
10    0.76233    0.08367
100   0.86389    0.00973
1000  0.95100    0.00108
'##################################################'
       recall  precision
1     0.48083    0.49667
3     0.63722    0.22444
5     0.67667    0.14467
10    0.71622    0.07833
100   0.76767    0.00860
1000  0.76767    0.00086


Obvious efficiency improvement but not sure how to properly quantify this!

In [55]:
docs_dropped[9]['Final'].sum()

13984

In [58]:
docs_dropped[9]['Starting num'].sum()

300000

CONCLUSION: The problem with my method is that I am fairly certain the beir eval is using its respsected top_k number of docs to calculate recall/precision...not the actual number of documents returned as evidenced by the k=100 and k=1000 staying the same.