<a href="https://colab.research.google.com/github/jlonge4/gen_ai_utils/blob/main/dynamic_top_k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install farm-haystack[beir] farm-haystack[file-conversion] farm-haystack[pdf] beir fmeval

In [None]:
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
import numpy as np
import pandas as pd


class DynamicRetriever(BaseComponent):
    outgoing_edges = 1

    def __init__(self, weight):
        self.weight = weight
        self.docs_length = []

    def dynamic_top_k(self, weight, results,  min=5, max=100):
        #change top_k returned dynamically
        final = []
        # when next result score is less than the  previous score - one standard deviation, stop adding context
        # if len(results) < min:
        #     return results
        scores = [result.score for result in results]
        #if results len is greater than 20, calc std_dev on top 20 only
        if len(results) > 20:
            scores = scores[:20]
            std_dev = np.std(scores)
        else:
          std_dev = np.std(scores)
        # sort results on score
        results = sorted(results, key=lambda x: x.score, reverse=True)
        print(std_dev)
        final = []
        for i in range(len(results)):
            if len(final) >= max:
                break
            if i > 0 and (results[i].score + (std_dev*weight)) < results[i-1].score: #0.35
                break
            final.append(results[i])
        print(f'Dropping {len(results) - len(final)} docs: ')
        self.docs_length.append({"Starting num": len(results), "Final": len(final)})
        return final

    def run(self, documents) -> tuple[dict[str, list[Document]], str]:
        documents = self.dynamic_top_k(self.weight, documents)
        output = {
            "documents": documents,
        }
        return output, "output_1"

    def run_batch(self, documents) -> tuple[dict[str, list[Document]], str]:
        pass

def eval(weight):
    from haystack.pipelines import DocumentSearchPipeline, Pipeline
    from haystack.nodes import TextConverter, BM25Retriever, EmbeddingRetriever, JoinDocuments
    from haystack.document_stores import InMemoryDocumentStore

    text_converter = TextConverter()
    document_store = InMemoryDocumentStore(use_bm25=True)
    retriever = BM25Retriever(document_store=document_store, top_k=1000)
    dynamic_retrieve = DynamicRetriever(weight=weight)

    index_pipeline = Pipeline()
    index_pipeline.add_node(text_converter, name="TextConverter", inputs=["File"])
    # index_pipeline.add_node(dense_retriever, name="DenseRetriever", inputs=["TextConverter"])
    index_pipeline.add_node(document_store, name="DocumentStore", inputs=["TextConverter"])

    query_pipeline = Pipeline()
    # query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
    query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
    query_pipeline.add_node(component=dynamic_retrieve, name="ReRanker", inputs=["SparseRetriever"])

    ndcg_, _map_, recall_, precision_ = Pipeline.eval_beir(
        index_pipeline=index_pipeline, query_pipeline=query_pipeline, dataset="scifact"
    )
    recall_items = [v for k, v in recall_.items()]
    precision_items = [v for k, v in precision_.items()]
    #Put all results in a neat dataframe ie: top_k 1, 3, 5, 10, 100, 1000 will be an index, and the metrics will be columns, include weight as a column
    df_dynamic = pd.DataFrame([recall_items, precision_items], columns=[1, 3, 5, 10, 100, 1000], index=['recall', 'precision'])
    df_dynamic['weight'] = weight
    # print(rerank.docs_length)
    # make df from rerank.docs_length dict
    df_dynamic_docs = pd.DataFrame(dynamic_retrieve.docs_length)
    print(df_dynamic_docs)
    return df_dynamic.T, df_dynamic_docs
# iteratively run this eval passing in new weight values between 0.2 and 1.2 in .1 increments
results = []
results_1 = []
docs_dropped = []
for i in range(0, 10, 1):
    r, d = eval(float(f'0.{i}'))
    results.append(r)
    docs_dropped.append(d)
    # results_1.append(eval(float(f'1.{i}')))
    print(f'Completed {i}')

In [38]:
results[2]

Unnamed: 0,recall,precision
1,0.48083,0.49667
3,0.54278,0.18889
5,0.54806,0.11533
10,0.56083,0.06
100,0.56083,0.006
1000,0.56083,0.0006
weight,0.2,0.2


In [None]:
for i in range(0, 10, 1):
    r, d = eval(float(f'1.{i}'))
    results_1.append(r)
    docs_dropped.append(d)
    print(f'Completed {i}')

In [59]:
best_weight = None
best_score = 0
total = results + results_1
for i in results:
    score = i.drop(index=['weight', 1000]).precision.sum()
    if score > best_score:
        best_score = score
        best_weight = i
print(best_weight.drop(index=[1, 3]))

         recall  precision
5       0.67667    0.14467
10      0.71622    0.07833
100     0.76767    0.00860
1000    0.76767    0.00086
weight  0.90000    0.90000


In [28]:
def eval_normal():
    from haystack.pipelines import DocumentSearchPipeline, Pipeline
    from haystack.nodes import TextConverter, BM25Retriever, EmbeddingRetriever, JoinDocuments
    from haystack.document_stores import InMemoryDocumentStore

    text_converter = TextConverter()
    document_store = InMemoryDocumentStore(use_bm25=True)
    retriever = BM25Retriever(document_store=document_store, top_k=1000)
    # rerank = DynamicRetrieverFinal(weight=weight)

    index_pipeline = Pipeline()
    index_pipeline.add_node(text_converter, name="TextConverter", inputs=["File"])
    # index_pipeline.add_node(dense_retriever, name="DenseRetriever", inputs=["TextConverter"])
    index_pipeline.add_node(document_store, name="DocumentStore", inputs=["TextConverter"])

    query_pipeline = Pipeline()
    # query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
    query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
    # query_pipeline.add_node(component=rerank, name="ReRanker", inputs=["SparseRetriever"])
    ndcg_, _map_, recall_, precision_ = Pipeline.eval_beir(
          index_pipeline=index_pipeline, query_pipeline=query_pipeline, dataset="scifact"
      )
    recall_items = [v for k, v in recall_.items()]
    precision_items = [v for k, v in precision_.items()]
    #Put all results in a neat dataframe ie: top_k 1, 3, 5, 10, 100, 1000 will be an index, and the metrics will be columns, include weight as a column
    df_dynamic = pd.DataFrame([recall_items, precision_items], columns=[1, 3, 5, 10, 100, 1000], index=['recall', 'precision'])

    return df_dynamic.T
results_normal = eval_normal()
results_normal

  0%|          | 0/5183 [00:00<?, ?it/s]

Converting files: 100%|██████████| 5183/5183 [00:03<00:00, 1630.57it/s]
Updating BM25 representation...: 100%|██████████| 5183/5183 [00:00<00:00, 12482.80 docs/s]
100%|██████████| 300/300 [00:09<00:00, 30.04it/s]


Unnamed: 0,recall,precision
1,0.48083,0.49667
3,0.66167,0.23444
5,0.70778,0.152
10,0.76233,0.08367
100,0.86389,0.00973
1000,0.951,0.00108


In [61]:
from pprint import pprint
pprint(results_normal)
pprint('#' * 50)
pprint(best_weight.drop(index=['weight']))

       recall  precision
1     0.48083    0.49667
3     0.66167    0.23444
5     0.70778    0.15200
10    0.76233    0.08367
100   0.86389    0.00973
1000  0.95100    0.00108
'##################################################'
       recall  precision
1     0.48083    0.49667
3     0.63722    0.22444
5     0.67667    0.14467
10    0.71622    0.07833
100   0.76767    0.00860
1000  0.76767    0.00086


Obvious efficiency improvement but not sure how to properly quantify this!

In [55]:
docs_dropped[9]['Final'].sum()

13984

In [58]:
docs_dropped[9]['Starting num'].sum()

300000

CONCLUSION: The problem with my method is that I am fairly certain the beir eval is using its respsected top_k number of docs to calculate recall/precision...not the actual number of documents returned as evidenced by the k=100 and k=1000 staying the same.

In [226]:
from haystack.pipelines import DocumentSearchPipeline, Pipeline
from haystack.nodes import TextConverter, BM25Retriever, EmbeddingRetriever, JoinDocuments, PreProcessor, PDFToTextConverter
from haystack.document_stores import InMemoryDocumentStore

text_converter = PDFToTextConverter()
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=350,
    split_overlap=20,
    split_respect_sentence_boundary=True,
)
document_store = InMemoryDocumentStore()
retriever = EmbeddingRetriever(
    document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1", top_k = 30
)
# rerank = DynamicRetrieverFinal(weight=weight)

index_pipeline = Pipeline()
index_pipeline.add_node(text_converter, name="TextConverter", inputs=["File"])
index_pipeline.add_node(preprocessor, name="Preprocessor", inputs=["TextConverter"])
index_pipeline.add_node(retriever, name="DenseRetriever", inputs=["Preprocessor"])
index_pipeline.add_node(document_store, name="DocumentStore", inputs=["DenseRetriever"])

query_pipeline = Pipeline()
# query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
query_pipeline.add_node(component=retriever, name="DenseRetriever", inputs=["Query"])

In [227]:
docs = index_pipeline.run(file_paths=['/content/2007341.pdf'])
len(docs['documents'])

282

In [228]:
document_store.get_document_count() == document_store.get_embedding_count()

True

Source document for these excerpts can be found at: https://nces.ed.gov/pubs2007/2007341.pdf

In [272]:
all_physics = ["""Physics courses involve the study of the forces and laws of nature affecting matter, such as
equilibrium, motion, momentum, and the relationships between matter and energy. The study of physics
includes examination of sound, light, and magnetic and electric phenomena.
03152 Physics—Advanced Studies
Usually taken after a comprehensive initial study of physics, Physics—Advanced Studies
courses provide instruction in laws of conservation, thermodynamics, and kinetics; wave and particle
phenomena; electromagnetic fields; and fluid dynamics.""","""Principles of Technology courses focus on the study of the forces and laws of nature and their
application to modern technology. Equilibrium, motion, momentum, energy conversion,
electromagnetism, and optical phenomena are presented in the context of current, real-world
applications. Demonstrations, math labs, and applied laboratory experiments are an integral part of the
Principles of Technology curriculum. These courses enable students to gain a solid foundation for
careers in electronics, robotics, telecommunications, and other technological fields.""",
"""
Designed by the College Board to parallel college-level physics courses that serve as a partial
foundation for science or engineering majors, AP Physics C courses primarily focus on 1) mechanics
and 2) electricity and magnetism, with approximately equal emphasis on these two areas. AP Physics C
courses are more intensive and analytical than AP Physics B courses and require the use of calculus to
solve the problems posed.
""","""
IB Physics courses prepare students to take the International Baccalaureate Physics exams at
either the Subsidiary or Higher level. In keeping with the general aim of IB Experimental Sciences
courses, IB Physics promotes understanding of the facts, patterns, and principles underlying the field of
physics; critical analysis, prediction, and application of scientific information and hypotheses; improved
ability to communicate scientific ideas; and an awareness of the impact of scientific advances in physics
upon both society and issues of ethical, philosophical, and political importance. Course content varies,
but includes the study of the fundamental laws of nature and the interaction between concepts of matter,
fields, waves, and energy. Laboratory experimentation is essential; calculus may be used in some
courses.
""","""
Physical Science courses involve study of the structures and states of matter. Typically (but not
always) offered as introductory survey courses, they may include such topics as forms of energy, wave
phenomenon, electromagnetism, and physical and chemical interactions.
""","""
IB Physical Science courses prepare students to take the International Baccalaureate Physical
Science exams at either the Subsidiary or Higher level. These courses integrate the study of physics and
chemistry, showing how the physical and chemical properties of materials can be explained and
predicted in terms of atomic, molecular, and crystal structures and forces. In keeping with the general
aim of IB Experimental Sciences courses, IB Physical Science courses promote critical analysis,
prediction, and application of scientific information and hypotheses; improved ability to communicate
scientific ideas; and an awareness of the impact of science and scientific advances upon both society and
issues of ethical, philosophical, and political importance. Students are required to develop and pursue an
individual, experimental project, which is evaluated as part of the IB exam.
""","""
Conceptual Physics courses introduce students to the use of chemicals, characteristic properties
of materials, and simple mechanics to better describe the world and nonliving matter. The courses
emphasize precise measurements and descriptive analysis of experimental results. Topics covered may
include energy and motion, electricity, magnetism, heat, the structure of matter, and how matter reacts to
materials and forces.
""","""
Particular Topics in Physics courses concentrate on a particular subtopic within the field of
physics (such as optics, thermodynamics, quantum physics, and so on) that is not otherwise described in
this classification system.
""","""
Physics—Independent Study courses, often conducted with instructors as mentors, enable
students to explore scientific topics of interest, using advanced methods of scientific inquiry and
experimentation. These courses may be offered in conjunction with other rigorous science courses or
may provide students with an opportunity to explore a topic of special interest.
""","""
Physics—Workplace Experience courses provide work experience in a field related to physics.
Goals are typically set cooperatively by the student, teacher, and employer (although students are not
necessarily paid). These courses may include classroom activities as well, involving further study of the
field or discussion regarding experiences that students encounter in the workplace.
""","""
Dance Technique courses provide students with experience in one or several dance forms (i.e.,
modern, jazz, ballet, and tap). Initial classes are usually introductory in nature, while the more advanced
classes concentrate on improving students’ technique and may offer or require experience in
choreography and dance evaluation.
""","""
Dance Repertory courses provide the opportunity for students with prior dance experience to
develop dance techniques in small groups; these classes require auditions and emphasize performance.
""","""
Expressive Movement courses help develop students’ ability to move expressively, without an
emphasis on particular dance forms or on developing specific dance techniques.
""","""
Dance Appreciation courses expand students’ knowledge of dance as an art form and help
develop students’ ability to evaluate dance performances. Learning the history of one or several dance
forms may also be included as a course objective.
""","""
Choreography courses teach students how to arrange and direct dancers’ movements. Course
content includes application of the elements and principles of dance, study of historical and
contemporary dance from a worldwide perspective, and instruction in critique. Course objectives
include developing an appreciation of dance as a communicative art form and self-expression. Students
sometimes gain performance experience.
""","""
Dance—Independent Study courses, often conducted with instructors or professional
dancers/choreographers as mentors, enable students to explore a particular dance form. Independent
Study courses may serve as an opportunity for students to expand their expertise in a particular form or
style, to explore a topic in greater detail, or to develop more advanced skills.
""","""
Dance—Workplace Experience courses provide students with work experience in a field related
to dance. Goals are typically set cooperatively by the student, teacher, and employer (although students
are not necessarily paid). These courses may include classroom activities as well, involving further
study of the field or discussion regarding experiences that students encounter in the workplace.
"""]
len(all_physics)

17

In [273]:
query_pipeline = Pipeline()
# query_pipeline.add_node(component=retriever, name="SparseRetriever", inputs=["Query"])
query_pipeline.add_node(component=retriever, name="DenseRetriever", inputs=["Query"])
results = query_pipeline.run(query='What are all the physics and dance classes?',params={"DenseRetriever": {"top_k": 10}})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [274]:
len(results['documents'])

10

In [277]:
corpus_ = ""
for i in results['documents']:
    corpus_ += i.content
# Print true if all classes from all_math are present in corpus
count = 0
# We would "hope" to see 17
for i in all_physics:
    if i in corpus_:
        count+=1
print(count)

10


Still missed one important piece...

In [199]:
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
import numpy as np
import pandas as pd


class DynamicRetriever(BaseComponent):
    outgoing_edges = 1

    def __init__(self, weight):
        self.weight = weight
        self.docs_length = []

    def dynamic_top_k(self, weight, results,  min=5, max=100):
        #change top_k returned dynamically
        final = []
        # when next result score is less than the  previous score - one standard deviation, stop adding context
        # if len(results) < min:
        #     return results
        scores = [result.score for result in results]
        #if results len is greater than 20, calc std_dev on top 20 only
        # if len(results) > 20:
        #     scores = scores[:20]
            # std_dev = np.std(scores)
        # else:
        std_dev = np.std(scores)
        # sort results on score
        results = sorted(results, key=lambda x: x.score, reverse=True)
        print(std_dev)
        final = []
        for i in range(len(results)):
            if len(final) >= max:
                break
            if i > 0 and (results[i].score + (std_dev*weight)) < results[0].score: #0.35
                break
            final.append(results[i])
        print(f'Dropping {len(results) - len(final)} docs: ')
        print(f'Final length {len(final)}')
        self.docs_length.append({"Starting num": len(results), "Final": len(final)})
        return final

    def run(self, documents) -> tuple[dict[str, list[Document]], str]:
        documents = self.dynamic_top_k(self.weight, documents)
        output = {
            "documents": documents,
        }
        print(len(documents))
        return output, "output_1"

    def run_batch(self, documents) -> tuple[dict[str, list[Document]], str]:
        pass

In [299]:
rerank = DynamicRetriever(weight=3)
query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, name="DenseRetriever", inputs=["Query"])
query_pipeline.add_node(component=rerank, name="DynamicTopK", inputs=["DenseRetriever"])

In [300]:
results = query_pipeline.run(query='What are all the physics classes?', params={"DenseRetriever": {"top_k": 50}})
len(results['documents'])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

0.003052613681645873
Dropping 20 docs: 
Final length 30
30


30

In [303]:
corpus = ""
for i in results['documents']:
    corpus += i.content
# Print true if all classes from all_math are present in corpus
count = 0
# Lets try this again (we want all 17)
for i in all_physics:
    if i in corpus:
        count+=1
print(count)

17


In [None]:
!pip install anthropic

In [302]:
from anthropic import Anthropic
tokenizer = Anthropic().get_tokenizer()
print(f'Initial tokens with incomplete context {tokenizer.encode(corpus_)}')
print(f'Tokens with complete context {tokenizer.encode(corpus)}')

Initial tokens with incomplete context Encoding(num_tokens=7386, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Tokens with complete context Encoding(num_tokens=17086, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


CONCLUSION 2: Inserting this in between the initial retrieval and your reranking step may prove more beneficial than without dynamic_top_k!