In [1]:

from typing import (    
    Any    
)


from conf.constants import QDRANT_KEY, QDRANT_URL
from openai import OpenAI
from qdrant_client import QdrantClient

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

from langchain_community.vectorstores.utils import maximal_marginal_relevance
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore


# create an embedding using openai
def get_embedding(text, model="text-embedding-ada-002"):
    """Embed a single piece of text with the OpenAI embeddings endpoint.

    Args:
        text: The string to embed. Newline characters are replaced with
            spaces before the request is sent.
        model: Name of the OpenAI embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # Splitting on "\n" and re-joining with " " swaps every newline for a space.
    flattened = " ".join(text.split("\n"))
    client = OpenAI()  # a fresh client is constructed on every call
    response = client.embeddings.create(input=[flattened], model=model)
    first_item = response.data[0]
    return first_item.embedding

def document_from_scored_point(
    scored_point: Any,
    content_payload_key: str,
    metadata_payload_key: str,
) -> Document:
    """Build a ``Document`` from the payload of a search hit.

    Args:
        scored_point: Object exposing a ``payload`` mapping (read with ``.get``).
        content_payload_key: Payload key whose value becomes ``page_content``.
        metadata_payload_key: Payload key whose value becomes ``metadata``.

    Returns:
        A ``Document`` whose ``page_content`` is the payload value stored under
        ``content_payload_key`` and whose ``metadata`` is the payload value
        stored under ``metadata_payload_key``, or an empty dict when that value
        is missing or falsy.
    """
    payload = scored_point.payload
    text = payload.get(content_payload_key)
    meta = payload.get(metadata_payload_key)
    if not meta:
        # Missing / None / empty metadata is normalised to a fresh dict.
        meta = {}
    return Document(page_content=text, metadata=meta)
    
# query the vector store
def query_qdrant(query, collection_name, fetch_k=20, top_k=5, lambda_mult=0.75):
    """Search a Qdrant collection and re-rank the hits with MMR.

    The query is embedded with ``get_embedding``, the ``fetch_k`` best-scoring
    points are fetched together with their payloads and vectors, and
    ``maximal_marginal_relevance`` then picks up to ``top_k`` of them.

    Args:
        query: Text to search for.
        collection_name: Name of the Qdrant collection to search.
        fetch_k: Number of candidates requested from Qdrant (``limit``).
        top_k: Number of candidates MMR is asked to select (``k``).
        lambda_mult: Forwarded unchanged to ``maximal_marginal_relevance``.
            Defaults to 0.75, the value that used to be hard-coded here.
            See the LangChain MMR documentation for its exact meaning.

    Returns:
        A list of ``(Document, score)`` tuples in the order returned by MMR,
        where ``score`` is the ``score`` attribute of the matching Qdrant hit.
    """
    embedding = get_embedding(text=query)

    client = QdrantClient(
        QDRANT_URL,
        api_key=QDRANT_KEY,
    )
    try:
        results = client.search(
            collection_name=collection_name,
            query_vector=embedding,
            with_payload=True,
            with_vectors=True,  # the stored vectors are needed for MMR below
            limit=fetch_k,
        )
    finally:
        # A client is created per call, so release its connection even when
        # the search raises.
        client.close()

    ## The MMR impl used with retriever(search_type='mmr')
    candidate_vectors = [result.vector for result in results]

    mmr_selected = maximal_marginal_relevance(
        np.array(embedding), candidate_vectors, k=top_k, lambda_mult=lambda_mult
    )

    return [
        (
            document_from_scored_point(
                scored_point=results[i],
                content_payload_key="page_content",
                metadata_payload_key="metadata",
            ),
            results[i].score,
        )
        for i in mmr_selected
    ]


In [2]:
COLLECTION = "spring_reference_2"
QUERY = "streaming upload mode"
NUM_RESULTS = 5
query_results = query_qdrant(
    query=QUERY,
    collection_name=COLLECTION,
    top_k=NUM_RESULTS,
)

# Collect one record per (document, score) pair and build the frame in a single
# call. Growing a DataFrame with pd.concat inside a loop copies the whole frame
# on every iteration and concatenates onto an empty frame, which pandas
# deprecates with a FutureWarning.
records = [
    {
        "score": round(score, 3),
        "page_ref": doc.metadata["page_number"],
        "entities": doc.metadata["entities"],
        "content": doc.page_content,
        "content_size": len(doc.page_content),
    }
    for doc, score in query_results
]

# Passing `columns` keeps the column order fixed and still yields an empty,
# correctly-shaped frame when the query returns no results.
df = pd.DataFrame(
    records,
    columns=['score', 'page_ref', 'entities', 'content', 'content_size'],
)

print("done")

done


# General structure

In [3]:
# Render the first NUM_RESULTS rows of the results table.
display(df.head(NUM_RESULTS))

Unnamed: 0,score,page_ref,entities,content,content_size
0,0.788,/page_206.txt,The top 20 entities mentioned in the given con...,"""The default size for a batch is 1 Mb, but you...",2038
1,0.757,/page_497.txt_0,The top 20 entities mentioned in the given con...,"""Raw Mode:\n Attachments are not supported as ...",2483
2,0.78,/page_196.txt,Here are the top 20 entities mentioned in the ...,"""restartingPolicy\n(producer)\nThe restarting ...",1604
3,0.761,/page_660.txt,"Based on the context provided, the top 20 enti...","""Name\nDescription\nDefaul\nt\nType\nbinary\n ...",1717
4,0.752,/page_711.txt,The top 20 entities mentioned in the given con...,"""producerStrategy\n(producer)\nThe mode used t...",1557


# Entities

In [4]:
# Print the complete entities text of every result row, numbered from 0,
# because the table view truncates this column.
entities = df['entities'].tolist()
for idx, entity_text in enumerate(entities):
    print("Item", str(idx), ": \n", entity_text, "\n\n")

Item 0 : 
 The top 20 entities mentioned in the given context are:
1. S3 bucket
2. producer route
3. upload
4. progressive naming strategy
5. keyname
6. batch
7. restartingPolicy
8. files
9. messages
10. streaming upload
11. endpoint
12. exchange
13. NPE (NullPointerException)
14. AWS2S3EndpointBuilder
15. batchMessageNumber
16. lastPart
17. 1 Mb
18. 20 messages
19. 70 messages
20. 25 messages

Summary:
The context discusses the configuration and usage of the AWS S3 storage service with a focus on streaming upload, batch sizes, and naming strategies. It provides a detailed example of starting, stopping, restarting, and overriding the uploading process while creating and managing files within the S3 bucket. Additionally, it emphasizes the importance of specific configurations, such as keyName and restartingPolicy, in the context of the streaming upload mode. 


Item 1 : 
 The top 20 entities mentioned in the given context are:

1. CXF_RAW Mode
2. MTOM
3. Camel Message APIs
4. SOAPMessag

In [9]:
# Print the full content text of the first result row.
print(df["content"].iloc[0])

"The default size for a batch is 1 Mb, but you can adjust it according to your requirements.
When you’ll stop your producer route, the producer will take care of flushing the remaining buffered
messaged and complete the upload.
In Streaming upload you’ll be able restart the producer from the point where it left. It’s important to note
that this feature is critical only when using the progressive naming strategy.
By setting the restartingPolicy to lastPart, you will restart uploading files and contents from the last part
number the producer left.
Example
1
. 
Start the route with progressive naming strategy and keyname equals to camel.txt, with
batchMessageNumber equals to 20, and restartingPolicy equals to lastPart - Send 70
messages.
2
. 
Stop the route
3
. 
On your S3 bucket you should now see 4 files: * camel.txt
camel-1.txt
camel-2.txt
camel-3.txt
The first three will have 20 messages, while the last one only 10.
4
. 
Restart the route.
5
. 
Send 25 messages.
6
. 
Stop the route.
7