In [52]:

from typing import (    
    Any    
)


from conf.constants import QDRANT_KEY, QDRANT_URL
from openai import OpenAI
from qdrant_client import QdrantClient

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

from langchain_community.vectorstores.utils import maximal_marginal_relevance
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore


# create an embedding using openai
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for ``text``.

    Newlines in ``text`` are replaced with single spaces before the request
    is sent. A new ``OpenAI()`` client is constructed on every call.

    Args:
        text: String to embed.
        model: Name of the embedding model passed to the embeddings endpoint.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = text.replace("\n", " ")
    client = OpenAI()
    # The endpoint takes a batch; exactly one input is sent, so only the
    # first returned item is relevant.
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

def document_from_scored_point(
    scored_point: Any,
    content_payload_key: str,
    metadata_payload_key: str,
) -> Document:
    """Build a ``Document`` from the payload of a search hit.

    Args:
        scored_point: Object exposing a ``payload`` mapping (must support
            ``.get``).
        content_payload_key: Payload key whose value becomes ``page_content``.
        metadata_payload_key: Payload key whose value becomes ``metadata``.

    Returns:
        A ``Document``. ``metadata`` falls back to an empty dict when the
        payload entry is missing or falsy.
    """
    payload = scored_point.payload
    text = payload.get(content_payload_key)
    meta = payload.get(metadata_payload_key)
    if not meta:
        # Missing / None / empty metadata is normalised to a fresh dict.
        meta = {}
    return Document(page_content=text, metadata=meta)
    
# query the vector store
def query_qdrant(query, collection_name, top_k=20, mmr_k=5, lambda_mult=0.5):
    """Search a Qdrant collection and re-rank the hits with MMR.

    The query is embedded with ``get_embedding``, the ``top_k`` nearest points
    are fetched (with payload and stored vectors), and
    ``maximal_marginal_relevance`` then picks up to ``mmr_k`` of them.

    Args:
        query: Natural-language query text.
        collection_name: Name of the Qdrant collection to search.
        top_k: Number of candidate points fetched from Qdrant.
        mmr_k: Maximum number of candidates kept by the MMR step. Defaults to
            5, the value that was previously hard-coded. When ``mmr_k`` is not
            smaller than the number of hits, MMR can only reorder them.
        lambda_mult: MMR trade-off value forwarded unchanged to
            ``maximal_marginal_relevance``. Defaults to 0.5 (previously
            hard-coded).

    Returns:
        A list of ``(Document, score)`` tuples in MMR selection order, where
        ``score`` is the similarity score reported by Qdrant for that hit.
        The list is empty when the search returns no hits.
    """
    embedding = get_embedding(text=query)

    qdrant_client = QdrantClient(
        QDRANT_URL,
        api_key=QDRANT_KEY,
    )
    try:
        results = qdrant_client.search(
            collection_name=collection_name,
            query_vector=embedding,
            with_payload=True,
            # Stored vectors are required by the MMR re-ranking below.
            with_vectors=True,
            limit=top_k,
        )
    finally:
        # A client is created per call; release its connections when done.
        qdrant_client.close()

    if not results:
        return []

    ## The MMR impl used with retriever(search_type='mmr')
    candidate_vectors = [result.vector for result in results]

    mmr_selected = maximal_marginal_relevance(
        np.array(embedding), candidate_vectors, k=mmr_k, lambda_mult=lambda_mult
    )

    return [
        (
            document_from_scored_point(
                scored_point=results[i],
                content_payload_key="page_content",
                metadata_payload_key="metadata",
            ),
            results[i].score,
        )
        for i in mmr_selected
    ]


In [57]:
COLLECTION = "quarkus_getting_started"
QUERY = "deploying Camel Quarkus application to Openshift"
NUM_RESULTS = 5
query_results = query_qdrant(
    query=QUERY,
    collection_name=COLLECTION,
    top_k=NUM_RESULTS
    )

# Preview the first hit; guarded so an empty result set does not raise IndexError.
if query_results:
    doc = query_results[0][0]
    print(str(doc))

# Collect one record per hit and build the frame once, instead of growing it
# with pd.concat inside the loop (quadratic, and concatenating onto an empty
# frame is the pattern pandas flags with a FutureWarning).
records = []
for doc, score in query_results:
    records.append(
        {
            "score": round(score, 3),
            "page_ref": doc.metadata["page_number"],
            "entities": doc.metadata["entities"],
            "content": doc.page_content,
        }
    )

# Passing `columns` keeps the expected schema even when there are no hits.
df = pd.DataFrame(records, columns=['score', 'page_ref', 'entities', 'content'])

print("done")

page_content='"Red Hat build of Apache Camel\n \n4.0\nGetting Started with Red Hat build of Apache\nCamel for Quarkus\nGetting Started with Red Hat build of Apache Camel for Quarkus\nLast Updated: 2023-12-04"' metadata={'entities': '0. Camel component name: Quarkus\n1. Relevant technical concepts: Not specified\n3. Thirdparty services or tools: Red Hat', 'page_number': '0'}
done


# General structure

In [54]:
# Show at most the first NUM_RESULTS rows of the results table.
display(df.head(NUM_RESULTS))

Unnamed: 0,score,page_ref,entities,content
0,0.866,0,0. Camel component name: Quarkus\n1. Relevant ...,"""Red Hat build of Apache Camel\n \n4.0\nGettin..."
1,0.857,24,0. Camel component name: Not specified\n1. Rel...,"""CHAPTER 2. DEPLOYING QUARKUS APPLICATIONS\nYo..."
2,0.846,23,0. Camel component name: Not specified\n1. Rel...,"""Camel Quarkus executes the production of bean..."
3,0.845,2,0. What's the Camel component name: Red Hat bu...,"""Red Hat build of Apache Camel\n \n4.0\n \nGet..."
4,0.847,7,1. What's the Camel component name: Not specif...,"""PREFACE\nMAKING OPEN SOURCE MORE INCLUSIVE\nR..."


# Content sizes

In [27]:
# Report the size of every retrieved chunk.
# len() on a str counts characters, not bytes; encode to UTF-8 so the printed
# figure matches its "bytes" label (identical for pure-ASCII content).
contents = df['content'].tolist()
for i, item in enumerate(contents):
    print("Item", str(i),": ", len(item.encode("utf-8")), "bytes \n")

Item 0 :  2400 bytes 

Item 1 :  2347 bytes 

Item 2 :  2477 bytes 

Item 3 :  2166 bytes 

Item 4 :  2408 bytes 



# Entities

In [18]:
# Print the full entity annotation of each retrieved chunk, one labelled
# block per row, in row order.
entities = df['entities'].tolist()
for position in range(len(entities)):
    print("Item", str(position), ": \n", entities[position], "\n\n")



Item 0 : 
 0. Camel component name: Quarkus
1. Relevant technical concepts: Not specified
3. Thirdparty services or tools: Red Hat 


Item 1 : 
 0. Camel component name: Not specified
1. Relevant technical concepts: OpenShift, Docker build strategy, S2I Binary, Source S2I, Quarkus applications, OpenShift build strategies, Quarkus native executables, deployment strategy
3. Thirdparty services or tools: Not specified 


Item 2 : 
 0. What's the Camel component name: Red Hat build of Apache Camel for Quarkus
1. What are relevant technical concepts mentioned: Quarkus
3. What thirdparty services or tools are mentioned: Not specified 


Item 3 : 
 1. What's the Camel component name: Not specified
2. What are relevant technical concepts mentioned: Quarkus
3. What thirdparty services or tools are mentioned: Red Hat 


Item 4 : 
 0. Camel component name: Not specified
1. Relevant technical concepts mentioned: Quarkus, Camel QuarkusTestSupport, JUnit Jupiter, adviceWith, @Produces, Configuring r