In [9]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from datetime import datetime
from openai import OpenAI
from semantic_router.encoders import OpenAIEncoder
from semantic_router.splitters import RollingWindowSplitter
from semantic_router.utils.logger import logger
import os
import uuid
import cohere
from openai import AzureOpenAI
import anthropic
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm

AZURE_OPENAI_API_KEY = ""
AZURE_OPENAI_ENDPOINT = ""
AZURE_OPENAI_VERSION = ""


azure_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_VERSION,
    azure_deployment="",
)

azure_client_embedding = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_VERSION,
    azure_deployment="",
)

os.environ["OPENAI_API_KEY"] = ""
os.environ["ANTHROPIC_API_KEY"] = ""

co = cohere.Client("")
openai_client = OpenAI()
qdrant_client = QdrantClient(
    url="",
    api_key=""
)
anthropic_client = anthropic.Anthropic()

embedding_model = "text-embedding-3-small"

encoder = OpenAIEncoder(name=embedding_model)

In [10]:
# Define your collection name
collection_name = "eric_op_rag"

try:
    qdrant_client.delete_collection(collection_name=collection_name)
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
    )
except Exception as e:
    print(e)

### data scraping

### web content

In [11]:
from langchain_community.document_loaders import WebBaseLoader
import os
os.environ['USER_AGENT'] = 'myagent'

web_url = "https://google.github.io/styleguide/pyguide.html"

loader = WebBaseLoader(web_url)
web_doc = loader.load()[0].page_content

### data chunking

In [12]:
splitter = RollingWindowSplitter(
    encoder=encoder,
    min_split_tokens=100,
    max_split_tokens=500,
    window_size=5,  # sentences
    dynamic_threshold=True,
)


DOCUMENT_CONTEXT_PROMPT = """
<document>
{doc_content}
</document>
"""

CHUNK_CONTEXT_PROMPT = """
Here is the chunk we want to situate within the whole document
<chunk>
{chunk_content}
</chunk>

Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.
Answer only with the succinct context and nothing else.
"""

def situate_context(doc: str, chunk: str) -> str:
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        
        max_tokens=1024,
        
        temperature=0.0,
        
        messages=[
            {"role": "user", "content": DOCUMENT_CONTEXT_PROMPT.format(doc_content=doc)}, 
            {"role": "user", "content": CHUNK_CONTEXT_PROMPT.format(chunk_content=chunk)} 
        ],
    ).choices[0].message.content
    
    return response


def process_split(i, split, url, data_content, encoder, doc_id):
    
    
    # Situate context
    contextualized_text = situate_context(data_content, split.content)
    
    print(contextualized_text)
    
    text_to_embed = f"{split.content}\n\n{contextualized_text}"
    
    # Encode the content
    vector = encoder([text_to_embed])[0]
    
    # Create metadata
    meta_data = {
        "doc_id": doc_id,
        "url": url,
        "chunk_index": i,
        "original_content": str(split.content),
        "contextualized_content": str(contextualized_text),
    }
    
    # Return the vector and metadata
    return vector, meta_data


def preprocess_doc_for_qdrant(url, data_content, doc_id):
    
    splits = splitter([data_content])

    print("total chunks:", len(splits))
    print("\n")

    vectors = []
    payload = []

    # Set up the ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        # Use tqdm to show progress bar for task submission
        futures = [
            executor.submit(process_split, i, split, url, data_content, encoder, doc_id)
            for i, split in tqdm(enumerate(splits, start=1), total=len(splits), desc="Processing splits")
        ]
        
        # Use tqdm to show progress bar for task completion
        for future in tqdm(as_completed(futures), total=len(futures), desc="Collecting results"):
            vector, meta_data = future.result()
            vectors.append(vector)
            payload.append(meta_data)

    print(len(vectors))
    print(len(payload))
    
    return vectors, payload

https://github.com/aurelio-labs/semantic-chunkers
  splitter = RollingWindowSplitter(


In [13]:
doc_id = str(uuid.uuid4())
vectors, payload = preprocess_doc_for_qdrant(web_url, web_doc, doc_id)
print(len(vectors))
print(len(payload))

[32m2024-10-05 15:15:08 INFO semantic_router.utils.logger Single document exceeds the maximum token limit of 500. Splitting to sentences before semantically splitting.[0m


total chunks: 126




Processing splits:   0%|          | 0/126 [00:00<?, ?it/s]

Collecting results:   0%|          | 0/126 [00:00<?, ?it/s]

This chunk is part of Section 2.4, which discusses the use of exceptions in Python programming, outlining the pros and cons of using exceptions, as well as specific guidelines and decisions regarding their implementation and usage.
This chunk is part of the "Imports" section in the Google Python Style Guide, which outlines best practices for importing modules and packages in Python code, emphasizing the use of complete module names for clarity and avoiding ambiguity in imports.
This chunk is part of the "True/False Evaluations" section in the Google Python Style Guide, which discusses best practices for using implicit boolean evaluations in Python code, including examples of preferred and discouraged coding patterns.
This chunk is part of the Google Python Style Guide, specifically covering sections on generics, build dependencies, and concluding remarks, followed by the introduction that outlines the purpose and usage of Python at Google, including guidelines for code formatting and l

In [14]:
qdrant_client.upload_collection(
        collection_name=f"{collection_name}",
        payload=payload,
        vectors=vectors,
        max_retries=3,
        parallel=4
    )

### data retrieving

In [63]:
query = "What is pylint?"

similarity_top_k = 100
reranker_top_n = 20

In [64]:
text = query.replace("\n", " ")


embedding = openai_client.embeddings.create(input = [text], model="text-embedding-3-small").data[0].embedding
print(embedding)


retrieval_data = qdrant_client.search(
    collection_name=collection_name,
    query_vector=embedding,
    limit=similarity_top_k,
)

print(retrieval_data)
print(len(retrieval_data))

for key, value in retrieval_data[0].payload.items():
    print(key)

[-0.007412021514028311, -0.01990324817597866, -0.03642036020755768, -0.02863999828696251, 0.007218158338218927, 0.003909566439688206, -0.02874339185655117, 0.01863667741417885, -0.025318482890725136, 0.0021825721487402916, -0.005518627818673849, 0.0015218229964375496, -0.005693104583770037, -0.02560281567275524, 0.00042205548379570246, 0.044924475252628326, 0.017034078016877174, -0.020885486155748367, -0.0875742956995964, 0.03112144209444523, 0.018830541521310806, 0.021040577441453934, -0.03218122571706772, 0.029984114691615105, 0.028355667367577553, -0.0269598551094532, 0.013156822882592678, -0.016051840037107468, -0.005693104583770037, 0.00034067349042743444, 0.021544620394706726, -0.02560281567275524, -0.029958266764879227, 0.02318599261343479, -0.0016131001757457852, -0.049060214310884476, 0.015883825719356537, 0.010701227001845837, 0.03851407766342163, -0.000177808542503044, -0.032672345638275146, -0.03321516141295433, 0.016336172819137573, -0.018171407282352448, -0.02098887972533

In [65]:
#post processing

docs = []
chunk_indexes = []
urls = []
original_contents = []
doc_ids = []
for doc in retrieval_data:
    print(doc)
    
    original_content = doc.payload["original_content"]
    contextualized_content = doc.payload["contextualized_content"]
    url = doc.payload["url"]
    chunk_index = doc.payload["chunk_index"]
    
    docs.append(f"{original_content}\n\n{contextualized_content}")
    chunk_indexes.append(chunk_index)
    urls.append(url)
    original_contents.append(original_content)
    doc_ids.append(doc.payload["doc_id"])


id='beb7f88d-ed28-4b71-bd74-56c221cda9ff' version=1 score=0.40880078 payload={'chunk_index': 2, 'contextualized_content': 'This chunk is part of the Google Python Style Guide, specifically covering sections on generics, build dependencies, and concluding remarks, followed by the introduction that outlines the purpose and usage of Python at Google, including guidelines for code formatting and linting practices.', 'doc_id': '50da1b5d-9540-4d0f-ab52-fdecf075fc80', 'original_content': '3.19.15 Generics 3.19.16 Build Dependencies 4 Parting Words 1 Background Python is the main dynamic language used at Google. This style guide is a list of dos and don’ts for Python programs. To help you format code correctly, we’ve created a settings file for Vim. For Emacs, the default settings should be fine. Many teams use the Black or Pyink auto-formatter to avoid arguing over formatting. 2 Python Language Rules 2.1 Lint Run pylint over your code using this pylintrc. 2.1.1 Definition pylint is a tool for

### cohere reranking

In [66]:
cohere_threshold = 0.3

results = co.rerank(
    model="rerank-english-v3.0",
    query=query,
    documents=docs,
    top_n=reranker_top_n,
    # rank_fields=[]
)

print(results)

rerank_docs = []
rerank_chunk_indexes = []
rerank_urls = []
rerank_original_contents = []
rerank_doc_ids = []
for hit in results.results:
    
    if hit.relevance_score < cohere_threshold:
        break
    
    rerank_docs.append(docs[hit.index])
    rerank_chunk_indexes.append(chunk_indexes[hit.index])
    rerank_urls.append(urls[hit.index])
    rerank_original_contents.append(original_contents[hit.index])
    rerank_doc_ids.append(doc_ids[hit.index])
    
print(len(rerank_docs))

5


### order preserving

In [67]:
from collections import defaultdict

# Step 1: Initialize dictionaries to hold merged results by file UUID
merged_docs = defaultdict(list)  # Will store docs for each file UUID
merged_chunk_indexes = defaultdict(list)
merged_file_names = {}
merged_file_paths = {}

# Step 2: Iterate through the rerank results and group docs by file_uuid
for i in range(len(rerank_doc_ids)):
    file_uuid = rerank_doc_ids[i]

    # Append the doc and chunk indexes to the respective file UUID entry
    merged_docs[file_uuid].append((rerank_chunk_indexes[i], rerank_original_contents[i]))  # Store as tuple (chunk_index, doc)
    merged_chunk_indexes[file_uuid].append(rerank_chunk_indexes[i])

    # Store file name and path for each file UUID (since it's the same for the whole file)
    merged_file_names[file_uuid] = rerank_urls[i]
    merged_file_paths[file_uuid] = rerank_urls[i]

# Step 3: Combine docs for each file UUID based on sorted chunk indexes
final_docs = []
final_chunk_indexes = []
final_file_paths = []

for file_uuid, doc_data in merged_docs.items():
    # Sort docs by their chunk index
    sorted_docs = sorted(doc_data, key=lambda x: x[0])  # Sort by chunk_index

    # Combine docs into one string after sorting, adding 'line {chunk_index}' at the start of each chunk
    combined_doc = "\n".join([f"Line {chunk_index}: {doc}" for chunk_index, doc in sorted_docs])
    
    # Collect sorted chunk indexes
    sorted_chunk_indexes = [chunk_index for chunk_index, _ in sorted_docs]

    # Append to the final lists
    final_docs.append(combined_doc)
    final_chunk_indexes.append(sorted_chunk_indexes)
    final_file_paths.append(merged_file_paths[file_uuid])

# Step 4: Print or return the final merged results
print(len(final_docs))
final_docs_string = ""
for i in range(len(final_docs)):
    print(f"Path: {final_file_paths[i]}")
    print(f"Chunks: {final_chunk_indexes[i]}")
    print(f"Document:\n{final_docs[i]}")
    print("------------")
    
    final_docs_string += f"Url: {final_file_paths[i]} has the following content:\n\n{final_docs[i]}\n\n--- split line for next file ---\n\n"
    


1
Path: https://google.github.io/styleguide/pyguide.html
Chunks: [2, 3, 4, 5, 108]
Document:
Line 2: 3.19.15 Generics 3.19.16 Build Dependencies 4 Parting Words 1 Background Python is the main dynamic language used at Google. This style guide is a list of dos and don’ts for Python programs. To help you format code correctly, we’ve created a settings file for Vim. For Emacs, the default settings should be fine. Many teams use the Black or Pyink auto-formatter to avoid arguing over formatting. 2 Python Language Rules 2.1 Lint Run pylint over your code using this pylintrc. 2.1.1 Definition pylint is a tool for finding bugs and style problems in Python source code.
Line 108: pylint allows you to move the closing parenthesis to a new line and align with the opening one, but this is less readable. No: def my_method(self, other_arg: MyLongType | None, ) -> dict[OtherLongType, MyLongType]: ... As in the examples above, prefer not to break types. However, sometimes they are too long to be on a 

In [68]:
print(final_docs_string)

Url: https://google.github.io/styleguide/pyguide.html has the following content:

Line 2: 3.19.15 Generics 3.19.16 Build Dependencies 4 Parting Words 1 Background Python is the main dynamic language used at Google. This style guide is a list of dos and don’ts for Python programs. To help you format code correctly, we’ve created a settings file for Vim. For Emacs, the default settings should be fine. Many teams use the Black or Pyink auto-formatter to avoid arguing over formatting. 2 Python Language Rules 2.1 Lint Run pylint over your code using this pylintrc. 2.1.1 Definition pylint is a tool for finding bugs and style problems in Python source code.
Line 108: pylint allows you to move the closing parenthesis to a new line and align with the opening one, but this is less readable. No: def my_method(self, other_arg: MyLongType | None, ) -> dict[OtherLongType, MyLongType]: ... As in the examples above, prefer not to break types. However, sometimes they are too long to be on a single line

#### correlation enhancement

#### dfs bfs retrieval

### llm inference

In [57]:
from pydantic import BaseModel
import json


class QAResponse(BaseModel):
    final_answer: str
    urls: str
    
    
user_prompt = """
# Role
An Intelligent AI QA Assistant. You are supportive, helpful, logical, and careful.

# Workflow
1. Understand user's questions.
2. Find the answer in the extracted context.
3. Generate the thoughts, answer.
4. Discuss your thoughts, answer with 3 other experts in your team and have a step-by-step discussion with them.
5. Finalize the answer and response the user.

## Rules
1. Your response should be factual, precise, and concise.
2. Output the response in the below json format:
{{
    "final_answer": place your final detailed answer in a string format here,
    "urls": only list the filenames containing the answers with "," separator in a string format here
}}

## Initialization
As <Role>, please follow <Rules> strictly to solve the problem.
Your task is to solve the problem following <Workflow>.
Here is the extracted context by scanning and skimming. Notice that some of these information contain noise so you should trust them with caution:
```
{docs}
```

Problem: My question is: "{question}". Please also list all filenames which contain the answer.
Answer: To make sure to get the best answer, you are in a step-by-step discussion with 3 other experts to solve the problem.
"""

user_prompt = user_prompt.format(
    docs=final_docs_string,
    question=query
)

In [58]:
messages = [
    {"role": "user", "content": user_prompt},
    ]

response = openai_client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=messages,
    temperature=0.0,
    max_tokens=16000,
    response_format=QAResponse
    ).choices[0].message.content

response = json.loads(response)


In [59]:
print(response["final_answer"])



In [61]:
print(response["urls"])

https://google.github.io/styleguide/pyguide.html
