In [1]:
!python -m pip install redis tabulate haystack-ai google-ai-haystack sourcegraph==0.0.6 google-generativeai

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
[0mSuccessfully installed tabulate-0.9.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
%%capture
!python -m pip install --upgrade --force-reinstall protobuf

In [3]:
import os
from typing import List, Union

import google.generativeai as genai
import numpy as np
import redis
from dotenv import load_dotenv
from redis.commands.search.query import Query

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load key/value pairs from a .env file into the process environment so the
# os.environ lookups in the following cells can be resolved.
# NOTE(review): presumably the .env file defines REDIS_HOST, REDIS_PORT,
# REDIS_PASSWORD and GOOGLE_API_KEY — confirm.
load_dotenv()

True

In [5]:
# Redis connection used by the search helpers in this notebook.
# All settings are read from the environment; a missing variable raises KeyError here.
client = redis.Redis(
    host=os.environ['REDIS_HOST'],
    # Environment values are strings: convert explicitly so a malformed port
    # fails right here instead of later inside the connection layer.
    port=int(os.environ['REDIS_PORT']),
    password=os.environ['REDIS_PASSWORD'],
)

In [6]:
# Configure the google.generativeai SDK with the API key from the environment;
# a missing GOOGLE_API_KEY raises KeyError at this line.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

def get_embeddings(content: Union[str, List[str]], model: str = 'models/text-embedding-004'):
    """
    Embed text with a Google Generative AI embedding model.

    Args:
        content: A single string or a list of strings to embed. The value is
            forwarded unchanged to ``genai.embed_content``.
        model: Name of the embedding model. Defaults to
            ``'models/text-embedding-004'``.

    Returns:
        The ``'embedding'`` entry of the API response.
        NOTE(review): per the SDK documentation this is one vector for a
        string input and a list of vectors for a list input — confirm.
    """
    response = genai.embed_content(model=model, content=content)
    return response['embedding']

In [7]:
# Sample user question for exercising the retrieval pipeline.
# NOTE(review): presumably passed to draft_prompt in a later cell — not visible here.
query = "Training new tokenizer takes a lot time to complete. Also memory consumption seems pretty high"

In [8]:
def draft_prompt(query: str, chat_history: str, top_k: int = 2) -> str:
    """
    Build an LLM prompt from the code snippets most similar to a user question.

    The question is embedded with ``get_embeddings`` and used for a KNN vector
    search against the ``idx:codes_vss`` index. The names of the nearest
    snippets, and the names listed in their ``uses`` field, are then looked up
    by name in the same index and rendered into the prompt text.

    Args:
        query (str): The user question to embed and to answer.
        chat_history (str): Conversation so far; inserted verbatim into the prompt.
        top_k (int): Number of nearest neighbours to retrieve. Defaults to 2.

    Returns:
        str: A prompt containing the question, the chat history, the matching
        code snippets and the snippets they depend on. The snippet sections
        are empty when the corresponding lookup finds nothing.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    INDEX_NAME = "idx:codes_vss"

    top_k = int(top_k)
    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}")

    search_index = client.ft(INDEX_NAME)

    vector_search_query = (
        Query(f'(*)=>[KNN {top_k} @vector $query_vector AS vector_score]')
        .sort_by('vector_score')
        .return_fields('vector_score', 'id', 'name', 'definition', 'file_name', 'type', 'uses')
        # The default page size is 10; widen it so a larger top_k is not cut short.
        .paging(0, top_k)
        .dialect(2)
    )

    encoded_query = get_embeddings(query)
    vector_params = {
        "query_vector": np.array(encoded_query, dtype=np.float32).tobytes()
    }

    result_docs = search_index.search(vector_search_query, vector_params).docs

    related_items: List[str] = []
    dependencies: List[str] = []
    for doc in result_docs:
        related_items.append(doc.name)
        # A document without a `uses` field has no such attribute at all.
        uses = getattr(doc, "uses", None)
        if uses:
            dependencies.extend(
                name.strip() for name in uses.split(",") if name.strip()
            )

    # De-duplicate while keeping first-seen order so the generated queries are
    # deterministic, and drop dependencies already covered by the main results.
    related_items = list(dict.fromkeys(related_items))
    already_related = set(related_items)
    dependencies = [
        name for name in dict.fromkeys(dependencies) if name not in already_related
    ]

    def fetch_by_name(item_list):
        # An empty list would produce "@name:()", which is not a valid query,
        # so skip the round trip entirely.
        if not item_list:
            return []
        names_clause = " | ".join(item_list)
        name_query = (
            Query(f"@name:({names_clause})")
            .return_fields('id', 'name', 'definition', 'file_name', 'type')
            # Never return fewer than the default 10, but make room for every
            # requested name when there are more than that.
            .paging(0, max(10, len(item_list)))
        )
        return search_index.search(name_query).docs

    related_docs = fetch_by_name(related_items)
    dependency_docs = fetch_by_name(dependencies)

    def format_doc(doc):
        return (
            f"{'*' * 28} CODE SNIPPET {doc.id} {'*' * 28}\n"
            f"* Name: {doc.name}\n"
            f"* File: {doc.file_name}\n"
            f"* {doc.type.capitalize()} definition:\n"
            f"```python\n{doc.definition}\n```\n"
        )

    formatted_results_main = [format_doc(doc) for doc in related_docs]
    formatted_results_support = [format_doc(doc) for doc in dependency_docs]

    # chr(10) is "\n"; f-string expressions could not contain a backslash before Python 3.12.
    return (
        f"User Question: {query}\n\n"
        f"Current Chat History: \n{chat_history}\n\n"
        f"USE BELOW CODES TO ANSWER USER QUESTIONS.\n"
        f"{chr(10).join(formatted_results_main)}\n\n"
        f"SOME SUPPORTING FUNCTIONS AND CLASS YOU MAY WANT.\n"
        f"{chr(10).join(formatted_results_support)}"
    )