In [1]:
import os
import re
from transformers import pipeline
from llama_index.core import SimpleDirectoryReader

transcript_directory = "/mnt/c/Users/edeep/RAG/RAG_Codebase/project3_se-final_with_openai/project3_se-final_with_openai/"

# Initialize the summarization pipeline with an explicitly specified model
model_name = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=model_name)

def generate_summary(code_content):
    # Define the maximum number of tokens
    max_tokens = 1024

    comments = re.findall(r'# (.+)', code_content)
    comments_text = ' '.join(comments)

    functions_with_returns = re.findall(r'def (\w+)\((.*?)\).*?:\s*return (.+)', code_content, re.DOTALL)
    function_calls_and_returns = [f"{name}({args}) returns {ret.strip()}" for name, args, ret in functions_with_returns]

    summary = ""
    
    # Check if the total tokens in the input exceed the max_tokens limit
    total_tokens = len(comments_text.split()) + sum(len(call_return.split()) for call_return in function_calls_and_returns)
    if total_tokens > max_tokens:
        return "Input exceeds maximum token limit, summarization skipped."

    # Dynamic max_length adjustment based on the input length
    def get_max_length(text):
        return min(130, max(30, len(text.split()) // 2))

    if comments_text:
        try:
            summary_comments = summarizer(comments_text, max_length=get_max_length(comments_text), min_length=30, do_sample=False)
            summary += summary_comments[0]['summary_text']
        except Exception as e:
            summary += " Error summarizing comments: " + str(e)
        
    for call_return in function_calls_and_returns:
        if len(call_return.split()) < max_tokens:
            try:
                summary_function = summarizer(call_return, max_length=get_max_length(call_return), min_length=30, do_sample=False)
                summary += ' ' + summary_function[0]['summary_text']
            except Exception as e:
                summary += " Error summarizing function returns: " + str(e)

    return summary if summary else "No data to summarize."

# Lambda function for metadata with error handling
filename_fn = lambda filename: {
    'file_path': os.path.join(transcript_directory, filename),
    'code_summary': generate_summary(open(os.path.join(transcript_directory, filename), 'r', errors='replace').read())
}

# Load documents with updated lambda function
documents = SimpleDirectoryReader(transcript_directory, filename_as_id=True, file_metadata=filename_fn, recursive=True).load_data()


  from .autonotebook import tqdm as notebook_tqdm
Your max_length is set to 30, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Your max_length is set to 30, but your input_length is only 11. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)
Your max_length is set to 30, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 30, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e

In [None]:
print(len(documents))

109


In [2]:

# find documents with unique file paths
unique_file_paths = set([doc.metadata['file_path'] for doc in documents])
print(len(unique_file_paths))
# print(unique_file_paths)


90


In [None]:


# get metadata for the first document
metadata = documents[108].metadata
print(metadata['file_path'])
print(metadata['code_summary'])


print(documents[108].text)


/mnt/c/Users/edeep/RAG/RAG_Codebase/project3_se-final_with_openai/project3_se-final_with_openai/vendor-service/run.py
 run.py from flask_migrate import Migrate migrate = Migrate(app, db) migrate migrate = migrate . migrate .  App. save_session(self, *args, **kwargs) returns jsonify({"status": "OK", 200)
# run.py
from application import create_app, db
from application import models
# from flask_migrate import Migrate

app = create_app()
# migrate = Migrate(app, db)
from flask import jsonify

from flask import g
from flask.sessions import SecureCookieSessionInterface
from flask_login import user_loaded_from_header


class CustomSessionInterface(SecureCookieSessionInterface):
    """Prevent creating session from API requests."""

    def save_session(self, *args, **kwargs):
        if g.get('login_via_header'):
            return
        return super(CustomSessionInterface, self).save_session(*args,
                                                                **kwargs)


app.session_in

In [3]:
from sentence_transformers import SentenceTransformer, util
embedder = SentenceTransformer('all-MiniLM-L6-v2')


query = "Explain what product service does"
# Initialize the question-answering pipeline with an explicitly specified model
qa_pipeline = pipeline("summarization", model=model_name)




In [4]:
import torch    
from llama_index.core import VectorStoreIndex,  Settings, PromptTemplate
from llama_index.llms.ollama import Ollama  # Assuming you corrected the import here

# Retieve documents relevant to query based on meta data by filtering the documents
# vector for query and code summar or file path should be close
# to each other
def find_relevant_documents(documents, query):
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    document_embeddings = embedder.encode([doc.metadata['code_summary'] for doc in documents], convert_to_tensor=True)
    # scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]
    # # scores = [(doc.metadata['code_summary'], util.pytorch_cos_sim(query_embedding, doc['embedding'])[0][0].item()) for doc in documents]
    # scores.sort(key=lambda x: x[1], reverse=True)  # Sort by score
    similarities = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]

    # Get sorted scores and their indices in descending order
    sorted_scores, sorted_indices = torch.sort(similarities, descending=True)
    
    # Filter to retain only documents with a score greater than 0.5
    filtered_indices = sorted_indices[sorted_scores > 0.3]

    # Retrieve the corresponding documents
    relevant_docs = [documents[idx] for idx in filtered_indices]

    return relevant_docs

    # return [doc for doc, score in scores if score > 0.5]  # 

# now answer query based on actual content in the relevant documents
# append doc contents 
relevant_docs = find_relevant_documents(documents, query)
print("Relevant documents:")
print(relevant_docs)

print(len(relevant_docs))

# Concatenate contents of relevant documents
concatenated_content = " ".join(doc.text for doc in relevant_docs)


Relevant documents:
[Document(id_='/mnt/c/Users/edeep/RAG/RAG_Codebase/project3_se-final_with_openai/project3_se-final_with_openai/frontend/application/frontend/api/ProductClient.py', embedding=None, metadata={'file_path': '/mnt/c/Users/edeep/RAG/RAG_Codebase/project3_se-final_with_openai/project3_se-final_with_openai/frontend/application/frontend/api/ProductClient.py', 'code_summary': ' application/frontend/api/ProductClient.py product = response.json() product = product . Product is a product of a'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='# application/frontend/api/ProductClient.py\nimport requests\n\n\nclass ProductClient:\n\n    @staticmethod\n    def get_products(vendor_id=None, min_price=None, max_price=None):\n        params = {}\n       

In [5]:
from llama_index.core import VectorStoreIndex, PromptTemplate
from llama_index.llms.ollama import Ollama  
import os
from langchain.embeddings import HuggingFaceEmbeddings  # Correct class from HuggingFace
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core import Settings, VectorStoreIndex, PromptTemplate, SimpleDirectoryReader
from llama_index.llms.ollama import Ollama  # Assuming you corrected the import here

# print("Answer: ", answer['answer'], "Score: ", answer['score'])

# Create query pipeline 
# create index of concatenated content
# search index for query
# return relevant content
# Indexing documents
# Load the embedding model
def load_embedding_model(model_name="sentence-transformers/all-mpnet-base-v2", device="cuda"):
    model_kwargs = {"device": device}
    encode_kwargs = {"normalize_embeddings": True}
    return HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

lc_embedding_model = load_embedding_model()
embed_model = LangchainEmbedding(lc_embedding_model)
Settings.embed_model = embed_model

index = VectorStoreIndex.from_documents(relevant_docs)

# Setting up LLM and querying capabilities
llm = Ollama(model="mistral", request_timeout=60.0)
Settings.llm = llm
query_engine = index.as_query_engine(streaming=True, similarity_top_k=4)

# Template for queries
qa_prompt_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information above, I want you to think step by step to answer the query in a crisp manner, in case you don't know the answer say 'I don't know!'.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
query_engine.update_prompts({"response_synthesizer:text_qa_template": qa_prompt_tmpl})

query = "Explain what product service does"
# Querying the index
response = query_engine.query(query)
print(response)

 product service is one of the microservices in the given Python Flask based project. Its primary function is to handle all the product-related operations such as creating, retrieving, updating, and deleting products. Users can interact with this service through API endpoints provided by it. In the context information above, you can see that the product database is populated using the product-service API endpoints.


In [6]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Use the PrettyPrinter to print the data
pp.pprint(response.response_txt)

(' product service is one of the microservices in the given Python Flask based '
 'project. Its primary function is to handle all the product-related '
 'operations such as creating, retrieving, updating, and deleting products. '
 'Users can interact with this service through API endpoints provided by it. '
 'In the context information above, you can see that the product database is '
 'populated using the product-service API endpoints.')
