In [1]:
# Loader
import glob
import os

path = './test_repo'
code_files = glob.glob(os.path.join(path, '**/*.py'), recursive=True)
code_files = [f for f in code_files if 'non-utf8-encoding.py' not in f]

print(code_files)

['./test_repo/corporate.py', './test_repo/user_management.py', './test_repo/user_utility.py']


In [2]:
# Parser
import os
from constants import Language
from treesitter.treesitter import Treesitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from transformers import AutoTokenizer


programming_language = "python"
chunk_size = 30
chunk_overlap = 0
documents = []
tokenizer = AutoTokenizer.from_pretrained('hkunlp/instructor-large')

for code_file in code_files:
    with open(code_file, "r", encoding="utf-8") as file:
        file_bytes = file.read().encode()

        treesitter_parser = Treesitter.create_treesitter(Language.PYTHON)
        treesitterNodes = treesitter_parser.parse(file_bytes)

        for node in treesitterNodes:
            method_source_code = node.method_source_code
            filename = os.path.basename(code_file)

            # # Character as unit
            # code_splitter = RecursiveCharacterTextSplitter.from_language(
            #     language=programming_language,
            #     chunk_size=chunk_size,
            #     chunk_overlap=chunk_overlap,
            # )

           # # Tokens as unit
            code_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                tokenizer,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
            
            splitted_documents = code_splitter.split_text(method_source_code)
            for splitted_document in splitted_documents:

                # length = len(tokenizer(splitted_document))
                # print(length)
                # print(node.name)
                # print ("-------------------")
                document = Document(
                    page_content=splitted_document,
                    metadata={
                        "filename": filename,
                        "method_name": node.name,
                    },
                )
                documents.append(document)


  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (581 > 512). Running this sequence through the model will result in indexing errors


In [3]:
# Embedding
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl")

  embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-xl")
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# Index: FAISS
from langchain_community.vectorstores import FAISS

db = FAISS.from_documents(documents, embeddings)
vector_retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 3})

In [5]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 3


In [6]:
from langchain.retrievers import EnsembleRetriever

ensemble_retriever = EnsembleRetriever(retrievers=[vector_retriever, keyword_retriever], weights=[0.7, 0.3])

In [7]:
# Similarity Search
question = "gs ip whitelist"
docs = db.similarity_search(question, k=5)

docs

[Document(metadata={'filename': 'user_utility.py', 'method_name': 'blocked_for_gs'}, page_content='GS IP addresses or logged-in GS users.\n    """'),
 Document(metadata={'filename': 'user_utility.py', 'method_name': 'blocked_for_gs'}, page_content='if current_ip_address_is_gs() or current_user.is_gs:'),
 Document(metadata={'filename': 'user_utility.py', 'method_name': 'current_ip_address_is_gs'}, page_content='"""Return whether the current request is coming from a GS IP address."""'),
 Document(metadata={'filename': 'user_utility.py', 'method_name': 'current_ip_address_is_gs'}, page_content='def current_ip_address_is_gs():'),
 Document(metadata={'filename': 'user_utility.py', 'method_name': 'current_ip_address_is_gs'}, page_content='for gs_address in GS_ADDRESSES:')]

In [8]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationSummaryMemory


prompt = ChatPromptTemplate.from_template(
      """
      <s> [INST] You are an expert programmer for 
      question-answering tasks. Use the following pieces of retrieved
      context to answer the question, and use relevant code snippet to help answer where possible. 
      If you don't know the answer, just say that you don't know. 
      For abbreviated name, do not guess what the abbreviated name is using external knowledge. Just state as it is.
      Use three sentences maximum and keep the answer concise.[/INST] </s> 
      [INST] Question: {question} 
      Context: {context} 
      Answer: [/INST]
      """
      )

chat_model = ChatOllama(model="codellama:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), temperature=0)

memory = ConversationSummaryMemory(
    llm=chat_model, memory_key="chat_history", return_messages=True
)

qa = ConversationalRetrievalChain.from_llm(
    chat_model,
    retriever=ensemble_retriever
)


result = qa({'question': "explain what the function current_ip_address_is_gs is doing", 'chat_history': []})


  memory = ConversationSummaryMemory(
  result = qa({'question': "explain what the function current_ip_address_is_gs is doing", 'chat_history': []})


The `current_ip_address_is_gs` function appears to be checking whether the IP address of the current request is from a Google Server (GS) IP address. It does this by comparing the IP address against a list of known GS IP addresses, which are stored in the `IPv4Network` object with the prefix `119.42.139.0/24`.

The function returns a boolean value indicating whether the current request is coming from a GS IP address or not. If the IP address matches one of the known GS IP addresses, it will return `True`, otherwise it will return `False`.

It's worth noting that this function appears to be part of a larger system for managing user accounts and access control. The comment suggests that the function is used to re-activate a user's corp account if they are coming from a GS IP address, which may indicate that it is being used in some kind of authentication or authorization process.

In [9]:

# from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
# from langchain_community.llms import Ollama
# from langchain.chains import ConversationalRetrievalChain
# from langchain.memory import ConversationSummaryMemory


# chat_model = Ollama(
#                 base_url="http://localhost:11434",
#                 model="codellama:13b",
#                 callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
#             )

# memory = ConversationSummaryMemory(
#     llm=chat_model, memory_key="chat_history", return_messages=True
# )

# qa = ConversationalRetrievalChain.from_llm(
#     chat_model,
#     retriever=retriever
# )

# result = qa({'question': "how do I add user?", 'chat_history': []})
# print(result['answer'])

In [10]:
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_ollama.llms import OllamaLLM


# B_INST, E_INST = "[INST]", "[/INST]"
# B_SYS, E_SYS = "<<SYS»>\n", "\n<</SYS>>\n\n"

# system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
# Read the given context before answering questions and think step by step. If you can not answer a user question based on the provided context, inform the user. Do not use any other information for answering user"""
    
# instruction = """
# Context: {context}
# User: {question}"""

# def prompt_format(instruction=instruction, system_prompt=system_prompt):
#     SYSTEM_PROMPT = B_SYS + system_prompt + E_SYS
#     prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
#     return prompt_template

# template = prompt_format()

# prompt = ChatPromptTemplate.from_template(template)

# model = OllamaLLM(model="codellama:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), temperature=0)

# memory = ConversationSummaryMemory(
#     llm=chat_model, memory_key="chat_history", return_messages=True
# )

# qa = ConversationalRetrievalChain.from_llm(
#     model,
#     retriever=retriever
# )

# result = qa({'question': "how do I add user?", 'chat_history': []})
# # result['answer']
