In [1]:
# Loader
import glob
import os

path = './test_repo'
code_files = glob.glob(os.path.join(path, '**/*.py'), recursive=True)
code_files = [f for f in code_files if 'non-utf8-encoding.py' not in f]

print(code_files)

['./test_repo/user_utils.py', './test_repo/corporate.py', './test_repo/user_management.py']


In [2]:
# Code Parser
import os
from constants import Language
from treesitter.treesitter import Treesitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from transformers import AutoTokenizer


programming_language = "python"
chunk_size = 24 # tokens
chunk_overlap = 0
documents = []
tokenizer = AutoTokenizer.from_pretrained('hkunlp/instructor-large')

for code_file in code_files:
    with open(code_file, "r", encoding="utf-8") as file:
        file_bytes = file.read().encode()

        treesitter_parser = Treesitter.create_treesitter(Language.PYTHON)
        treesitterNodes = treesitter_parser.parse(file_bytes)

        for node in treesitterNodes:
            method_source_code = node.method_source_code
            filename = os.path.basename(code_file)

           # # Tokens as unit
            code_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                tokenizer,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
            splitted_documents = code_splitter.split_text(method_source_code)
            for splitted_document in splitted_documents:
                # print(splitted_document)
                # print ("-------------------")
                document = Document(
                    page_content=splitted_document,
                    metadata={
                        "filename": filename,
                        "method_name": node.name,
                    },
                )
                documents.append(document)


  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors


In [3]:
# Normal Embedding
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large")

  embeddings = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large")
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# # Instruction Embedding

# from InstructorEmbedding import INSTRUCTOR

# embeddings = INSTRUCTOR('hkunlp/instructor-xl')

# codes = documents
# instruction = "Represent the Python codes:"

# embeddings.encode([[instruction, codes]])

documents

[Document(metadata={'filename': 'user_utils.py', 'method_name': 'LOG'}, page_content='LOG: structlog.stdlib.BoundLogger ='),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'LOG'}, page_content='structlog.get_logger()'),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'FORGOT_PW_PREFIX'}, page_content='FORGOT_PW_PREFIX ='),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'FORGOT_PW_PREFIX'}, page_content="'forgot_pw:'"),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'GS_ADDRESSES'}, page_content='GS_ADDRESSES = (\n    # Americas'),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'GS_ADDRESSES'}, page_content="IPv4Network('12.47.208.0/24'),"),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'GS_ADDRESSES'}, page_content="IPv4Network('69.74.251.0/24'),"),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'GS_ADDRESSES'}, page_content="IPv4Network('199.29.247.0/27'),"),
 Documen

In [5]:
# Index: FAISS
from langchain_community.vectorstores import FAISS

db = FAISS.from_documents(documents, embeddings)
vector_retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 8})

In [6]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
keyword_retriever = BM25Retriever.from_documents(documents)
keyword_retriever.k = 8

In [7]:
from langchain.retrievers import EnsembleRetriever

ensemble_retriever = EnsembleRetriever(retrievers=[vector_retriever, keyword_retriever], weights=[0.5, 0.5])

In [8]:
# Similarity Search
question = "gs ip whitelist"
docs = db.similarity_search(question, k=5)

docs

[Document(metadata={'filename': 'user_utils.py', 'method_name': 'current_ip_address_is_gs'}, page_content='def current_ip_address_is_gs():'),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'blocked_for_gs'}, page_content='current_ip_address_is_gs() or'),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'current_ip_address_is_gs'}, page_content='for gs_address in GS_ADDRESSES:'),
 Document(metadata={'filename': 'corporate.py', 'method_name': 'reset_gs_user_password'}, page_content="'gs':"),
 Document(metadata={'filename': 'user_utils.py', 'method_name': 'blocked_for_gs'}, page_content="LOG.info(\n                'blocked_for_gs',")]

In [9]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationSummaryMemory


prompt = ChatPromptTemplate.from_template(
      """
      <s> [INST] You are an expert programmer for 
      question-answering tasks. Use the following pieces of retrieved
      context to answer the question, and use relevant code snippet to help answer where possible. 
      If you don't know the answer, just say that you don't know.
      Use retrieved code snippet to help explaining when appropriate.
      Unless explicitly stated in the context, do not make up words for abbreviated names.
      Use three sentences maximum and keep the answer concise.[/INST] </s> 
      [INST] Question: {question} 
      Context: {context} 
      Answer: [/INST]
      """
      )

chat_model = ChatOllama(model="codellama:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), temperature=0)

memory = ConversationSummaryMemory(
    llm=chat_model,
    temperature=0,
    memory_key="chat_history", 
    return_messages=True,
)

qa = ConversationalRetrievalChain.from_llm(
    chat_model,
    retriever=ensemble_retriever,
    # memory=memory
)


result = qa({'question': "Explain GS Addresses", 'chat_history': []})


  memory = ConversationSummaryMemory(
  result = qa({'question': "Explain GS Addresses", 'chat_history': []})



GS Addresses refer to the IP addresses or logged-in users of Goldsman Sachs (GS) in the context of a Python script. The script is using the `GS_ADDRESSES` variable, which contains a tuple of IP addresses and user names for active GS users. The script is generating new passwords for these users to support cases where GS users need to access certain resources or systems.

The script is also using the `corp_plan_id='gs'` argument to select endpoints for GS masked users. This argument is used to specify the type of user account being created, with 'gs' indicating a non-IBD (Individual Broker Dealer) GS user.

The script is also using the `today = TODAY()` function to get the current date and time, which is then used to generate new passwords for the active GS users.

The function that must be called to update the user's email is `update_email`. This function takes two arguments: `old_email` and `new_email`, which are the current and new email addresses of the user, respectively. The function updates the user's email in both the database and Stripe.

It's important to note that this function is only called when a user's email address needs to be updated, such as when a user changes their email address or when a user is created with an email address.