In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(disallowed_special=())

In [9]:
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

aws_dir = "./eng-feed/src/aws/lib"
lambda_dir = "./eng-feed/src/aws/backend/lambda"
db_dir = "./eng-feed/src/aws/backend/db"
pages_dir = "./eng-feed/src/pages"
components_dir = "./eng-feed/src/components"

docs = []
for directory in [aws_dir, lambda_dir, db_dir, pages_dir, components_dir]:
    for dirpath, dirnames, filenames in os.walk(directory):
        for file in filenames:
            try:
                loader = TextLoader(os.path.join(dirpath, file), encoding="utf-8")
                docs.extend(loader.load_and_split())
            except Exception as e:
                print(e)
                pass

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

    

Created a chunk of size 1321, which is longer than the specified 1000
Created a chunk of size 1455, which is longer than the specified 1000
Created a chunk of size 1833, which is longer than the specified 1000
Created a chunk of size 1066, which is longer than the specified 1000


In [10]:
from langchain.vectorstores import DeepLake

deeplake_username = "haffimazhar96"

db = DeepLake(
    dataset_path=f"hub://{deeplake_username}/code-index-llm",
    embedding=embeddings,
)
db.add_documents(texts)

Your Deep Lake dataset has been successfully created!


creating embeddings: 100%|██████████| 1/1 [00:11<00:00, 11.20s/it]

Dataset(path='hub://haffimazhar96/code-index-llm', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (91, 1)      str     None   
 metadata     json      (91, 1)      str     None   
 embedding  embedding  (91, 1536)  float32   None   
    id        text      (91, 1)      str     None   





['2decf00c-7b73-11ee-bd5d-e24bee7314b9',
 '2decf278-7b73-11ee-bd5d-e24bee7314b9',
 '2decf304-7b73-11ee-bd5d-e24bee7314b9',
 '2decf372-7b73-11ee-bd5d-e24bee7314b9',
 '2decf3d6-7b73-11ee-bd5d-e24bee7314b9',
 '2decf444-7b73-11ee-bd5d-e24bee7314b9',
 '2decf50c-7b73-11ee-bd5d-e24bee7314b9',
 '2decf570-7b73-11ee-bd5d-e24bee7314b9',
 '2decf5d4-7b73-11ee-bd5d-e24bee7314b9',
 '2decf638-7b73-11ee-bd5d-e24bee7314b9',
 '2decf6a6-7b73-11ee-bd5d-e24bee7314b9',
 '2decf70a-7b73-11ee-bd5d-e24bee7314b9',
 '2decf818-7b73-11ee-bd5d-e24bee7314b9',
 '2decf8d6-7b73-11ee-bd5d-e24bee7314b9',
 '2decf9bc-7b73-11ee-bd5d-e24bee7314b9',
 '2decfa20-7b73-11ee-bd5d-e24bee7314b9',
 '2decfb38-7b73-11ee-bd5d-e24bee7314b9',
 '2decfc6e-7b73-11ee-bd5d-e24bee7314b9',
 '2decfd9a-7b73-11ee-bd5d-e24bee7314b9',
 '2decfdfe-7b73-11ee-bd5d-e24bee7314b9',
 '2decfec6-7b73-11ee-bd5d-e24bee7314b9',
 '2decff2a-7b73-11ee-bd5d-e24bee7314b9',
 '2decff84-7b73-11ee-bd5d-e24bee7314b9',
 '2decffe8-7b73-11ee-bd5d-e24bee7314b9',
 '2ded004c-7b73-

In [11]:
db = DeepLake(
    dataset_path=f"hub://{deeplake_username}/code-index-llm",
    read_only=True,
    embedding=embeddings,
)

ResourceNotFoundException: Dataset at path hub://haffimazhar96/code-llm is scheduled for deletion.

In [31]:
retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs["fetch_k"] = 100
retriever.search_kwargs["maximal_marginal_relevance"] = True
retriever.search_kwargs["k"] = 10

In [32]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

model = ChatOpenAI(model_name="gpt-3.5-turbo-0613")  # switch to 'gpt-4'
qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)

In [33]:
questions = [
    "What does this code repo do?",
    "Where is this app hosted?",
    "How do users get notified when there is a new post?",
]
chat_history = []

for question in questions:
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")

-> **Question**: What does this code repo do? 

**Answer**: This code repository defines a CDK stack called "DevFeedStack" that sets up various AWS resources and configurations. The stack includes the following components:

1. Creates an SQS queue and an SNS topic.
2. Defines a Lambda function named "FetchAndInsertPosts" and schedules it to run every day at 8:00 AM.
3. Defines a Lambda function named "NotifyUsers" and schedules it to run every day at 9:00 AM (or every Friday at 9:00 AM in production environment).
4. Grants permissions to the Lambda functions to access the necessary resources, such as RDS instance and SQS queue.
5. Creates a Lambda function named "SendEmail" for sending emails and subscribes the SNS topic to it.
6. Sets up an SQS event source for the "NotifyUsers" Lambda function to process messages from the queue.
7. Creates an S3 bucket for storing assets and sets a bucket policy to allow public read access to all objects.
8. Deploys assets to the S3 bucket.
9. Export

In [35]:
new_question = [
    "Which class has the code for aws stack?"
]

for question in new_question:
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result["answer"]))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")

-> **Question**: Which class has the code for aws stack? 

**Answer**: The class that contains the code for the AWS stack is `DevFeedStack`. 

