In [None]:
# Load variables from a .env file into the process environment before any
# client objects are created.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
from dotenv import load_dotenv
load_dotenv()

In [None]:
import os
import lancedb

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores import LanceDB

In [None]:
# Source directories of the eng-feed project to index, relative to the
# notebook's working directory.
aws_dir = "./eng-feed/src/aws/lib"
lambda_dir = "./eng-feed/src/aws/backend/lambda"
db_dir = "./eng-feed/src/aws/backend/db"
pages_dir = "./eng-feed/src/pages"
components_dir = "./eng-feed/src/components"

# Load every file under those directories as a raw, unsplit document.
# Chunking is left entirely to the single splitter below: calling
# loader.load_and_split() here would first chunk each file with LangChain's
# default splitter, and the second split would then operate on those
# pre-chunked pieces instead of on the original files.
docs = []
for directory in [aws_dir, lambda_dir, db_dir, pages_dir, components_dir]:
    if not os.path.isdir(directory):
        # os.walk yields nothing for a missing path, so without this message
        # a wrong working directory would silently produce an empty index.
        print(f"Skipping missing directory: {directory}")
        continue
    for dirpath, dirnames, filenames in os.walk(directory):
        for file in filenames:
            path = os.path.join(dirpath, file)
            try:
                loader = TextLoader(path, encoding="utf-8")
                docs.extend(loader.load())
            except Exception as e:
                # Best-effort load: files that cannot be read as UTF-8 text
                # are reported with their path and skipped, not fatal.
                print(f"Skipping {path}: {e}")

# Split the loaded files into chunks of at most 1000 characters, no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
code = text_splitter.split_documents(docs)

In [None]:
# Embedding client; used below both for the table's seed row and for
# embedding the split code chunks.
embeddings = OpenAIEmbeddings()

In [None]:
# Open (or create) the on-disk LanceDB database.
db = lancedb.connect('/tmp/lancedb')

# The table is created from one placeholder row; "overwrite" replaces any
# existing "code_files" table from a previous run.
# NOTE(review): nothing in this cell removes the placeholder row afterwards,
# so it presumably stays in the table alongside the real chunks — confirm
# whether it should be filtered out or deleted.
seed_text = "Hello World"
seed_rows = [
    {
        "vector": embeddings.embed_query(seed_text),
        "text": seed_text,
        "id": "1",
    },
]
table = db.create_table("code_files", data=seed_rows, mode="overwrite")

# Embed the code chunks and store them in the table through LangChain's wrapper.
code_search = LanceDB.from_documents(code, embeddings, connection=table)

In [None]:
# Retriever over the indexed code chunks.
retriever = code_search.as_retriever()

# Search options, applied in one update (same keys, same order, same values).
# NOTE(review): "distance_metric", "fetch_k" and "maximal_marginal_relevance"
# look like options from another vector store's examples — confirm that the
# LanceDB-backed retriever actually honours them rather than ignoring them.
retriever.search_kwargs.update(
    {
        "distance_metric": "cos",
        "fetch_k": 100,
        "maximal_marginal_relevance": True,
        "k": 10,
    }
)

# Chat model and retriever combined into a question-answering chain.
OpenAIModel = ChatOpenAI(model_name="gpt-3.5-turbo-0613")
qa = RetrievalQA.from_llm(retriever=retriever, llm=OpenAIModel)

In [None]:
# Natural-language question to ask about the indexed source files.
query = "How are users notified when there is a new post?"
# Left as a bare last expression so the notebook displays the chain's answer.
qa.run(query)