In [8]:
import dotenv
# Read environment variables from the backend's .env file.
# Kept as the cell's last expression so the boolean result is displayed.
dotenv.load_dotenv(dotenv_path="../backend/.env")

True

In [9]:
import os
from tqdm import tqdm
from unixcoder import UnixcoderEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings

### Load code and chunk it 

In [10]:
# Load the files
# Walk the processed-data directory and keep every Python / Markdown file path.
SOURCE_SUFFIXES = ('.py', '.md')
all_files = []
for root, dirs, files in os.walk("data/processed/", topdown=True):
    # str.endswith accepts a tuple, so one call covers all wanted extensions.
    all_files += [os.path.join(root, f) for f in files if f.endswith(SOURCE_SUFFIXES)]

# Guard the example so an empty directory reports "0 files" instead of raising IndexError.
print(f'Has {len(all_files)} files, e.g.: {all_files[0] if all_files else "n/a"}')

# Read the files and create documents
texts = []
for file in all_files:
    try:
        # open() is inside the try so an unreadable file is reported and skipped
        # rather than aborting the whole cell; the encoding is explicit so the
        # result does not depend on the platform's locale default.
        with open(file, encoding="utf-8") as f:
            file_content = f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading file {file}: {e}")
        continue

    # Empty files are skipped: they would produce documents with no content.
    if file_content:
        texts.append(Document(page_content=file_content, metadata={"filename": file}))

# Same guard as above for the document preview.
print(f"Has {len(texts)} documents, e.g. {str(texts[0])[:50] if texts else 'n/a'}")

Has 2370 files, e.g.: data/processed/lanarky-main/tests/test_applications.py
Has 2085 documents, e.g. page_content='import pytest\nfrom fastapi.testclie


In [11]:
# Split every document on blank lines ("\n\n", treated as a literal string,
# not a regex) with a target chunk size of 1000 and an overlap of 100.
# (A language-aware variant, language=Language.PYTHON, was tried and left disabled.)
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    is_separator_regex=False,
    chunk_size=1000,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(texts)

# NOTE: this is the total number of *characters* across all chunks; it is used
# downstream as a rough stand-in for the token count (presumably an
# over-estimate, since a token usually spans several characters — TODO confirm).
approx_tokens = sum(len(chunk.page_content) for chunk in chunks)

Created a chunk of size 1116, which is longer than the specified 1000
Created a chunk of size 1206, which is longer than the specified 1000
Created a chunk of size 1070, which is longer than the specified 1000
Created a chunk of size 1103, which is longer than the specified 1000
Created a chunk of size 1394, which is longer than the specified 1000
Created a chunk of size 1096, which is longer than the specified 1000
Created a chunk of size 1086, which is longer than the specified 1000
Created a chunk of size 2883, which is longer than the specified 1000
Created a chunk of size 1357, which is longer than the specified 1000
Created a chunk of size 2357, which is longer than the specified 1000
Created a chunk of size 1428, which is longer than the specified 1000
Created a chunk of size 1366, which is longer than the specified 1000
Created a chunk of size 2642, which is longer than the specified 1000
Created a chunk of size 1039, which is longer than the specified 1000
Created a chunk of s

In [12]:
# Rough cost estimate: approx_tokens scaled by 0.0001 per 1000 units
# (presumably the OpenAI embedding price per 1K tokens — verify against current pricing).
estimated_cost = round(approx_tokens / 1000 * 0.0001, 2)
chunk_count = len(chunks)
print(f"Embedding cost with OpenAI: {estimated_cost}$")
print(f"Number of chunks: {chunk_count}")

Embedding cost with OpenAI: 0.67$
Number of chunks: 6027


In [13]:
# for i in [0, 1, 100, 101, 1000, 1001]: 
#     print(chunks[i].page_content)
#     print('-' * 80)

### Embed it

In [3]:
def embedd(embedding_function, chunks, persist_directory):
    """Build a Chroma vector store from the given chunks.

    Args:
        embedding_function: Embedding object handed to ``Chroma.from_documents``.
        chunks: Documents to embed and store.
        persist_directory: Directory path passed through to Chroma as
            ``persist_directory``.

    Returns:
        Whatever ``Chroma.from_documents`` returns (the vector store).
    """
    return Chroma.from_documents(
        chunks,
        embedding_function,
        persist_directory=persist_directory,
    )

In [6]:
# Embed all chunks with UnixcoderEmbeddings, using ./unixcoder_embeddings as the persist directory.
db_unix = embedd(
    embedding_function=UnixcoderEmbeddings(),
    chunks=chunks,
    persist_directory="./unixcoder_embeddings",
)

Embedding documents: 6027


100%|██████████| 6027/6027 [01:07<00:00, 89.85it/s] 


In [14]:
# Embed the same chunks with OpenAIEmbeddings, using ./openai_embeddings as the persist directory.
db_openai = embedd(
    embedding_function=OpenAIEmbeddings(),
    chunks=chunks,
    persist_directory="./openai_embeddings",
)