# In [None]:
import os
import re
from dotenv import load_dotenv
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DeepLake
from langchain.embeddings.openai import OpenAIEmbeddings
import deeplake


# Load environment variables from a local .env file before any of the clients
# further down are constructed.
# NOTE(review): presumably this supplies the API credentials the embedding and
# vector-store clients need - confirm which keys must be present in .env.
load_dotenv()


def is_code_file(filename):
    """Return True when *filename* ends with a recognised code-file extension.

    The match is a plain, case-sensitive suffix comparison.
    """
    # Extend this tuple with the extensions of any other code files to process.
    recognised_suffixes = ('.sol',)
    for suffix in recognised_suffixes:
        if filename.endswith(suffix):
            return True
    return False


def remove_comments(text):
    """Strip ``//`` line comments and ``/* ... */`` block comments from *text*.

    Quoted string literals are recognised first and copied through unchanged,
    so comment markers that appear inside a string (for example the ``//`` in
    ``"http://example.com"``) are not mistaken for the start of a comment.

    Each comment is replaced by the empty string; the newline that ends a
    line comment is kept.

    Args:
        text: Source code using C-style comment syntax.

    Returns:
        The source text with comments removed.
    """
    # Adjust these patterns according to the language of the code files.
    # String literals are limited to a single line so that a stray quote
    # cannot swallow comments on the following lines.
    double_quoted_string = r'"(?:[^"\\\n]|\\.)*"'
    single_quoted_string = r"'(?:[^'\\\n]|\\.)*'"
    single_line_comment = r'//[^\n]*'
    multi_line_comment = r'/\*[\s\S]*?\*/'
    pattern = (
        f"({double_quoted_string}|{single_quoted_string})"
        f"|{single_line_comment}|{multi_line_comment}"
    )

    def _keep_strings(match):
        # Group 1 is set only when a string literal matched; comments yield None.
        return match.group(1) or ""

    return re.sub(pattern, _keep_strings, text)


def remove_whitespace(text):
    """Collapse every whitespace run in *text* to one space and trim both ends."""
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()


def remove_string_literals(text):
    """Delete single- and double-quoted string literals from *text*.

    A literal runs from an opening quote to the next unescaped matching quote;
    backslash escapes inside it are skipped over. The whole literal, quotes
    included, is replaced by the empty string.
    """
    literal_patterns = (
        r"'([^'\\]|\\.)*'",   # single-quoted
        r'"([^"\\]|\\.)*"',   # double-quoted
    )
    any_literal = re.compile("|".join(literal_patterns))
    return any_literal.sub("", text)


def create_and_index_deeplake_dataset(embeddings, deeplake_path, root_dir):
    """Load every file under *root_dir* and index its chunks in a DeepLake store.

    Args:
        embeddings: Embedding object handed to ``DeepLake.from_documents``.
        deeplake_path: Passed to DeepLake as ``dataset_path``.
        root_dir: Directory walked recursively; every file found is loaded
            as UTF-8 text, regardless of extension.

    Returns:
        The object returned by ``DeepLake.from_documents``.
    """
    loaded_docs = []
    for folder, _subdirs, names in os.walk(root_dir):
        for name in names:
            file_path = os.path.join(folder, name)
            file_loader = TextLoader(file_path, encoding='utf-8')
            # The loader splits once here; the result is split again below.
            loaded_docs.extend(file_loader.load_and_split())

    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(loaded_docs)

    return DeepLake.from_documents(chunks, embeddings, dataset_path=deeplake_path)


# Usage example: index one source directory into a DeepLake dataset.
# NOTE: these statements sit at module top level, so running this cell/file
# executes the indexing call below immediately.
embeddings = OpenAIEmbeddings()
# Dataset path handed to DeepLake as `dataset_path`.
# NOTE(review): the 'hub://' prefix presumably points at a hosted dataset - confirm.
deeplake_path = 'hub://kevinpark/ds1'
# Alternative roots; uncomment exactly one to index a different subtree.
# root_dir = './clone-nayms'
# root_dir = './clone-nayms/test/'
# root_dir = './clone-nayms/src/diamonds/nayms/facets'
root_dir = './clone-nayms/src/erc20'

# `db` is the vector store returned by the helper defined above.
db = create_and_index_deeplake_dataset(embeddings, deeplake_path, root_dir)
