In [2]:
import os
import json
import utils
import shutil
import requests
import tempfile
import weaviate
import subprocess

from pathlib import Path
from langchain_core.documents import Document
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

In [3]:
language_map = {
    ".py": "python",
    ".js": "js",
    ".ts": "ts",
    ".java": "java",
    ".cpp": "cpp",
    ".c": "c",
    ".h": "c",
    ".hpp": "cpp",
    ".html": "html",
    ".css": "css",
    ".json": "json",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".md": "markdown",
    ".txt": "text",
}

language_splitters = {}
for ln in Language:
    try:
        language_splitters[ln.value] = RecursiveCharacterTextSplitter.from_language(ln)
    except Exception as e:
        print(f"Exception occurred with language {ln.value}: {str(e)}")

Exception occurred with language perl: Language Language.PERL is not implemented yet!


In [4]:
client = weaviate.connect_to_local()
db_index = "codes"
reset_db = False

if reset_db:
    client.collections.delete(db_index)
    collection = client.collections.create(db_index, description="Code snippets with NL descriptions")
else:
    collection = client.collections.get(db_index)


            Consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


### 먼저 zip파일 내 모든 파일을 읽어 langchain Document 객체화

In [5]:
temp_dir = tempfile.mkdtemp()
try: 
    subprocess.run(["unzip", "-q", "code_snippets/Flowise-main.zip", "-d", temp_dir])
except Exception as e:
    shutil.rmtree(temp_dir)
    raise ValueError(f"Cannot unzip file: {e}")

try:
    file_paths = [
        os.path.join(root, file)
        for root, _, files in os.walk(temp_dir)
        for file in files
    ]

    all_docs = []

    # 파일 읽기 병렬 처리
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = {
            executor.submit(utils.load_file, file_path): file_path for file_path in file_paths
        }

        for fut in as_completed(futures):
            docs = fut.result()
            if docs: 
                all_docs.extend(docs)

except Exception as e:
    raise Exception(f"Error: {e}")

finally:
    shutil.rmtree(temp_dir)

### 1. 길이 기반 청킹(기본)

In [None]:
text_splitter = RecursiveCharacterTextSplitter()

# Document 객체를 chunk화
# print(all_docs[0])
chunks = text_splitter.split_documents(all_docs)
chunks = [chunk for chunk in chunks if chunk.page_content]

chunks_for_weaviate = []
for chunk in chunks:
    chunks_for_weaviate.append(
        weaviate.classes.data.DataObject(
            properties={
                "text": chunk.page_content,
                "source": chunk.metadata.get("source", "unknown"),
                "language": chunk.metadata.get("language", "unknown"),
                "file_type": chunk.metadata.get("filetype", "unknown")
            },
            vector=utils.vectorize(chunk.page_content)
        ))

    if len(chunks_for_weaviate) % 1000 == 0:
        collection.data.insert_many(chunks_for_weaviate)
        chunks_for_weaviate = []

collection.data.insert_many(chunks_for_weaviate)

BatchObjectReturn(_all_responses=[UUID('6f9de768-8ef2-4716-8012-9132ae9fcea4'), UUID('19af7505-b41b-4151-9a56-b2d7fba07511'), UUID('b569545d-eef3-4673-9cb0-491848b36ef4'), UUID('93c4531a-f1a1-4ad4-ae29-adc7c36851ac'), UUID('42cb8b7f-1828-4349-8c92-cc3c5bc54084'), UUID('96a38345-1efd-4574-a9d3-6f39586355a6'), UUID('9382b33c-4d38-44a4-aca9-b53612c04f3b'), UUID('dc44c76e-1064-4488-bd63-25201412e5cc'), UUID('6c2e99e4-e68d-40fb-a435-58a25ac5d12a'), UUID('913af295-0b38-4fab-9ea5-e5e9e3ffd8bd'), UUID('450302f5-1d8a-433d-b6bb-831d99b7a4c0'), UUID('0c135997-d53b-4bd7-b70e-70141e4733eb'), UUID('d4bf52be-3712-4b03-82a4-80397d4f2fce'), UUID('a72d856f-a6ab-49c1-9877-15f4c21c7633'), UUID('00213f50-6351-48c1-944c-cd28a05de445'), UUID('f93abc3b-0df6-4d92-84d3-fc26951067cf'), UUID('64588252-d367-4662-affb-c98acfcff547'), UUID('d19bf5e5-74e7-4ba5-86a6-3e5af5bb94d8'), UUID('58b19fad-a8c4-42fd-b758-5d76820938a5'), UUID('03cd0d62-56e7-47a1-b75b-14ca4244b5dd'), UUID('866b7675-52a8-4ad5-8aa3-9138dc7adbb6'), 

In [None]:
query = "docker build"
query_vector = utils.vectorize(query)
search_out = collection.query.hybrid(query=query, vector=query_vector)
print(search_out.objects[0].properties["text"])

In [40]:
chunks[0].page_content

'# Build local monorepo image\n# docker build --no-cache -t  flowise .\n\n# Run image\n# docker run -d -p 3000:3000 flowise\n\nFROM node:20-alpine\nRUN apk add --update libc6-compat python3 make g++\n# needed for pdfjs-dist\nRUN apk add --no-cache build-base cairo-dev pango-dev\n\n# Install Chromium\nRUN apk add --no-cache chromium\n\n# Install curl for container-level health checks\n# Fixes: https://github.com/FlowiseAI/Flowise/issues/4126\nRUN apk add --no-cache curl\n\n#install PNPM globaly\nRUN npm install -g pnpm\n\nENV PUPPETEER_SKIP_DOWNLOAD=true\nENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser\n\nENV NODE_OPTIONS=--max-old-space-size=8192\n\nWORKDIR /usr/src\n\n# Copy app source\nCOPY . .\n\nRUN pnpm install\n\nRUN pnpm build\n\nEXPOSE 3000\n\nCMD [ "pnpm", "start" ]'

### 2. language-specific한 청킹

In [9]:
codesplit_db_index = "codesplit"
collection = client.collections.create(codesplit_db_index, description="Code snippets splitted by language")

In [None]:
chunks_for_weaviate = []
not_supported = 0
text_splitter = RecursiveCharacterTextSplitter()

for doc in all_docs:
    # print(doc)
    doc_language = doc.metadata["language"]

    if doc_language == "javascript": doc_language = "js"
    if doc_language == "typescript": doc_language = "ts"

    if doc_language in ["json", "text", "unknown", "yaml", "css"]: # json 등 일반 텍스트에 해당하면 -> 일단은 단순 길이 기반 청킹 수행
        chunks = text_splitter.create_documents([doc.page_content])

    elif doc_language in ["python", "js", "ts", "html", "c", "cpp", "csharp", "markdown"]: # 코드에 해당하는 경우 -> 코드 기반 청킹 수행
        sp_splitter = language_splitters[doc_language] # 해당하는 코드의 splitter를 가져옴
        chunks = sp_splitter.create_documents([doc.page_content])

    else:
        print(doc_language)
        chunks = []

    for chunk in chunks:
        chunks_for_weaviate.append(
        weaviate.classes.data.DataObject(
            properties={
                "text": chunk.page_content,
                "source": chunk.metadata.get("source", "unknown"),
                "language": chunk.metadata.get("language", "unknown"),
                "file_type": chunk.metadata.get("filetype", "unknown")
            },
            vector=utils.vectorize(chunk.page_content)
        ))

    if len(chunks_for_weaviate) and len(chunks_for_weaviate) % 1000 == 0:
        collection.data.insert_many(chunks_for_weaviate)
        chunks_for_weaviate = []

collection.data.insert_many(chunks_for_weaviate)

### 모델 호출: code chunk에 대한 description을 생성

In [None]:
text_splitter = RecursiveCharacterTextSplitter()

# Document 객체를 chunk화
# print(all_docs[0])
chunks = text_splitter.split_documents(all_docs)
chunks = [chunk for chunk in chunks if chunk.page_content]

desc_aug_chunks = []
system_prompt = "You are a code analyzer. Read the given code and write a description of it."
for chunk in chunks[:10]:
    model_out = utils.call_llm(system_prompt, chunk.page_content)
    description = model_out["content"]

    desc_aug_chunks.append(f"""### {description}\n{chunk.page_content}""")

print(desc_aug_chunks)