# Split code

In [1]:
# NOTE(review): `chunk_size` from gitdb.fun is not referenced as a name in the
# visible cells (it only appears as a keyword argument) — this looks like an
# accidental IDE auto-import that adds a gitdb dependency; confirm and remove.
from gitdb.fun import chunk_size
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

In [2]:
# Get the full list of supported languages (the string value of each enum member).
[language.value for language in Language]

['cpp',
 'go',
 'java',
 'kotlin',
 'js',
 'ts',
 'php',
 'proto',
 'python',
 'rst',
 'ruby',
 'rust',
 'scala',
 'swift',
 'markdown',
 'latex',
 'html',
 'sol',
 'csharp',
 'cobol',
 'c',
 'lua',
 'perl',
 'haskell',
 'elixir',
 'powershell']

In [3]:
# Check which separators are used for the given language (here: Python).
RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)

['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']

# Python

In [4]:
# Sample Python source to split: one function definition plus a top-level call.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

hello_world()
"""

# Build a splitter that uses the Python separators, with chunk_size=50 and
# no overlap between neighbouring chunks.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON, chunk_size=50, chunk_overlap=0
)

In [5]:
# Split the sample source into Document chunks and display them.
python_docs = python_splitter.create_documents(texts=[PYTHON_CODE])
python_docs

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'),
 Document(metadata={}, page_content='hello_world()')]

In [6]:
# Print the text of every chunk; the custom `end` writes a divider line after
# each one so the chunk boundaries are visible in the output.
for doc in python_docs:
    print(doc.page_content, end="\n========================\n")

def hello_world():
    print("Hello, World!")
========================
hello_world()
========================


# JS

In [7]:
# Sample JavaScript source to split: one function declaration plus a top-level call.
JS_CODE = """
function helloWorld() {
  console.log("Hello, World!");
}

helloWorld();
"""

# Build a splitter that uses the JavaScript separators, with chunk_size=50 and
# no overlap between neighbouring chunks.
js_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.JS, chunk_size=50, chunk_overlap=0
)

# Split the sample source into Document chunks and display them.
js_docs = js_splitter.create_documents(texts=[JS_CODE])
js_docs

[Document(metadata={}, page_content='function helloWorld() {'),
 Document(metadata={}, page_content='console.log("Hello, World!");\n}'),
 Document(metadata={}, page_content='helloWorld();')]