# Python Code

`PythonCodeTextSplitter` splits text along Python class and method definitions. It's implemented as a simple subclass of `RecursiveCharacterTextSplitter` with Python-specific separators. See the source code for the Python syntax expected by default.

1. How the text is split: by a list of Python-specific separators
2. How the chunk size is measured: by number of characters

In [1]:
from langchain.text_splitter import PythonCodeTextSplitter

In [2]:
# Sample source containing a class, a method and several top-level functions.
python_text = """
class Foo:

    def bar():
    
    
def foo():

def testing_func():

def bar():
"""
# Small chunks (30) with no overlap, so the short sample ends up in several pieces.
python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)

In [3]:
# Split the sample source into Document objects, one per chunk.
docs = python_splitter.create_documents([python_text])

['\n', 'c', 'l', 'a', 's', 's', ' ', 'Foo:\n', '\n', ' ', ' ', ' ', ' ', 'def ', 'ba', 'r():\n', ' ', ' ', ' ', ' ', '\n', ' ', ' ', ' ', ' ', '\n', 'def ', 'foo():\n', '\n', 'def ', 'tes', 'ting_func', '():\n', '\n', 'def ', 'ba', 'r():\n', '']

class 


In [5]:
import re
def _split_text(text: str, separator: str):
    # Now that we have the separator, split the text
    if separator:
        # The parentheses in the pattern keep the delimiters in the result.
        _splits = re.split(f'([{separator}])', text)
        splits = [_splits[i] + _splits[i + 1] for i in range(0, len(_splits) - 1, 2)]
        if len(_splits) % 2 == 1:
            splits += _splits[-1:]
    else:
        splits = list(text)
    return splits

In [6]:
# Try the helper on the sample source with a multi-character separator.
_split_text(python_text, "class")

['\nc',
 'l',
 'a',
 's',
 's',
 ' Foo:\n\n    def ba',
 'r():\n    \n    \ndef foo():\n\ndef tes',
 'ting_func',
 '():\n\ndef ba',
 'r():\n']

In [4]:
# Show the documents created by the splitter.
docs

[Document(page_content='class Foo:\n\n    def bar():', metadata={}),
 Document(page_content='def foo():\n\ndef tes', metadata={}),
 Document(page_content='ting_func():\n\ndef bar():', metadata={})]

In [3]:
# Split the same sample directly into a list of strings.
python_splitter.split_text(python_text)

['Foo:\n\n    def bar():', 'foo():\n\ndef testing_func():', 'bar():']

In [8]:
# Split on the first space only (maxsplit=1).
"foo bar".split(" ", 1)

['foo', 'bar']

In [10]:
# Split on the delimiter, discard empty pieces, and put the delimiter back
# on the end of every remaining piece.
d = " "
s = [f"{piece}{d}" for piece in filter(None, "foo bar".split(d))]

In [23]:
def _split_text(text: str, separator: str, keep_separator: bool = False):
    # Now that we have the separator, split the text
    if separator:
        if keep_separator:
            # The parentheses in the pattern keep the delimiters in the result.
            _splits = re.split(f'([{separator}])', text)
            splits = [_splits[i] + _splits[i + 1] for i in range(0, len(_splits) - 1, 2)]
            if len(_splits) % 2 == 1:
                splits += _splits[-1:]
        else:
            splits = text.split(separator)
    else:
        splits = list(text)
    return splits

0

In [28]:
import re

text = " This is an example. Another example! And a question ?"

# The parentheses in the pattern keep the delimiters in the result, so
# re.split returns [piece, delimiter, piece, delimiter, ..., piece].
splits = re.split(f'([{" "}])', text)

print(splits)

# Glue each delimiter onto the piece before it. The last piece has no
# delimiter after it, so it is appended on its own instead of being dropped;
# joining the result therefore reproduces ``text``.
splits = [splits[i] + splits[i + 1] for i in range(0, len(splits) - 1, 2)] + splits[-1:]


['', ' ', 'This', ' ', 'is', ' ', 'an', ' ', 'example.', ' ', 'Another', ' ', 'example!', ' ', 'And', ' ', 'a', ' ', 'question', ' ', '?']


In [27]:
# Show the pieces after the delimiters were re-attached.
splits

[' ',
 'This ',
 'is ',
 'an ',
 'example. ',
 'Another ',
 'example! ',
 'And ',
 'a ',
 'question ',
 '? ']