In [1]:
from itertools import chain
import re

In [2]:
def hsplit(text, sep, group=True, attach=True):
    """Flexible string splitting that retains the delimiter, unlike the
    built-in str.split() method.
    
    Parameters
    -----------
    text: str
        The input text to be split.
    sep: str
        The delimiter to be split on. It is matched literally (it is not a
        regular expression) and may be longer than one character.
    group: bool
        Specifies whether to group consecutive delimiters together (True),
        or to separate them (False).
    attach: bool
        Specifies whether to attach the delimiter to the string that precedes
        it (True), or to detach it so it appears in the output list as its own
        item (False).
    
    Returns
    --------
    list[str]
        The pieces of `text` in their original order. Empty strings are never
        included, and concatenating the pieces reproduces `text`.
    
    Raises
    -------
    ValueError
        If `sep` is the empty string.
    
    Examples
    ---------
    text = "Score -- Giants win 6-5"
    sep = '-'
    
    # Case 0.1: Delimiters are grouped together and attached to the preceding
    # word.
    >> hsplit(text, sep, group=True, attach=True)
    >> ['Score --', ' Giants win 6-', '5']
    
    # Case 0.2: Delimiters are grouped together but are detached from the
    # preceding word, instead appearing as their own item in the output list.
    >> hsplit(text, sep, group=True, attach=False)
    >> ['Score ', '--', ' Giants win 6', '-', '5']
    
    # Case 1.1: Delimiters are retained and attached to the preceding string.
    # If the delimiter occurs multiple times consecutively, only the first
    # occurrence is attached, and the rest appear as individual items in the
    # output list.
    >> hsplit(text, sep, group=False, attach=True)
    >> ['Score -', '-', ' Giants win 6-', '5']
    
    # Case 1.2: Delimiters are retained but are detached from the preceding
    # string. Each one appears as its own item in the output list.
    >> hsplit(text, sep, group=False, attach=False)
    >> ['Score ', '-', '-', ' Giants win 6', '-', '5']
    
    # Multi-character delimiters are matched as a whole.
    >> hsplit('one, two, three', ', ')
    >> ['one, ', 'two, ', 'three']
    """
    if sep == '':
        raise ValueError('empty separator')

    sep_re = re.escape(sep)

    # The whole delimiter is wrapped in a non-capturing group so that a
    # quantifier applies to the full string rather than to its last character.
    # The outer capturing group makes re.split() keep the delimiters.
    pattern = f'((?:{sep_re})+)' if group else f'({sep_re})'

    # re.split() with one capturing group always yields an odd-length list
    # that alternates [text, delimiter, text, ..., text]; text items may be ''.
    pieces = re.split(pattern, text)

    # Detached: delimiters are already their own items; just drop the empty
    # text chunks that sit next to leading/trailing/adjacent delimiters.
    if not attach:
        return [piece for piece in pieces if piece]

    # Attached: pad with an empty "delimiter" so the final text chunk has a
    # partner, then glue every text chunk to the delimiter that follows it.
    pieces.append('')
    return [chunk + delim
            for chunk, delim in zip(pieces[::2], pieces[1::2])
            if chunk or delim]

In [3]:
# Delimiter shared by every sample below.
sep = '*'

# Sample inputs, in increasing order of awkwardness: single interior
# delimiters; leading and trailing delimiters; runs of consecutive delimiters.
texts = [
    'I*went to the store* yesterday after* work*to*see* the walrus and it walked slowly* over to me* and sat.',
    '*I went*to *the store and* sat next to the window*',
    '**I went*to **the store and*** sat next to the window**',
]
text1, text2, text3 = texts

In [4]:
def test_output(tests, sep, **kwargs):
    """Print each sample string next to its str.split() and hsplit() results.

    Parameters
    -----------
    tests: iterable of str
        Sample strings to split.
    sep: str
        Delimiter handed to both str.split() and hsplit().
    **kwargs
        Forwarded unchanged to hsplit() (e.g. group, attach).

    Returns
    --------
    None
        Results are only written to stdout.
    """
    for sample in tests:
        print(sample)
        builtin_pieces = sample.split(sep)
        print('STANDARD:', builtin_pieces)
        hsplit_pieces = hsplit(sample, sep, **kwargs)
        # The blank line separates this sample's report from the next one.
        print('H:', hsplit_pieces, end='\n\n')

In [5]:
# Echo the raw sample strings, one per line, before splitting them.
for text in texts:
    print(text)

I*went to the store* yesterday after* work*to*see* the walrus and it walked slowly* over to me* and sat.
*I went*to *the store and* sat next to the window*
**I went*to **the store and*** sat next to the window**


In [6]:
# Delimiter runs kept together and glued to the preceding text.
test_output(texts, sep, group=True, attach=True)

I*went to the store* yesterday after* work*to*see* the walrus and it walked slowly* over to me* and sat.
STANDARD: ['I', 'went to the store', ' yesterday after', ' work', 'to', 'see', ' the walrus and it walked slowly', ' over to me', ' and sat.']
H: ['I*', 'went to the store*', ' yesterday after*', ' work*', 'to*', 'see*', ' the walrus and it walked slowly*', ' over to me*', ' and sat.']

*I went*to *the store and* sat next to the window*
STANDARD: ['', 'I went', 'to ', 'the store and', ' sat next to the window', '']
H: ['*', 'I went*', 'to *', 'the store and*', ' sat next to the window*']

**I went*to **the store and*** sat next to the window**
STANDARD: ['', '', 'I went', 'to ', '', 'the store and', '', '', ' sat next to the window', '', '']
H: ['**', 'I went*', 'to **', 'the store and***', ' sat next to the window**']



In [7]:
# Delimiter runs kept together but emitted as their own list items.
test_output(texts, sep, group=True, attach=False)

I*went to the store* yesterday after* work*to*see* the walrus and it walked slowly* over to me* and sat.
STANDARD: ['I', 'went to the store', ' yesterday after', ' work', 'to', 'see', ' the walrus and it walked slowly', ' over to me', ' and sat.']
H: ['I', '*', 'went to the store', '*', ' yesterday after', '*', ' work', '*', 'to', '*', 'see', '*', ' the walrus and it walked slowly', '*', ' over to me', '*', ' and sat.']

*I went*to *the store and* sat next to the window*
STANDARD: ['', 'I went', 'to ', 'the store and', ' sat next to the window', '']
H: ['*', 'I went', '*', 'to ', '*', 'the store and', '*', ' sat next to the window', '*']

**I went*to **the store and*** sat next to the window**
STANDARD: ['', '', 'I went', 'to ', '', 'the store and', '', '', ' sat next to the window', '', '']
H: ['**', 'I went', '*', 'to ', '**', 'the store and', '***', ' sat next to the window', '**']



In [8]:
# Each delimiter handled individually; it is glued to the text before it, if any.
test_output(texts, sep, group=False, attach=True)

I*went to the store* yesterday after* work*to*see* the walrus and it walked slowly* over to me* and sat.
STANDARD: ['I', 'went to the store', ' yesterday after', ' work', 'to', 'see', ' the walrus and it walked slowly', ' over to me', ' and sat.']
H: ['I*', 'went to the store*', ' yesterday after*', ' work*', 'to*', 'see*', ' the walrus and it walked slowly*', ' over to me*', ' and sat.']

*I went*to *the store and* sat next to the window*
STANDARD: ['', 'I went', 'to ', 'the store and', ' sat next to the window', '']
H: ['*', 'I went*', 'to *', 'the store and*', ' sat next to the window*']

**I went*to **the store and*** sat next to the window**
STANDARD: ['', '', 'I went', 'to ', '', 'the store and', '', '', ' sat next to the window', '', '']
H: ['*', '*', 'I went*', 'to *', '*', 'the store and*', '*', '*', ' sat next to the window*', '*']



In [9]:
# Each delimiter handled individually and emitted as its own list item.
test_output(texts, sep, group=False, attach=False)

I*went to the store* yesterday after* work*to*see* the walrus and it walked slowly* over to me* and sat.
STANDARD: ['I', 'went to the store', ' yesterday after', ' work', 'to', 'see', ' the walrus and it walked slowly', ' over to me', ' and sat.']
H: ['I', '*', 'went to the store', '*', ' yesterday after', '*', ' work', '*', 'to', '*', 'see', '*', ' the walrus and it walked slowly', '*', ' over to me', '*', ' and sat.']

*I went*to *the store and* sat next to the window*
STANDARD: ['', 'I went', 'to ', 'the store and', ' sat next to the window', '']
H: ['*', 'I went', '*', 'to ', '*', 'the store and', '*', ' sat next to the window', '*']

**I went*to **the store and*** sat next to the window**
STANDARD: ['', '', 'I went', 'to ', '', 'the store and', '', '', ' sat next to the window', '', '']
H: ['*', '*', 'I went', '*', 'to ', '*', '*', 'the store and', '*', '*', '*', ' sat next to the window', '*', '*']



In [10]:
# Built-in str.split() output for each sample, with a blank line between them.
for text in texts:
    print(text.split(sep), end='\n\n')

['I', 'went to the store', ' yesterday after', ' work', 'to', 'see', ' the walrus and it walked slowly', ' over to me', ' and sat.']

['', 'I went', 'to ', 'the store and', ' sat next to the window', '']

['', '', 'I went', 'to ', '', 'the store and', '', '', ' sat next to the window', '', '']



In [11]:
# Read both text files fully into memory as single strings.
# NOTE(review): paths are relative to the notebook's working directory and no
# encoding is passed, so the platform default applies — confirm this is intended.
with open('../data/lines100.txt', 'r') as f:
    lines100 = f.read()
    
with open('../data/lines5000.txt', 'r') as f:
    lines5000 = f.read()

In [12]:
# Run hsplit (grouped, attached) on the contents of lines5000.txt with '.' as
# the delimiter. The result is only kept in a scratch variable.
# NOTE(review): presumably a smoke/timing check — confirm.
tmp = hsplit(lines5000, '.', group=True, attach=True)

In [13]:
# Same input and delimiter, this time ungrouped and attached; overwrites the
# scratch variable `tmp`.
tmp = hsplit(lines5000, '.', group=False, attach=True)

In [14]:
# Built-in str.split() on the same text and delimiter; overwrites `tmp`.
# NOTE(review): presumably the baseline the hsplit runs are compared to — confirm.
tmp = lines5000.split('.')

In [15]:
def hpartition(text, sep):
    """Thin wrapper that delegates to ``text.partition(sep)``.

    Parameters
    -----------
    text: str
        The input text to be partitioned.
    sep: str
        The delimiter to partition on.

    Returns
    --------
    tuple
        Whatever ``text.partition(sep)`` returns. For a str this is the
        3-tuple (text before the first `sep`, `sep`, text after it), or
        (text, '', '') when `sep` does not occur.
    """
    parts = text.partition(sep)
    return parts