In [90]:
# Load the dictionary: read the entire content of words.txt as a single
# string and split it on whitespace, giving one list entry per word.
# The with-block closes the file handle as soon as the read finishes
# instead of leaving it open until the garbage collector gets to it.
with open('words.txt', 'r') as words_file:
    words = words_file.read().split()

# dictionary lookup used by the segmenter to decide whether a candidate is a word
def is_word(w):
    'return True if w is one of the entries in words, otherwise False'
    # membership test against the module-level words collection
    return True if w in words else False

# public entry point: takes the string s and an optional collection of
# positions (nobreak) where the string must not be split
def segment(s, nobreak=None):
    '''segment s into words avoiding breaks at positions in nobreak

    s       -- the string to segment.
    nobreak -- optional iterable of integer positions; no word is allowed
               to end at any of these indices of s. Omitted or None means
               there are no restrictions.

    Returns the words joined by single spaces, or None when segment_r
    reports that no segmentation exists.
    '''
    # None is used as the "not given" sentinel rather than a mutable []
    # default. The positions are snapshotted into a frozenset so that the
    # repeated membership tests done during the search are constant-time and
    # still work when the caller hands in a one-shot iterable.
    forbidden = frozenset(nobreak) if nobreak is not None else frozenset()

    # run the recursive segmentation starting at position 0
    split = segment_r(s, 0, forbidden)

    # segment_r signals failure with None; pass that on explicitly
    if split is None:
        return None

    # join all the segmented words with a single space
    return ' '.join(split)

# recursive worker that segments the given string s from position i onwards
def segment_r(s, i, nobreak, _dead_ends=None):
    '''segment s[i:] into words avoiding breaks at nobreak positions

    s          -- the full string being segmented.
    i          -- index at which the still-unsegmented suffix starts.
    nobreak    -- container of positions at which no word may end
                  (anything that supports the `in` operator).
    _dead_ends -- internal bookkeeping: set of start indices already shown
                  to have no segmentation. Callers should leave it unset.

    Returns the list of words covering s[i:] (an empty list when
    i == len(s)), or None when no segmentation exists. Shorter first words
    are tried before longer ones, and the first segmentation found in that
    order is returned.
    '''
    # a fresh set per top-level call, shared by all recursive calls below it
    if _dead_ends is None:
        _dead_ends = set()

    # base case: the whole string has been consumed, nothing left to segment
    if i == len(s):
        return []

    # for a fixed s and nobreak the outcome depends only on i, so a start
    # position that failed once will fail again; skipping it keeps the
    # backtracking from re-exploring the same suffix over and over
    if i in _dead_ends:
        return None

    # j starts at i+1 so that the candidate s[i:j] has at least one character
    for j in range(i + 1, len(s) + 1):
        # a word is not allowed to end at a forbidden position
        if j in nobreak:
            continue

        candidate = s[i:j]
        if is_word(candidate):
            # try to segment the rest of the string after this word
            rest = segment_r(s, j, nobreak, _dead_ends)
            if rest is not None:
                # prepend this word to the segmentation of the remainder
                return [candidate] + rest

    # no split point worked: remember the failure and report it
    _dead_ends.add(i)
    return None

In [91]:
# same string under progressively stricter no-break constraints
for nobreak in ([], [3], [3, 11]):
    print(segment("sundayissunny", nobreak))

sun day is sun ny
sunday is sun ny
sunday is sunny


In [92]:
# one string, several no-break sets: each forbidden position forces a different split
for nobreak in ([], [3, 7], [7], [5], [3, 5]):
    print(segment("whatistruth", nobreak))

wha ti st ruth
what is truth
wha tis truth
wha tis truth
what is truth
