In [8]:
import re
from xml.dom import minidom
from xml.etree import ElementTree as ET

In [4]:
# Sample passage used as tokenizer input. The backslash in "40\\%" is written as an
# escaped backslash: a bare "\%" is an invalid escape sequence in a normal string
# literal and triggers a SyntaxWarning on recent Python versions. The runtime value
# (a literal backslash followed by "%") is unchanged.
text = """Analysis Process: When first working with CoreFlowVis, Brian thought that starting with as much data as possible was a good idea. He queried for all events on the company’s website for a period of 6 months resulting in a dataset of more than 300K sequences and 5 million events. Unfortunately, he found that there were no frequent patterns for more than 40\\% of the sequences, indicating that the sequences were very heterogeneous. For the remaining sequences, the help page appears many times as the milestone. Brian was not interested in help pages, so he decided to focus on sequences landing on a portion of the website describing a particular product.
Pattern Quality: With the sharpened focus on the input data, CoreFlowVis shows visualizations that aligned well with Brian’s knowledge. Brian had defined a funnel of important pages previously in other tools, and he was amazed that CoreFlow was able to automatically identify these pages as milestones. He commented
“this is perfect” as the visualization also showed frequent pages visited outside the funnel and after the last page in the funnel.
Insights Discovered: The icicle plot helped Brian find a new user segment that he wasn’t tracking already. By grouping all sequences by the first event in the sequence, Brian was able to isolate existing customers who were getting to the website from a link inside of the product. It turned out that 25% of the traffic was from this group of users and understanding their behavior was critical to understanding the traffic on the website.
Brian was also able to see that 25% of visitors were switching from one payment plan to another. This finding helped him confirm what the company was seeing in the sales department.
"""

In [5]:
def tokenize_text(text):
    """Split ``text`` into sentences at sentence-ending periods.

    A period counts as a sentence boundary only when it is followed by
    whitespace or by the end of the text, so periods embedded in a token
    (``3.5``, ``v1.2``, ``example.com``) do not cut a sentence in half.
    The boundary period itself is dropped, each sentence is stripped of
    surrounding whitespace, and fragments that end up empty are discarded.

    Only ``.`` is treated as a terminator; ``?`` and ``!`` are left in place.

    Args:
        text: The passage to split.

    Returns:
        A list of non-empty sentence strings, in their original order and
        without their trailing periods.
    """
    # The lookahead leaves the following whitespace unconsumed, so only the
    # period itself is removed at each boundary.
    fragments = re.split(r'\.(?=\s|$)', text)

    # Trim each fragment, then drop the ones that were only whitespace.
    stripped = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in stripped if fragment]

In [6]:
# Split the sample passage into a list of period-delimited sentences
sentences = tokenize_text(text)

In [7]:
# Show every tokenized sentence next to its 1-based position
for i, sentence in enumerate(sentences, start=1):
    print(f"Sentence {i}: {sentence}")

Sentence 1: Analysis Process: When first working with CoreFlowVis, Brian thought that starting with as much data as possible was a good idea
Sentence 2: He queried for all events on the company’s website for a period of 6 months resulting in a dataset of more than 300K sequences and 5 million events
Sentence 3: Unfortunately, he found that there were no frequent patterns for more than 40\% of the sequences, indicating that the sequences were very heterogeneous
Sentence 4: For the remaining sequences, the help page appears many times as the milestone
Sentence 5: Brian was not interested in help pages, so he decided to focus on sequences landing on a portion of the website describing a particular product
Sentence 6: Pattern Quality: With the sharpened focus on the input data, CoreFlowVis shows visualizations that aligned well with Brian’s knowledge
Sentence 7: Brian had defined a funnel of important pages previously in other tools, and he was amazed that CoreFlow was able to automaticall

In [9]:
def sentences_to_xml(sentences):
    """Serialize sentences into a pretty-printed XML document string.

    Builds a ``<sentences>`` root holding one ``<input>`` child per item,
    then round-trips the tree through ``minidom`` to get indented output.

    Args:
        sentences: Iterable of sentence strings; each becomes the text of
            one ``<input>`` element, in iteration order.

    Returns:
        The XML document as a ``str``, indented with four spaces per level.
    """
    document_root = ET.Element("sentences")

    # One <input> child per sentence, carrying the sentence as its text
    for item in sentences:
        ET.SubElement(document_root, "input").text = item

    # ElementTree serializes the tree; minidom re-parses it for pretty printing
    serialized = ET.tostring(document_root)
    dom = minidom.parseString(serialized)
    return dom.toprettyxml(indent="    ")

In [10]:
# Convert sentences to XML
xml_output = sentences_to_xml(sentences)

# Print or save the result
print(xml_output)

# Optionally save to file. The XML declaration in xml_output names no encoding,
# so XML parsers will assume UTF-8; write the file as UTF-8 explicitly rather
# than relying on the platform's default text encoding.
with open('sentences.xml', 'w', encoding='utf-8') as f:
    f.write(xml_output)

<?xml version="1.0" ?>
<sentences>
    <input>Analysis Process: When first working with CoreFlowVis, Brian thought that starting with as much data as possible was a good idea</input>
    <input>He queried for all events on the company’s website for a period of 6 months resulting in a dataset of more than 300K sequences and 5 million events</input>
    <input>Unfortunately, he found that there were no frequent patterns for more than 40\% of the sequences, indicating that the sequences were very heterogeneous</input>
    <input>For the remaining sequences, the help page appears many times as the milestone</input>
    <input>Brian was not interested in help pages, so he decided to focus on sequences landing on a portion of the website describing a particular product</input>
    <input>Pattern Quality: With the sharpened focus on the input data, CoreFlowVis shows visualizations that aligned well with Brian’s knowledge</input>
    <input>Brian had defined a funnel of important pages previou