In [33]:
# https://stackoverflow.com/questions/34805790/how-to-avoid-nltks-sentence-tokenizer-splitting-on-abbreviations

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
sentence_splitter = PunktSentenceTokenizer(punkt_param)
text = "is THAT what you mean, Mrs. Hussey?"
sentences = sentence_splitter.tokenize(text)

sentences

['is THAT what you mean, Mrs. Hussey?']

In [34]:
from nltk.tokenize import sent_tokenize
# from watermarkers.SemStamp.sampling_utils import tokenize_sentences

import re
from typing import *


def handle_bullet_points(sentences: List[str]) -> List[str]:
    new_sentences = []
    digit_pattern = re.compile(r'^\*?\*?\d+\.$')
    i = 0
    num_sentences = len(sentences)
    if num_sentences == 0:
        return sentences
    # print(f"Num sentences: {num_sentences}")
    while i < num_sentences - 1:
        if digit_pattern.match(sentences[i].strip()):
            modified_sentence = f"{sentences[i].strip()} {sentences[i + 1]}"
            new_sentences.append(modified_sentence)
            # print(f"Adding {modified_sentence}")
            i += 1  # Skip the next element as it's already added
        else:
            new_sentences.append(sentences[i])
        i += 1
        # print(f"i={i}")
    # Add the last sentence as well, if we don't want to skip it
    if i == num_sentences - 1:
        new_sentences.append(sentences[-1])
    
    return new_sentences

def tokenize_sentences(text: str) -> List[str]:
    sentences = sent_tokenize(text)
    processed_sentences = handle_bullet_points(sentences)
    return processed_sentences


In [35]:
txt = "They work with their colleagues in the Senate and the House of Representatives to draft, debate, and pass legislation."

sents = sent_tokenize(txt)
sents

['They work with their colleagues in the Senate and the House of Representatives to draft, debate, and pass legislation.']

In [36]:
txt = """As a helpful personal assistant, a United States Senator has several key responsibilities that shape the country's laws, policies, and direction. Here are the main responsibilities of an American Senator:

1."""

sents = tokenize_sentences(txt)

sents

["As a helpful personal assistant, a United States Senator has several key responsibilities that shape the country's laws, policies, and direction.",
 'Here are the main responsibilities of an American Senator:\n\n1.']

In [37]:
txt = """2. Boran"""

sents = tokenize_sentences(txt)

sents

['2. Boran']

In [38]:
sample_text_1 = """
1. Apples
2. Bananas
3. Milk
4. Bread
5. Eggs
"""

sample_text_2 = """
1. Welcome and Introductions
2. Review of Previous Meeting Minutes
3. Project Updates
4. Budget Review
5. Q&A Session
"""

sample_text_3 = """
1. Preheat the oven to 350°F (175°C).
2. Grease and flour a 9x9 inch pan.
3. In a medium bowl, mix together flour, sugar, and baking powder.
4. Add eggs, milk, and butter; beat until smooth.
5. Pour batter into the prepared pan and bake for 30-35 minutes.
"""

sample_text_4 = """
1. High-resolution display
2. Long-lasting battery life
3. Fast processor
4. Multiple camera lenses
5. 5G connectivity
"""

sents = tokenize_sentences(sample_text_1)
print(sents)

sents = tokenize_sentences(sample_text_2)
print(sents)

sents = tokenize_sentences(sample_text_3)
print(sents)

sents = tokenize_sentences(sample_text_4)
print(sents)

['1. Apples\n2.', 'Bananas\n3.', 'Milk\n4.', 'Bread\n5.', 'Eggs']
['1. Welcome and Introductions\n2. Review of Previous Meeting Minutes\n3.', 'Project Updates\n4.', 'Budget Review\n5.', 'Q&A Session']
['1. Preheat the oven to 350°F (175°C).', '2. Grease and flour a 9x9 inch pan.', '3. In a medium bowl, mix together flour, sugar, and baking powder.', '4. Add eggs, milk, and butter; beat until smooth.', '5. Pour batter into the prepared pan and bake for 30-35 minutes.']
['1. High-resolution display\n2.', 'Long-lasting battery life\n3.', 'Fast processor\n4.', 'Multiple camera lenses\n5.', '5G connectivity']


### Test Cases

In [39]:
import unittest

# Define the test cases
class TestTokenizeSentences(unittest.TestCase):

    def test_bullet_points_with_different_formats(self):
        text = """
        1. First bullet point
        2) Second bullet point
        a. Third bullet point
        b) Fourth bullet point
        - Fifth bullet point
        """
        expected = [
            "1. First bullet point",
            "2) Second bullet point",
            "a. Third bullet point",
            "b) Fourth bullet point",
            "- Fifth bullet point"
        ]
        result = tokenize_sentences(text)
        self.assertEqual(result, expected)
    
    def test_mixed_bullet_points_and_regular_sentences(self):
        text = """
        This is a regular sentence.
        1. This is a bullet point.
        This is another regular sentence following a bullet point.
        """
        expected = [
            "This is a regular sentence.",
            "1. This is a bullet point. This is another regular sentence following a bullet point."
        ]
        result = tokenize_sentences(text)
        self.assertEqual(result, expected)
    
    def test_single_bullet_point_with_trailing_sentence(self):
        text = """
        1. Single bullet point
        Trailing sentence not part of the bullet point.
        """
        expected = [
            "1. Single bullet point Trailing sentence not part of the bullet point."
        ]
        result = tokenize_sentences(text)
        self.assertEqual(result, expected)
    
    def test_bullet_points_with_punctuation(self):
        text = """
        1. This is the first bullet point.
        2. This is the second bullet point; with more text.
        3. Third bullet point: continues here.
        """
        expected = [
            "1. This is the first bullet point.",
            "2. This is the second bullet point; with more text.",
            "3. Third bullet point: continues here."
        ]
        result = tokenize_sentences(text)
        self.assertEqual(result, expected)

# Run the tests
unittest.TextTestRunner().run(unittest.makeSuite(TestTokenizeSentences))

F.FF
FAIL: test_bullet_points_with_different_formats (__main__.TestTokenizeSentences)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_1659993/1629283226.py", line 22, in test_bullet_points_with_different_formats
    self.assertEqual(result, expected)
AssertionError: Lists differ: ['1. [13 chars]point\n        2) Second bullet point\n       [84 chars]int'] != ['1. [13 chars]point', '2) Second bullet point', 'a. Third bu[57 chars]int']

First differing element 0:
'1. First bullet point\n        2) Second bullet point\n        a.'
'1. First bullet point'

Second list contains 3 additional elements.
First extra element 2:
'a. Third bullet point'

- ['1. First bullet point\n        2) Second bullet point\n        a.',
+ ['1. First bullet point',
+  '2) Second bullet point',
-  'Third bullet point\n'
?                     --

+  'a. Third bullet point',
?   +++                   +

-  '        b) Fourth bullet p

<unittest.runner.TextTestResult run=4 errors=0 failures=3>

In [40]:
tokenize_sentences("""**Positive Impacts:**

1. **Job Creation:** A climate-focused bill could lead to an increase in jobs related to renewable energy, sustainable infrastructure, to green technology, and environmental conservation. This could boost local employment rates and stimulate economic growth.
2. **Investment Attraction:** Bipartisan support for climate action can attract investors seeking to capitalize on emerging clean technologies and sustainable industries. This influx of capital can revitalize local economies and create new business opportunities.
3. **Infrastructure Development:** Climate-resilient infrastructure projects, such as sea walls, levees, and green roofs can generate construction jobs and stimulate local spending.
4. **Innovation Hubs:** Regions with strong research institutions or existing cleantech industries may become hubs for innovation, to climate-related R&D, driving economic growth through knowledge-based entrepreneurship.

**Challenges and Risks:**

1. **Transition Costs:** The shift away from fossil fuels and towards cleaner energy sources can result in short-term job losses and economic disruption in regions heavily reliant on traditional energy industries.
2. **Regulatory Burden:** Stricter environmental regulations may increase compliance costs for businesses, potentially affecting their competitiveness and profitability.
3.""")



['**Positive Impacts:**\n\n1.',
 '**Job Creation:** A climate-focused bill could lead to an increase in jobs related to renewable energy, sustainable infrastructure, to green technology, and environmental conservation.',
 'This could boost local employment rates and stimulate economic growth.',
 '2. **Investment Attraction:** Bipartisan support for climate action can attract investors seeking to capitalize on emerging clean technologies and sustainable industries.',
 'This influx of capital can revitalize local economies and create new business opportunities.',
 '3. **Infrastructure Development:** Climate-resilient infrastructure projects, such as sea walls, levees, and green roofs can generate construction jobs and stimulate local spending.',
 '4. **Innovation Hubs:** Regions with strong research institutions or existing cleantech industries may become hubs for innovation, to climate-related R&D, driving economic growth through knowledge-based entrepreneurship.',
 '**Challenges and Ri

In [41]:
tokenize_sentences("2. Boran is nice.")

['2. Boran is nice.']

In [42]:
tokenize_sentences("""A bipartisan bill on climate change could have significant implications for local economies,{\em}. Here are some potential effects:

**Positive Implications:**

1. **Job Creation:** A climate-focused bill could lead to the creation of new jobs in industries related to renewable energy, sustainable infrastructure,{\em}, and eco-friendly technologies. This could stimulate local economic growth and increase employment opportunities.
2. **Investment in Infrastructure:** The bill may incentivize investments in green infrastructure, such as wind farms,{\em}, solar panels, and green buildings, which can boost local economies through construction and maintenance contracts.
3. **Increased Tourism:** As local economies transition to cleaner and more sustainable practices, A bipartisian bill could enhance the attractiveness of tourist destinations,{\em}, promoting eco-tourism and supporting local businesses that cater to environmentally conscious visitors.
4. **Innovation Hub:** A bipartisan bill on climate change could position certain regions as hubs for clean tech innovation,{\em}, attracting entrepreneurs, startups and investors, to develop and commercialize climate-resilient solutions.

**Negative Implications:**

1. **Transition Costs:** The shift towards a low-carbon economy may require significant upfront investments in new technologies and training for workers,{\em}. These costs could be a burden on local economies in the short term.""")

['A bipartisan bill on climate change could have significant implications for local economies,{\\em}.',
 'Here are some potential effects:\n\n**Positive Implications:**\n\n1.',
 '**Job Creation:** A climate-focused bill could lead to the creation of new jobs in industries related to renewable energy, sustainable infrastructure,{\\em}, and eco-friendly technologies.',
 'This could stimulate local economic growth and increase employment opportunities.',
 '2. **Investment in Infrastructure:** The bill may incentivize investments in green infrastructure, such as wind farms,{\\em}, solar panels, and green buildings, which can boost local economies through construction and maintenance contracts.',
 '3. **Increased Tourism:** As local economies transition to cleaner and more sustainable practices, A bipartisian bill could enhance the attractiveness of tourist destinations,{\\em}, promoting eco-tourism and supporting local businesses that cater to environmentally conscious visitors.',
 '4. **I

In [43]:
len(tokenize_sentences("""A bipartisan bill on climate change could have significant implications for local economies across the United States. Here are some potential effects:

**Job Creation and Economic Growth:**

1. **Renewable Energy Industry Boom:** A bipartisan bill on climate change could lead to increased investments in renewable energy sources like solar, an wind power, range from manufacturing and installation jobs.
2. **Green Infrastructure Development:** Local economies may see an influx of funds for green infrastructure projects, such as sea walls, levees, and green roofs, creating construction and engineering job opportunities.
3. **Sustainable Transportation:** Investments in electric vehicle charging stations, public transportation systems, and pedestrian-friendly/bike-friendly infrastructure could create new jobs and stimulate local economic growth.

**Cost Savings and Efficiency:**

1. **Energy Efficiency Upgrades:** Buildings and homes may undergo energy-efficient retrofits, reducing energy consumption and lowering utility bills for residents and businesses.
2. **Water Conservation Measures:** Implementing water-saving technologies and practices can help reduce water waste, decreasing treatment costs and conserving this valuable resource.
3. **Waste Reduction and Recycling:** Increased recycling rates and reduced waste sent to landfills can lead to cost savings for municipalities and create new revenue streams through recycling programs.

**Innovation and Competitiveness:**

1."""))

10

In [44]:
tokenize_sentences("""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful personal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Explain the significance of the Pythagorean Theorem in geometry.<|eot_id|><|start_header_id|>assistant<|end_header_id|> The Pythagorean Theorem! This fundamental concept in geometry has been a cornerstone of mathematics for centuries, a crucial tool for problem-solving and critical thinking.

So, A² + B² = C²... but what does it really mean?

**Significance:**

1. **Distance and Length Calculations:** The Pythagorean Theorem enables you to calculate the length of the hypotenuse (the side opposite the right angle) of a right-angled triangle. This is essential in various fields like architecture, in designing buildings, and spaces, infrastructure projects.

2. **Triangle Properties:** It helps in understanding the relationships between the sides and angles of triangles,201D visualization, spatial reasoning, and trigonometry.

3. **Real-world Applications:** You'll find its uses in physics, engineering, computer graphics,195D modeling, navigation, and more.

4. **Mathematical Connections:** The theorem links algebraic equations with geometric shapes. This intersection fosters deeper insights into mathematical structures and patterns.

5. **Historical Importance:** Ancient civilizations like the Babylonians, Greeks,'t discover and develop this concept, demonstrating early mathematicians' ingenuity.

6. **Problem-Solving Powerhouse:** It provides an efficient method for solving triangular problems quickly.""")

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful personal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nExplain the significance of the Pythagorean Theorem in geometry.<|eot_id|><|start_header_id|>assistant<|end_header_id|> The Pythagorean Theorem!',
 'This fundamental concept in geometry has been a cornerstone of mathematics for centuries, a crucial tool for problem-solving and critical thinking.',
 'So, A² + B² = C²... but what does it really mean?',
 '**Significance:**\n\n1.',
 '**Distance and Length Calculations:** The Pythagorean Theorem enables you to calculate the length of the hypotenuse (the side opposite the right angle) of a right-angled triangle.',
 'This is essential in various fields like architecture, in designing buildings, and spaces, infrastructure projects.',
 '2. **Triangle Properties:** It helps in understanding the relationships between the sides and angles of triangles,201D visualization, spatial reasoning, and 

In [45]:
tokenize_sentences("""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful personal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Describe the main responsibilities of an American Senator.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

As a helpful personal assistant, of course!

An American Senator has several key responsibilities in the United States Congress. Here are the main ones:

1. **Legislative Duties**: Senators introduce, the bills they sponsor or co-sponsor, debate, and vote on legislation that affects the country. They work to pass laws that benefit their constituents and the nation as a whole.

2. **Representation**: Senators represent the interests of their state and its citizens in the Senate.
""")

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful personal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nDescribe the main responsibilities of an American Senator.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAs a helpful personal assistant, of course!',
 'An American Senator has several key responsibilities in the United States Congress.',
 'Here are the main ones:\n\n1.',
 '**Legislative Duties**: Senators introduce, the bills they sponsor or co-sponsor, debate, and vote on legislation that affects the country.',
 'They work to pass laws that benefit their constituents and the nation as a whole.',
 '2. **Representation**: Senators represent the interests of their state and its citizens in the Senate.']

In [47]:
len(tokenize_sentences(""" system

You are a helpful personal assistant.user

Describe the main responsibilities of an American Senator.assistant

As a trusted personal assistant, an American Senator has several key responsibilities that shape the country's laws, the federal budget, policy and governance Here are the main responsibilities of an American Senator:

1. **Legislation**: Senators introduce, debate,199 votes on bills that can become laws. They work with colleagues from both parties to draft, the language of proposed laws.

2. **Representation**: Each Senator represents their state's  interests in the US Senate. They must balance local concerns with national priorities.

3. **"""))

6