In [1]:
from datasets import load_dataset
import spacy
import re 
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [54]:
ds = load_dataset("roneneldan/TinyStories")
tiny_train = ds['train']
tiny_val = ds['validation']

In [59]:
tiny_train['text'][2]

'One day, a little fish named Fin was swimming near the shore. He saw a big crab and wanted to be friends. "Hi, I am Fin. Do you want to play?" asked the little fish. The crab looked at Fin and said, "No, I don\'t want to play. I am cold and I don\'t feel fine."\n\nFin felt sad but wanted to help the crab feel better. He swam away and thought of a plan. He remembered that the sun could make things warm. So, Fin swam to the top of the water and called to the sun, "Please, sun, help my new friend feel fine and not freeze!"\n\nThe sun heard Fin\'s call and shone its warm light on the shore. The crab started to feel better and not so cold. He saw Fin and said, "Thank you, little fish, for making me feel fine. I don\'t feel like I will freeze now. Let\'s play together!" And so, Fin and the crab played and became good friends.'

In [3]:
subjects = tiny_train["text"][0:3]

## Spacy Exploration

In [4]:
nlp_model = spacy.load("en_core_web_sm")

test = nlp_model(subjects[0])

for sentence in test.sents:
    for token in sentence:
        print(token.pos_)
    break

NUM
NOUN
PUNCT
DET
ADJ
NOUN
VERB
PROPN
VERB
DET
NOUN
ADP
PRON
NOUN
PUNCT


## Clause separator

In [52]:
class ClauseSeparator:
    def __init__(self, size = 'small'):
        if size == 'small':
            self.nlp = spacy.load("en_core_web_sm")
        elif size == 'large':
            self.nlp = spacy.load("en_core_web_trf")
        else:
            print("Choose an appropriate spaCy model. Ex install: python -m spacy download en_core_web_sm")
            self.nlp = None
    
    def clause_split(self, text):
        """
        Find clause boundaries based on conjunctions and punctuation.    
        Note we don't need to add periods to the rules because we iterate through
        the sentences in the documents. 
        """
        if not self.nlp:
            return None
        
        doc = self.nlp(text)
        clauses = []
        
        for sentence in doc.sents:
            
            sent_clauses = []
            current_tokens = []
            
            for token in sentence:
                current_tokens.append(token.text)

                #check for what decides a new clause
                if (token.dep_ in ['cc', 'mark'] or  #coordinating/subordinating conjunctions
                    token.text in [',', ';', ':'] or
                    token.pos_ == 'SCONJ'):  #subordinating conjunction
                    
                    if len(current_tokens) > 1:  #dont create single-word clauses
                        clause_text = ' '.join(current_tokens[:-1]).strip() #everything before curent token - that will go to the next clause
                        if clause_text:
                            sent_clauses.append(clause_text)
                        current_tokens = [token.text] if token.text not in [',', ';', ':'] else [] #boundary token if not punctuation

            #sentence over, add remaining tokens of sentence as final clause
            if current_tokens:
                clause_text = ' '.join(current_tokens).strip()
                if clause_text:
                    sent_clauses.append(clause_text)
    
            #safeguard
            if not sent_clauses:
                sent_clauses.append(sentence.text.strip())
                
            clauses.extend(sent_clauses)
        
        return clauses

In [6]:
dataset = load_dataset("roneneldan/TinyStories", split="train[:5]")  # first 5 stories

separator = ClauseSeparator()
advanced_clauses = separator.clause_split(dataset["text"][0])
for i, clause in enumerate(advanced_clauses, 1):
    print(f"{i}. {clause}")

1. One day
2. a little girl named Lily found a needle in her room .
3. She knew it was difficult to play with it
4. because it was sharp .
5. Lily wanted to share the needle with her mom
6. so she could sew a button on her shirt .
7. Lily went to her mom
8. and said
9. " Mom
10. I found this needle .
11. Can you share it with me
12. and sew my shirt ? "
13. Her mom smiled
14. and said
15. " Yes
16. Lily
17. we can share the needle
18. and fix your shirt .
19. " 

 Together
20. they shared the needle
21. and sewed the button on Lily 's shirt .
22. It was not difficult for them
23. because they were sharing
24. and helping each other .
25. After they finished
26. Lily thanked her mom for sharing the needle
27. and fixing her shirt .
28. They both felt happy
29. because they had shared
30. and worked together .


#### Fix for weird new line tokens

vibe-coded fix

In [31]:
class ClauseSeparator:
    def __init__(self):
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

    def clause_split(self, text):
        if not self.nlp:
            return None

        # Split on two-or-more newlines to preserve paragraph boundaries
        paragraphs = re.split(r'\n{2,}', text)
        all_clauses = []

        for para in paragraphs:
            # collapse single newlines inside a paragraph into spaces (soft line breaks)
            para = re.sub(r'(?<!\n)\n(?!\n)', ' ', para).strip()
            if not para:
                continue

            doc = self.nlp(para)
            para_clauses = []

            for sent in doc.sents:
                sent_clauses = []
                current_tokens = []

                for token in sent:
                    current_tokens.append(token.text)

                    # treat punctuation and quotes as boundary chars for resetting
                    boundary_is_punct = token.text in [',', ';', ':', '"', '“', '”', "'"]

                    if (token.dep_ in ['cc', 'mark'] or
                        boundary_is_punct or
                        token.pos_ == 'SCONJ'):

                        # append clause formed by tokens up to (but not including) boundary token
                        if len(current_tokens) > 1:
                            clause_text = ' '.join(current_tokens[:-1]).strip()
                            if clause_text:
                                sent_clauses.append(clause_text)

                        # reset buffer: drop punctuation, keep conjunctions/markers as start of next clause
                        if boundary_is_punct:
                            current_tokens = []
                        else:
                            current_tokens = [token.text]

                # Add remaining tokens as final clause only if they have >1 token.
                # We'll handle single-token sentence fragments in a small postprocessing step below.
                if len(current_tokens) > 1:
                    clause_text = ' '.join(current_tokens).strip()
                    if clause_text:
                        sent_clauses.append(clause_text)

                # If no clause was produced for this sentence, fall back to the full sentence text
                if not sent_clauses:
                    sent_clauses.append(sent.text.strip())

                para_clauses.extend(sent_clauses)

                        # --- POST-PROCESSING: spaCy-aware merge of standalone single-token clauses forward ---
            merged = []
            i = 0
            while i < len(para_clauses):
                cur = para_clauses[i]
                is_single_token = (len(cur.split()) == 1)
                is_quoted = (cur.startswith('"') or cur.startswith("'") or cur.endswith('"') or cur.endswith("'"))

                if is_single_token and not is_quoted and i + 1 < len(para_clauses):
                    nxt = para_clauses[i + 1].lstrip()
                    # safe check: next clause must exist and start with a character
                    if nxt:
                        first_char_next = nxt[0]
                        # analyze the single token with spaCy to decide if it's a name/vocative/interjection
                        try:
                            tok = self.nlp(cur.strip())[0]
                            pos = tok.pos_
                            lower_form = tok.lower_
                        except Exception:
                            pos = None
                            lower_form = cur.strip().lower()

                        # treat explicit responses/interjections as KEEP (do not merge)
                        interjection_whitelist = {"yes", "no", "okay", "ok", "thanks", "thankyou"}
                        is_interjection = (pos == 'INTJ') or (lower_form in interjection_whitelist)

                        # treat proper nouns (names) and titles as vocatives: KEEP (do not merge)
                        is_proper_name = (pos == 'PROPN')

                        # DECISION: merge forward only if:
                        #  - current token is not a proper name (PROPN)
                        #  - current token is not an interjection/response
                        #  - and the next clause starts lowercase (indicates continuation)
                        if (not is_proper_name) and (not is_interjection) and first_char_next.islower():
                            merged.append(cur + ' ' + para_clauses[i + 1])
                            i += 2
                            continue

                # default: keep current clause as-is
                merged.append(cur)
                i += 1

            all_clauses.extend(merged)

        return all_clauses



In [61]:
separator = ClauseSeparator()
advanced_clauses = separator.clause_split(dataset["text"][0])
for i, clause in enumerate(advanced_clauses, 1):
    print(f"{i}. {clause}")


1. One day
2. a little girl named Lily found a needle in her room .
3. She knew it was difficult to play with it
4. because it was sharp .
5. Lily wanted to share the needle with her mom
6. so she could sew a button on her shirt .
7. Lily went to her mom
8. and said
9. Mom
10. I found this needle .
11. Can you share it with me
12. and sew my shirt ?
13. Her mom smiled
14. and said
15. Yes
16. Lily we can share the needle
17. and fix your shirt .
18. Together they shared the needle
19. and sewed the button on Lily 's shirt .
20. It was not difficult for them
21. because they were sharing
22. and helping each other .
23. After they finished
24. Lily thanked her mom for sharing the needle
25. and fixing her shirt .
26. They both felt happy
27. because they had shared
28. and worked together .


## Evaluation of clause separator

#### Parsing dataset for task

In [6]:
eval_set = load_dataset("troianea/CLAUSE-ATLAS")

In [7]:
df = eval_set["train"].to_pandas()
print(df.head())

                               book chapter_id  paragraph_id  clause_number  \
0  Alice's Adventures in Wonderland          1             1              0   
1  Alice's Adventures in Wonderland          1             1              1   
2  Alice's Adventures in Wonderland          1             1              2   
3  Alice's Adventures in Wonderland          1             2              3   
4  Alice's Adventures in Wonderland          1             2              4   

                                                text prompt_one prompt_two  \
0  Alice was beginning to get very tired of sitti...          C          C   
1  once or twice she had peeped into the book her...          S          C   
2  “and what is the use of a book,” thought Alice...          S          S   
3  So she was considering in her own mind (as wel...          S          S   
4  whether the pleasure of making a daisy-chain w...          S          S   

  prompt_three annotator_one annotator_two annotator_thr

In [9]:
df_filtered = df[['book','paragraph_id', 'clause_number','text']]

#lets use first book for now
first_book = df_filtered['book'].iat[0]

#make sure columns are int
df_filtered_book = df_filtered[df_filtered['book'] == first_book].copy()
df_filtered_book['paragraph_id_num'] = pd.to_numeric(df_filtered_book['paragraph_id'], errors='coerce')
df_filtered_book['clause_number_num'] = pd.to_numeric(df_filtered_book['clause_number'], errors='coerce')
#and sorted
df_book = df_filtered_book.sort_values(['paragraph_id_num', 'clause_number_num'])

#merge text by paragraph_id
merged = (
    df_book
    .groupby('paragraph_id', sort=True, as_index=False)
    .agg(paragraph_text=('text', lambda texts: " ".join(t.strip() for t in texts)))
)

#### Creating ML dataset - X, Y structure

In [18]:
#merge by paragraph id, put clauses into lists
paragraph_clauses = (
    df_book
    .groupby('paragraph_id', sort=True)['text']
    .apply(lambda texts: list(texts))
    .reset_index(name='clauses')
)

paragraph_clauses['paragraph_text'] = paragraph_clauses['clauses'].apply(lambda cl: " ".join(cl))

paragraph_clauses.head()

Unnamed: 0,paragraph_id,clauses,paragraph_text
0,1,[Alice was beginning to get very tired of sitt...,Alice was beginning to get very tired of sitti...
1,2,[So she was considering in her own mind (as we...,So she was considering in her own mind (as wel...
2,3,[There was nothing so very remarkable in that;...,There was nothing so very remarkable in that; ...
3,4,"[In another moment down went Alice after it,, ...","In another moment down went Alice after it, ne..."
4,5,[The rabbit-hole went straight on like a tunne...,The rabbit-hole went straight on like a tunnel...


In [51]:
class Claude_Advanced_ClauseSeparator:
    def __init__(self, size='small'):
        if size == 'small':
            self.nlp = spacy.load("en_core_web_sm")
        elif size == 'large':
            self.nlp = spacy.load("en_core_web_trf")
        else:
            print("Choose an appropriate spaCy model. Ex install: python -m spacy download en_core_web_sm")
            self.nlp = None
    
    def _find_matching_bracket(self, text, start_pos, open_char):
        """Find matching closing bracket/quote"""
        close_chars = {'(': ')', '"': '"', "'": "'", '[': ']', '{': '}'}
        close_char = close_chars.get(open_char, open_char)
        
        if open_char in '"\'':
            # For quotes, find the next occurrence
            next_pos = text.find(close_char, start_pos + 1)
            return next_pos if next_pos != -1 else len(text) - 1
        else:
            # For brackets, handle nesting
            count = 1
            for i in range(start_pos + 1, len(text)):
                if text[i] == open_char:
                    count += 1
                elif text[i] == close_char:
                    count -= 1
                    if count == 0:
                        return i
            return len(text) - 1
    
    def clause_split(self, text):
        """
        Split text into clauses using a simpler, more reliable approach.
        """
        if not self.nlp:
            return None
        
        doc = self.nlp(text)
        clauses = []
        
        for sentence in doc.sents:
            sentence_text = sentence.text.strip()
            if not sentence_text:
                continue
            
            # Find all potential split points
            split_points = []
            i = 0
            paren_depth = 0
            quote_depth = 0
            quote_char = None
            
            # First, find parenthetical expressions to treat as separate clauses
            parenthetical_ranges = []
            temp_i = 0
            while temp_i < len(sentence_text):
                if sentence_text[temp_i] == '(':
                    end_pos = self._find_matching_bracket(sentence_text, temp_i, '(')
                    parenthetical_ranges.append((temp_i, end_pos))
                    temp_i = end_pos + 1
                else:
                    temp_i += 1
            
            while i < len(sentence_text):
                char = sentence_text[i]
                
                # Track parentheses and quotes
                if char == '(' and quote_depth == 0:
                    paren_depth += 1
                elif char == ')' and quote_depth == 0:
                    paren_depth -= 1
                elif char in '"\'':
                    if quote_depth == 0:
                        quote_depth = 1
                        quote_char = char
                    elif char == quote_char:
                        quote_depth = 0
                        quote_char = None
                
                # Only split if we're not inside parentheses or quotes
                if paren_depth == 0 and quote_depth == 0:
                    # Check for clause-separating punctuation
                    if char in ',;:':
                        # Look at what comes after
                        next_part = sentence_text[i+1:].strip()
                        if next_part:
                            # Check if this is a genuine clause boundary
                            if char == ',' and self._is_clause_boundary_comma(sentence_text, i):
                                split_points.append(i + 1)  # Split after comma
                            elif char in ';:':
                                split_points.append(i + 1)  # Split after semicolon/colon
                
                i += 1
            
            # Add parenthetical split points
            for start, end in parenthetical_ranges:
                if start > 0:
                    split_points.append(start)  # Split before parenthetical
                split_points.append(end + 1)  # Split after parenthetical
            
            # Also check for "and" and "with" that start new clauses
            and_positions = []
            words = sentence_text.split()
            current_pos = 0
            for word in words:
                word_start = sentence_text.find(word, current_pos)
                if word.lower() in ['and', 'with'] and self._should_split_on_word(sentence_text, word_start, word.lower()):
                    # Check we're not inside parentheses
                    temp_paren = 0
                    temp_quote = 0
                    temp_quote_char = None
                    for j in range(word_start):
                        if sentence_text[j] == '(':
                            temp_paren += 1
                        elif sentence_text[j] == ')':
                            temp_paren -= 1
                        elif sentence_text[j] in '"\'':
                            if temp_quote == 0:
                                temp_quote = 1
                                temp_quote_char = sentence_text[j]
                            elif sentence_text[j] == temp_quote_char:
                                temp_quote = 0
                    
                    if temp_paren == 0 and temp_quote == 0:
                        and_positions.append(word_start)
                
                current_pos = word_start + len(word)
            
            split_points.extend(and_positions)
            split_points = sorted(set(split_points))
            
            # Split the sentence at the identified points
            if not split_points:
                clauses.append(sentence_text)
            else:
                start = 0
                for split_point in split_points:
                    clause = sentence_text[start:split_point].strip()
                    if clause:
                        clauses.append(clause)
                    start = split_point
                
                # Add the remaining part
                final_clause = sentence_text[start:].strip()
                if final_clause:
                    clauses.append(final_clause)
        
        return clauses
    
    def _is_clause_boundary_comma(self, text, comma_pos):
        """
        Determine if a comma represents a clause boundary.
        """
        # Get the part after the comma
        after_comma = text[comma_pos + 1:].strip()
        if not after_comma:
            return False
    
    def _should_split_on_word(self, text, word_pos, word):
        """Check if we should split on words like 'and' or 'with'"""
        after_word = text[word_pos + len(word):].strip()
        
        if word == 'and':
            # Split on "and" if it starts a new independent clause
            # Look for pattern: "and [noun/pronoun] [verb]"
            if after_word:
                try:
                    doc = self.nlp(after_word)
                    tokens = [t for t in doc if not t.is_space and t.pos_ != 'PUNCT']
                    if len(tokens) >= 2:
                        if (tokens[0].pos_ in ['NOUN', 'PRON', 'PROPN'] and 
                            tokens[1].pos_ in ['VERB', 'AUX']):
                            return True
                except:
                    pass
        elif word == 'with':
            # Split on "with" if it starts a prepositional phrase that functions as a new clause
            if 'words' in after_word[:20]:  # specific to this example
                return True
        
        return False

# Test the improved separator
separator = Claude_Advanced_ClauseSeparator()

test_text = '''There seemed to be no use in waiting by the little door, so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes: this time she found a little bottle on it, ("which certainly was not here before," said Alice,) and round the neck of the bottle was a paper label, with the words "DRINK ME," beautifully printed on it in large letters.'''

result = separator.clause_split(test_text)

print("Model Output:")
for i, clause in enumerate(result):
    print(f"{i+1}: '{clause}'")

print("\nGold Standard:")
gold = ['There seemed to be no use in waiting by the little door,', 'so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes:', 'this time she found a little bottle on it,', '("which certainly was not here before," said Alice,)', 'and round the neck of the bottle was a paper label,', 'with the words "DRINK ME," beautifully printed on it in large letters.']
for i, clause in enumerate(gold):
    print(f"{i+1}: '{clause}'")

Model Output:
1: 'There seemed to be no use in waiting by the little door, so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes:'
2: 'this time she found a little bottle on it,'
3: '("which certainly was not here before," said Alice,)'
4: 'and round the neck of the bottle was a paper label,'
5: 'with the words "DRINK ME," beautifully printed on it in large letters.'

Gold Standard:
1: 'There seemed to be no use in waiting by the little door,'
2: 'so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes:'
3: 'this time she found a little bottle on it,'
4: '("which certainly was not here before," said Alice,)'
5: 'and round the neck of the bottle was a paper label,'
6: 'with the words "DRINK ME," beautifully printed on it in large letters.'


In [50]:
class Claude_Basic_ClauseSeparator:
    def __init__(self, size='small'):
        if size == 'small':
            self.nlp = spacy.load("en_core_web_sm")
        elif size == 'large':
            self.nlp = spacy.load("en_core_web_trf")
        else:
            print("Choose an appropriate spaCy model. Ex install: python -m spacy download en_core_web_sm")
            self.nlp = None
    
    def clause_split(self, text):
        """
        Find clause boundaries based on conjunctions and punctuation.    
        """
        if not self.nlp:
            return None
        
        doc = self.nlp(text)
        clauses = []
        
        for sentence in doc.sents:
            sent_clauses = []
            current_tokens = []
            
            # Track parentheses to avoid splitting inside them
            paren_depth = 0
            
            for i, token in enumerate(sentence):
                current_tokens.append(token.text)
                
                # Track parentheses
                if token.text == '(':
                    paren_depth += 1
                elif token.text == ')':
                    paren_depth -= 1
                
                should_split = False
                
                # Only split if we're not inside parentheses
                if paren_depth == 0:
                    # Split on semicolon and colon always
                    if token.text in [';', ':']:
                        should_split = True
                    
                    # Split on comma only in specific cases
                    elif token.text == ',':
                        # Look at next non-space token
                        next_token = None
                        for j in range(i + 1, len(sentence)):
                            if sentence[j].text.strip():
                                next_token = sentence[j]
                                break
                        
                        if next_token:
                            # Split if comma is followed by these clause starters
                            if next_token.text.lower() in ['so', 'this']:
                                should_split = True
                    
                    # Split on 'and' when it starts new independent clause
                    elif token.text.lower() == 'and':
                        # Look ahead for subject-verb pattern
                        if i + 2 < len(sentence):
                            next1 = sentence[i + 1]
                            next2 = sentence[i + 2]
                            if (next1.pos_ in ['NOUN', 'PRON', 'PROPN'] and 
                                next2.pos_ in ['VERB', 'AUX']):
                                should_split = True
                
                # Make the split
                if should_split and len(current_tokens) > 1:
                    if token.text in [',', ';', ':']:
                        # Include punctuation with previous clause
                        clause_text = ' '.join(current_tokens).strip()
                        if clause_text:
                            sent_clauses.append(clause_text)
                        current_tokens = []
                    else:
                        # Don't include conjunction with previous clause
                        clause_text = ' '.join(current_tokens[:-1]).strip()
                        if clause_text:
                            sent_clauses.append(clause_text)
                        current_tokens = [token.text]

            # Add remaining tokens as final clause
            if current_tokens:
                clause_text = ' '.join(current_tokens).strip()
                if clause_text:
                    sent_clauses.append(clause_text)

            # Handle parenthetical expressions as separate clauses
            final_clauses = []
            for clause in sent_clauses:
                if '(' in clause and ')' in clause:
                    # Split out parenthetical
                    parts = []
                    current = ""
                    paren_start = -1
                    
                    for i, char in enumerate(clause):
                        if char == '(' and paren_start == -1:
                            if current.strip():
                                parts.append(current.strip())
                            paren_start = i
                            current = char
                        elif char == ')' and paren_start != -1:
                            current += char
                            parts.append(current.strip())
                            current = ""
                            paren_start = -1
                        else:
                            current += char
                    
                    if current.strip():
                        parts.append(current.strip())
                    
                    final_clauses.extend(parts)
                else:
                    final_clauses.append(clause)

            clauses.extend(final_clauses)
        
        return clauses


# Test
separator = Claude_Basic_ClauseSeparator()

test_text = '''There seemed to be no use in waiting by the little door, so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes: this time she found a little bottle on it, ("which certainly was not here before," said Alice,) and round the neck of the bottle was a paper label, with the words "DRINK ME," beautifully printed on it in large letters.'''

result = separator.clause_split(test_text)

print("Model Output:")
for i, clause in enumerate(result):
    print(f"{i+1}: '{clause}'")

print("\nGold Standard:")
gold = ['There seemed to be no use in waiting by the little door,', 'so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes:', 'this time she found a little bottle on it,', '("which certainly was not here before," said Alice,)', 'and round the neck of the bottle was a paper label,', 'with the words "DRINK ME," beautifully printed on it in large letters.']
for i, clause in enumerate(gold):
    print(f"{i+1}: '{clause}'")

Model Output:
1: 'There seemed to be no use in waiting by the little door ,'
2: 'so she went back to the table , half hoping she might find another key on it , or at any rate a book of rules for shutting people up like telescopes :'
3: 'this time she found a little bottle on it ,'
4: '( " which certainly was not here before , " said Alice , )'
5: 'and round the neck of the bottle was a paper label , with the words " DRINK ME , " beautifully printed on it in large letters .'

Gold Standard:
1: 'There seemed to be no use in waiting by the little door,'
2: 'so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes:'
3: 'this time she found a little bottle on it,'
4: '("which certainly was not here before," said Alice,)'
5: 'and round the neck of the bottle was a paper label,'
6: 'with the words "DRINK ME," beautifully printed on it in large letters.'


In [53]:
# Test
separator = ClauseSeparator()

test_text = '''There seemed to be no use in waiting by the little door, so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes: this time she found a little bottle on it, ("which certainly was not here before," said Alice,) and round the neck of the bottle was a paper label, with the words "DRINK ME," beautifully printed on it in large letters.'''

result = separator.clause_split(test_text)

print("Model Output:")
for i, clause in enumerate(result):
    print(f"{i+1}: '{clause}'")

print("\nGold Standard:")
gold = ['There seemed to be no use in waiting by the little door,', 'so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes:', 'this time she found a little bottle on it,', '("which certainly was not here before," said Alice,)', 'and round the neck of the bottle was a paper label,', 'with the words "DRINK ME," beautifully printed on it in large letters.']
for i, clause in enumerate(gold):
    print(f"{i+1}: '{clause}'")

Model Output:
1: 'There seemed to be no use in waiting by the little door'
2: 'so she went back to the table'
3: 'half hoping she might find another key on it'
4: 'or at any rate a book of rules for shutting people up like telescopes'
5: 'this time she found a little bottle on it'
6: '( " which certainly was not here before'
7: '" said Alice'
8: ')'
9: 'and round the neck of the bottle was a paper label'
10: 'with the words " DRINK ME'
11: '" beautifully printed on it in large letters .'

Gold Standard:
1: 'There seemed to be no use in waiting by the little door,'
2: 'so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes:'
3: 'this time she found a little bottle on it,'
4: '("which certainly was not here before," said Alice,)'
5: 'and round the neck of the bottle was a paper label,'
6: 'with the words "DRINK ME," beautifully printed on it in large letters.'


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# function to compute clause-level metrics for one paragraph
def clause_metrics(pred_clauses, gold_clauses):
    # treat each predicted clause as "predicted"; match against gold clauses
    # we allow each gold clause to match at most one predicted clause
    gold_remaining = gold_clauses.copy()
    tp = 0
    for p in pred_clauses:
        if p in gold_remaining:
            tp += 1
            gold_remaining.remove(p)
    fp = len(pred_clauses) - tp
    fn = len(gold_clauses) - tp
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

separator = ClauseSeparator()

num_paragraphs = 20  # evaluate first 5 paragraphs
results = []

for i in range(num_paragraphs):
    gold_clauses = paragraph_clauses['clauses'][i]
    paragraph_text = paragraph_clauses['paragraph_text'][i]
    
    pred_clauses = separator.clause_split(paragraph_text)
    
    precision, recall, f1 = clause_metrics(pred_clauses, gold_clauses)
    
    results.append({
        'paragraph_id': paragraph_clauses['paragraph_id'][i],
        'pred_clauses': pred_clauses,
        'gold_clauses': gold_clauses,
        'precision': precision,
        'recall': recall,
        'f1': f1
    })

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,paragraph_id,pred_clauses,gold_clauses,precision,recall,f1
0,1,[Alice was beginning to get very tired of sitt...,[Alice was beginning to get very tired of sitt...,0.0,0.0,0.0
1,2,"[So she was considering in her own mind, ( as ...",[So she was considering in her own mind (as we...,0.0,0.0,0.0
2,3,[There was nothing so very remarkable in that ...,[There was nothing so very remarkable in that;...,0.0,0.0,0.0
3,4,"[In another moment down went Alice after it , ...","[In another moment down went Alice after it,, ...",0.0,0.0,0.0
4,5,[The rabbit - hole went straight on like a tun...,[The rabbit-hole went straight on like a tunne...,0.0,0.0,0.0
5,6,"[Either the well was very deep , or she fell v...","[Either the well was very deep,, or she fell v...",0.0,0.0,0.0
6,7,"[“ Well ! ” thought Alice to herself , “ after...","[“Well!” thought Alice to herself,, “after suc...",0.0,0.0,0.0
7,8,"[Down , down , down ., Would the fall never co...","[Down, down, down., Would the fall never come ...",0.0,0.0,0.0
8,9,"[Presently she began again ., “ I wonder if I ...","[Presently she began again., “I wonder if I sh...",0.0,0.0,0.0
9,10,"[Down , down , down ., There was nothing else ...","[Down, down, down., There was nothing else to ...",0.0,0.0,0.0


Using my clause separator

In [29]:
results_df.iloc[14]['pred_clauses']

['There seemed to be no use in waiting by the little door',
 'so she went back to the table',
 'half hoping she might find another key on it',
 'or at any rate a book of rules for shutting people up like telescopes',
 'this time she found a little bottle on it',
 '( “ which certainly was not here before',
 '” said Alice',
 ')',
 'and round the neck of the bottle was a paper label',
 'with the words “ DRINK ME',
 '” beautifully printed on it in large letters .']

In [30]:
results_df.iloc[14]['gold_clauses']

['There seemed to be no use in waiting by the little door,',
 'so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes:',
 'this time she found a little bottle on it,',
 '(“which certainly was not here before,” said Alice,)',
 'and round the neck of the bottle was a paper label,',
 'with the words “DRINK ME,” beautifully printed on it in large letters.']