# **2. Sentence Splitting and Alignment**


In [1]:
import yaml

# Load the pipeline configuration from config.yaml (safe_load: plain data only).
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('config.yaml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

## **Sentence Splitting**

Reads the cleaned book summaries, segments every description version into sentences using SaT (Segment Any Text), and saves the result as a Parquet file with one sentence per row.

Output:
- **isbn**: book identifier
- **version**: 1 | 2 | 3 (corresponding to different book description versions)
- **sent_id**: sentence index within each version
- **sentence**: the segmented sentence text

In [4]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import torch

def split_sentences_sat(
    input_path: str,
    output_path: str,
    model_name: str = "sat-3l-sm",
    use_gpu: bool = True,
    batch_size: int = 32,
):
    """Split every book-description version into sentences and save them.

    Reads the parquet file at ``input_path`` (columns ``isbn``, ``version1``,
    ``version2`` and ``version3`` are used), segments each non-empty
    description with a SaT model and writes one row per sentence to
    ``output_path`` with the columns ``isbn``, ``version``, ``sent_id`` and
    ``sentence``. A summary of the run is printed.

    ``sent_id`` is a contiguous 0-based index over the sentences that are
    actually kept for one (isbn, version) pair; segments that are empty after
    stripping are dropped before numbering.

    Args:
        input_path: Parquet file with the cleaned descriptions.
        output_path: Destination parquet file for the sentence table.
        model_name: Name passed to ``wtpsplit.SaT``.
        use_gpu: Move the model to CUDA (in half precision) when CUDA is
            available; otherwise the model stays on the CPU.
        batch_size: Number of texts handed to the model per call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    version_columns = ("version1", "version2", "version3")
    output_columns = ["isbn", "version", "sent_id", "sentence"]

    # Load cleaned data
    df = pd.read_parquet(input_path)

    # Load SaT model (imported here so the dependency is only needed when the
    # function is actually run)
    from wtpsplit import SaT
    print(f"Loading SaT model: {model_name}")
    sat = SaT(model_name)

    # Optional GPU acceleration; decided once so the final report matches
    # what was actually used
    gpu_active = use_gpu and torch.cuda.is_available()
    if gpu_active:
        print("Using GPU acceleration")
        sat.half().to("cuda")
    else:
        print("Using CPU processing")

    print("SaT model loaded successfully")

    def prepare_texts_for_processing(df):
        """Return a list of ``(text, metadata)`` tuples, one per non-empty version.

        ``metadata`` holds the ``isbn`` and the 1-based ``version`` number.
        Cells that are missing, not strings, or blank after stripping are
        skipped.
        """
        texts_with_metadata = []
        columns = [df[col] for col in version_columns]

        for isbn, *versions in zip(df["isbn"], *columns):
            for ver_idx, text in enumerate(versions, start=1):
                # None / NaN / any other non-text value carries no sentences
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if not text:
                    continue
                texts_with_metadata.append((text, {"isbn": isbn, "version": ver_idx}))

        return texts_with_metadata

    def process_batch(texts_batch):
        """Segment one batch of ``(text, metadata)`` tuples into sentence records."""
        records = []

        # SaT receives only the raw strings; metadata is re-attached by position
        text_strings = [text for text, _ in texts_batch]
        sentence_lists = list(sat.split(text_strings))

        for (_, metadata), sentences in zip(texts_batch, sentence_lists):
            # Drop empty segments first so that sent_id has no gaps
            kept = [s for s in (sentence.strip() for sentence in sentences) if s]
            for sent_id, sentence in enumerate(kept):
                records.append({
                    "isbn": metadata["isbn"],
                    "version": metadata["version"],
                    "sent_id": sent_id,
                    "sentence": sentence,
                })

        return records

    # Process all texts in batches
    print("Preparing texts for processing...")
    texts_with_metadata = prepare_texts_for_processing(df)

    print(f"Processing {len(texts_with_metadata)} texts in batches of {batch_size}")

    all_records = []
    processing_stats = {
        'total_texts': len(texts_with_metadata),
        'successful_batches': 0,
        'failed_batches': 0,
        'failed_texts': 0,
        'total_sentences': 0
    }

    for i in tqdm(range(0, len(texts_with_metadata), batch_size), desc="Processing batches"):
        batch = texts_with_metadata[i:i + batch_size]

        try:
            batch_records = process_batch(batch)
            all_records.extend(batch_records)
            processing_stats['successful_batches'] += 1
            processing_stats['total_sentences'] += len(batch_records)

        except Exception as e:
            # Best effort: a failing batch is reported and skipped so that one
            # bad batch does not abort the whole run
            print(f"Failed to process batch {i//batch_size + 1}: {e}")
            processing_stats['failed_batches'] += 1
            processing_stats['failed_texts'] += len(batch)

    # Save results and report. The columns are given explicitly so that an
    # empty result still produces a file with the expected schema.
    sent_df = pd.DataFrame(all_records, columns=output_columns)
    sent_df.to_parquet(output_path, index=False)

    print(f"Saved {len(sent_df):,} sentences → {output_path}")
    print(f"Processing Statistics:")
    print(f"Model used: {model_name}")
    print(f"GPU acceleration: {'Yes' if gpu_active else 'No'}")
    print(f"Total texts processed: {processing_stats['total_texts']}")
    print(f"Successful batches: {processing_stats['successful_batches']}")
    print(f"Failed batches: {processing_stats['failed_batches']}")
    if processing_stats['failed_batches']:
        print(f"Warning: {processing_stats['failed_texts']} texts were skipped because their batch failed")
    print(f"Total sentences generated: {processing_stats['total_sentences']}")
    print(f"Average sentences per text: {processing_stats['total_sentences'] / max(processing_stats['total_texts'], 1):.1f}")


# Run the sentence splitting on the cleaned file from the configuration,
# using the sat-12l-sm model instead of the function's default.
print("\n" + "=" * 50, "Starting main sentence splitting process...", sep="\n")

split_sentences_sat(
    input_path=config["cleaned_file"],
    output_path=config["sentences_file"],
    model_name="sat-12l-sm",
    use_gpu=True,
    batch_size=16,
)


Starting main sentence splitting process...
Loading SaT model: sat-12l-sm
Using GPU acceleration
SaT model loaded successfully
Preparing texts for processing...
Processing 70876 texts in batches of 16


Processing batches: 100%|██████████| 4430/4430 [01:41<00:00, 43.55it/s]


Saved 585,824 sentences → data/sentences.parquet
Processing Statistics:
Model used: sat-12l-sm
GPU acceleration: Yes
Total texts processed: 70876
Successful batches: 4430
Failed batches: 0
Total sentences generated: 585824
Average sentences per text: 8.3


## **Sentence Alignment**

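This section aligns sentences between different versions of Dutch book summaries using a multi-stage approach:

- **Stage 1**: Exact string matches
- **Stage 2**: Greedy matching on a combined score of semantic similarity (multilingual sentence transformer) and character similarity (Levenshtein ratio), for pairs scoring above a threshold
- **Stage 3**: Position-based heuristic that accepts lower-scoring pairs when both sentences sit at a similar relative position
- **Stage 4**: Remaining unmatched sentences are recorded as deletions (old version only) or insertions (new version only)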

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from Levenshtein import ratio as levenshtein_ratio
from typing import List, Tuple, Optional
import torch
from tqdm import tqdm

class DutchSentenceAligner:
    """Align the sentences of two versions of a book description.

    Alignment runs in four stages: exact string matches, greedy matching on a
    combined semantic/character similarity score, a position-based heuristic
    with a lower score threshold, and finally the labelling of everything
    still unmatched as deletions or insertions.
    """

    def __init__(self, model_name='paraphrase-multilingual-mpnet-base-v2'):
        """Load the sentence-embedding model and set the score weights.

        Args:
            model_name: Model name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # Weights of the two components of the combined similarity score
        self.semantic_weight = 0.7
        self.character_weight = 0.3

    def compute_similarity_matrix(self, sentences1: List[str], sentences2: List[str]) -> np.ndarray:
        """Return the combined similarity of every sentence pair.

        Entry ``[i, j]`` is ``semantic_weight * cosine(embedding_i, embedding_j)
        + character_weight * levenshtein_ratio(sentences1[i], sentences2[j])``.

        Args:
            sentences1: Sentences indexing the rows.
            sentences2: Sentences indexing the columns.

        Returns:
            Array of shape ``(len(sentences1), len(sentences2))``.
        """
        # Nothing to encode: return an empty matrix of the right shape
        if not sentences1 or not sentences2:
            return np.zeros((len(sentences1), len(sentences2)))

        # Compute sentence embeddings
        embeddings1 = self.model.encode(sentences1, convert_to_tensor=True, show_progress_bar=False)
        embeddings2 = self.model.encode(sentences2, convert_to_tensor=True, show_progress_bar=False)

        # Compute semantic similarity (broadcast to all pairs)
        semantic_sim = torch.cosine_similarity(
            embeddings1.unsqueeze(1),
            embeddings2.unsqueeze(0),
            dim=2
        ).cpu().numpy()

        # Compute character-based similarity
        char_sim = np.zeros((len(sentences1), len(sentences2)))
        for i, s1 in enumerate(sentences1):
            for j, s2 in enumerate(sentences2):
                char_sim[i, j] = levenshtein_ratio(s1, s2)

        # Combine similarities
        combined_sim = (self.semantic_weight * semantic_sim +
                        self.character_weight * char_sim)

        return combined_sim

    def align_sentences(self, sentences1: List[str], sentences2: List[str],
                        threshold: float = 0.7,
                        position_threshold: float = 0.5,
                        max_position_diff: float = 0.2) -> List[dict]:
        """Align two sentence lists and return one record per alignment.

        Args:
            sentences1: Sentences of the old version, in order.
            sentences2: Sentences of the new version, in order.
            threshold: Minimum combined similarity for a Stage 2 match.
            position_threshold: Similarity a Stage 3 match must exceed.
            max_position_diff: Largest allowed difference between the relative
                positions (index / list length) of a Stage 3 pair.

        Returns:
            A list of dicts with the keys ``old_sent_id``, ``new_sent_id``,
            ``old_sentence``, ``new_sentence``, ``similarity`` and
            ``alignment_type`` (``'exact'``, ``'similar'``,
            ``'position_based'``, ``'deletion'`` or ``'insertion'``). Ids are
            positions in the input lists; the missing side of a deletion or
            insertion is ``None`` and its similarity is ``0.0``.
        """
        alignments = []
        used_indices1 = set()
        used_indices2 = set()

        # Stage 1: Exact matches (each new sentence is used at most once)
        for i, s1 in enumerate(sentences1):
            for j, s2 in enumerate(sentences2):
                if j not in used_indices2 and s1 == s2:
                    alignments.append({
                        'old_sent_id': i,
                        'new_sent_id': j,
                        'old_sentence': s1,
                        'new_sentence': s2,
                        'similarity': 1.0,
                        'alignment_type': 'exact'
                    })
                    used_indices1.add(i)
                    used_indices2.add(j)
                    break

        # Sentences still unmatched after Stage 1, with their original indices
        remaining1 = [(i, s) for i, s in enumerate(sentences1) if i not in used_indices1]
        remaining2 = [(j, s) for j, s in enumerate(sentences2) if j not in used_indices2]

        if remaining1 and remaining2:
            # Scored once; Stage 2 and Stage 3 both read from this matrix
            similarity_matrix = self.compute_similarity_matrix(
                [s for _, s in remaining1],
                [s for _, s in remaining2]
            )

            # Stage 2: High similarity matches (greedy approach).
            # Masking happens on a copy so the raw scores stay available for
            # Stage 3. At most min(rows, cols) pairs can exist, which bounds
            # the loop regardless of the threshold value.
            masked = np.array(similarity_matrix, dtype=float, copy=True)
            for _ in range(min(len(remaining1), len(remaining2))):
                i_idx, j_idx = np.unravel_index(masked.argmax(), masked.shape)
                max_sim = masked[i_idx, j_idx]
                if max_sim < threshold:
                    break

                i_original = remaining1[i_idx][0]
                j_original = remaining2[j_idx][0]

                alignments.append({
                    'old_sent_id': i_original,
                    'new_sent_id': j_original,
                    'old_sentence': remaining1[i_idx][1],
                    'new_sentence': remaining2[j_idx][1],
                    'similarity': float(max_sim),
                    'alignment_type': 'similar'
                })

                # Mark row and column as used
                masked[i_idx, :] = -np.inf
                masked[:, j_idx] = -np.inf
                used_indices1.add(i_original)
                used_indices2.add(j_original)

            # Stage 3: Order-based heuristic for remaining sentences.
            # Sentences at similar relative positions may be related even with
            # lower similarity. Scores are looked up in the matrix computed
            # above instead of re-encoding every candidate pair.
            for row, (i, s1) in enumerate(remaining1):
                if i in used_indices1:
                    continue

                relative_pos1 = i / len(sentences1)
                best_match = None
                best_sim = position_threshold

                for col, (j, s2) in enumerate(remaining2):
                    if j in used_indices2:
                        continue
                    relative_pos2 = j / len(sentences2)

                    # Only consider if positions are relatively close
                    if abs(relative_pos1 - relative_pos2) > max_position_diff:
                        continue

                    sim = similarity_matrix[row, col]
                    if sim > best_sim:
                        best_match = (j, s2, sim)
                        best_sim = sim

                if best_match:
                    j, s2, sim = best_match
                    alignments.append({
                        'old_sent_id': i,
                        'new_sent_id': j,
                        'old_sentence': s1,
                        'new_sentence': s2,
                        'similarity': float(sim),
                        'alignment_type': 'position_based'
                    })
                    used_indices1.add(i)
                    used_indices2.add(j)

        # Stage 4: Handle deletions and insertions
        for i, s in enumerate(sentences1):
            if i not in used_indices1:
                alignments.append({
                    'old_sent_id': i,
                    'new_sent_id': None,
                    'old_sentence': s,
                    'new_sentence': None,
                    'similarity': 0.0,
                    'alignment_type': 'deletion'
                })

        for j, s in enumerate(sentences2):
            if j not in used_indices2:
                alignments.append({
                    'old_sent_id': None,
                    'new_sent_id': j,
                    'old_sentence': None,
                    'new_sentence': s,
                    'similarity': 0.0,
                    'alignment_type': 'insertion'
                })

        return alignments

    def align_versions(self, df: pd.DataFrame, isbn: int, from_version: int,
                       to_version: int) -> List[dict]:
        """Align two versions of one book.

        Args:
            df: Sentence table with the columns ``isbn``, ``version``,
                ``sent_id`` and ``sentence``.
            isbn: Identifier of the book to align.
            from_version: Version number treated as the old text.
            to_version: Version number treated as the new text.

        Returns:
            The records from ``align_sentences`` extended with ``isbn``,
            ``from_version`` and ``to_version``; an empty list when either
            version has no sentences.
        """
        # Get sentences for each version, ordered by their sentence index
        v1_sentences = df[(df['isbn'] == isbn) &
                          (df['version'] == from_version)].sort_values('sent_id')
        v2_sentences = df[(df['isbn'] == isbn) &
                          (df['version'] == to_version)].sort_values('sent_id')

        # Extract sentence texts
        sentences1 = v1_sentences['sentence'].tolist()
        sentences2 = v2_sentences['sentence'].tolist()

        # If either version has no sentences, nothing to align
        if not sentences1 or not sentences2:
            return []

        # Perform alignment
        alignments = self.align_sentences(sentences1, sentences2)

        # Add metadata
        for alignment in alignments:
            alignment['isbn'] = isbn
            alignment['from_version'] = from_version
            alignment['to_version'] = to_version

        return alignments

    def align_all_books(self, df: pd.DataFrame) -> pd.DataFrame:
        """Align consecutive versions of every book in ``df``.

        Per book: versions 1 -> 2 when both exist, plus 2 -> 3 when version 3
        exists as well; versions 1 -> 3 when version 2 is missing.

        Args:
            df: Sentence table with the columns ``isbn``, ``version``,
                ``sent_id`` and ``sentence``.

        Returns:
            One row per alignment record with the columns ``isbn``,
            ``from_version``, ``to_version``, ``old_sent_id``, ``new_sent_id``,
            ``old_sentence``, ``new_sentence``, ``similarity`` and
            ``alignment_type``. The columns are present even when no
            alignment was produced.
        """
        all_alignments = []

        # Group once so each book's rows are selected a single time instead of
        # rescanning the whole frame per ISBN; sort=False keeps the order in
        # which the ISBNs first appear.
        books = df.groupby('isbn', sort=False)

        for isbn, book_df in tqdm(books, total=books.ngroups, desc="Processing books"):
            versions = set(book_df['version'])

            # v1 and v2 present
            if {1, 2}.issubset(versions):
                all_alignments += self.align_versions(book_df, isbn, 1, 2)

                # v3 as well? then also 2 → 3
                if 3 in versions:
                    all_alignments += self.align_versions(book_df, isbn, 2, 3)

            # v2 missing but v1 & v3 present
            elif {1, 3}.issubset(versions):
                all_alignments += self.align_versions(book_df, isbn, 1, 3)

            # NOTE(review): books that only have versions 2 and 3 are not
            # aligned at all — confirm this is intended.

        # Fixed column order; passing it to the constructor also keeps the
        # schema when there are no alignments
        column_order = ['isbn', 'from_version', 'to_version', 'old_sent_id',
                        'new_sent_id', 'old_sentence', 'new_sentence',
                        'similarity', 'alignment_type']
        alignment_df = pd.DataFrame(all_alignments, columns=column_order)

        return alignment_df


# Read the sentence table from the path configured under 'sentences_file'
df = pd.read_parquet(config['sentences_file'])

n_sentences = len(df)
n_books = df['isbn'].nunique()
print(f"Loaded {n_sentences} sentences from {n_books} books.")

# Build the aligner and align every book in the table
aligner = DutchSentenceAligner()
alignments = aligner.align_all_books(df)

# Write the alignment table to the path configured under 'aligned_file'
alignments.to_parquet(config['aligned_file'], index=False)

# Summary statistics per alignment type
type_counts = alignments['alignment_type'].value_counts()
mean_similarity_by_type = alignments.groupby('alignment_type')['similarity'].mean()

print(f"Total alignments: {len(alignments)}")
print("Alignment type distribution:")
print(type_counts)
print("\nAverage similarity by type:")
print(mean_similarity_by_type)