# Testing explored segmentation methods on synthetic data
- Adaptive threshold segmentation
- TextTiling
- Sequential thresholding
- Adaptive sequential thresholding
- BATS

This subject is still not finished - the idea for better data generation and re-done testing is still to de implemented.

In [1]:
import pandas as pd
import numpy as np
import re
import ast
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_recall_fscore_support
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def adaptive_threshold_segmentation(embeddings, method="percentile", min_size=2, std_factor=1.0, percentile=30):
    """Segment text adaptively based on cosine similarity of consecutive sentence embeddings."""
    num_sentences = embeddings.shape[0]
    sims = []

    for i in range(1, num_sentences):
        sim = cosine_similarity(
            embeddings[i-1].reshape(1,-1),
            embeddings[i].reshape(1,-1)
        )[0][0]
        sims.append(sim)
    sims = np.array(sims)

    if method == "std":
        threshold = sims.mean() - std_factor * sims.std()
    elif method == "percentile":
        threshold = np.percentile(sims, percentile)
    else:
        raise ValueError("method must be 'std' or 'percentile'")

    pred_segments = [0]
    current_segment = 0
    last_boundary = 0

    for i in range(1, num_sentences):
        sim = sims[i-1]
        if sim < threshold and (i - last_boundary) >= min_size:
            current_segment += 1
            last_boundary = i
        pred_segments.append(current_segment)

    return pred_segments, threshold


# === 2. SIMPLE HELPERS ===
def split_sentences(text):
    """Basic rule-based sentence splitter."""
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s for s in sentences if s]

def boundaries_from_segments(segment_indices, num_sentences):
    """Convert list of segment start indices into 0/1 boundary labels."""
    boundaries = [0] * (num_sentences - 1)
    for idx in segment_indices:
        if 0 < idx < num_sentences:
            boundaries[idx - 1] = 1
    return boundaries

def boundary_f1(pred, gold):
    p, r, f, _ = precision_recall_fscore_support(gold, pred, average="binary", zero_division=0)
    return p, r, f

def pk_metric(pred, gold, k=None):
    n = len(pred) + 1
    if k is None:
        k = int(round(n / 2.0))
    errors = 0
    total = 0
    for i in range(0, n - k):
        a = sum(gold[i:i+k-1]) > 0
        b = sum(pred[i:i+k-1]) > 0
        errors += (a != b)
        total += 1
    return errors / total

def windowdiff(pred, gold, k=None):
    n = len(pred) + 1
    if k is None:
        k = int(round(n / 2.0))
    errors = 0
    total = 0
    for i in range(0, n - k):
        a = sum(gold[i:i+k-1])
        b = sum(pred[i:i+k-1])
        errors += (a != b)
        total += 1
    return errors / total


In [4]:
df = pd.read_csv("5.cleaned_synthetic_data.csv")
df["ground_truth_segments"] = df["ground_truth_segments"].apply(lambda x: ast.literal_eval(x))

In [None]:
model = SentenceTransformer("all-mpnet-base-v2")

def evaluate_dataset(df, percentile=30, min_size=2, method="percentile"):
    metrics = []
    for _, row in df.iterrows():
        sentences = split_sentences(row["synthetic_text"])
        if len(sentences) < 3:
            continue

        embeddings = model.encode(sentences, batch_size=8, show_progress_bar=False)
        pred_segments, threshold = adaptive_threshold_segmentation(
            embeddings, method=method, min_size=min_size, percentile=percentile
        )

        pred_boundaries = [1 if pred_segments[i] != pred_segments[i-1] else 0 for i in range(1, len(pred_segments))]
        gt_indices = [idx for idx in row["ground_truth_segments"] if idx != 1]
        gold_boundaries = boundaries_from_segments(gt_indices, len(sentences))

        pred_boundaries = [1 if pred_segments[i] != pred_segments[i-1] else 0 for i in range(1, len(pred_segments))]
        
        p, r, f = boundary_f1(pred_boundaries, gold_boundaries)
        pk = pk_metric(pred_boundaries, gold_boundaries)
        wd = windowdiff(pred_boundaries, gold_boundaries)
        metrics.append((p, r, f, pk, wd))

    arr = np.array(metrics)
    return {
        "Precision": arr[:,0].mean(),
        "Recall": arr[:,1].mean(),
        "F1": arr[:,2].mean(),
        "Pk": arr[:,3].mean(),
        "WindowDiff": arr[:,4].mean(),
    }

In [None]:
for percentile in [20, 30, 40, 50, 60]:
    results = evaluate_dataset(df, percentile=percentile)
   

    print(f"\n=== Percentile = {percentile} ===")
    print(f"DEV   -> F1: {results['F1']:.3f},  Pk: {results['Pk']:.3f},  WD: {results['WindowDiff']:.3f}")


=== Percentile = 20 ===
DEV   -> F1: 0.277,  Pk: 0.246,  WD: 0.250

=== Percentile = 30 ===
DEV   -> F1: 0.275,  Pk: 0.249,  WD: 0.251

=== Percentile = 40 ===
DEV   -> F1: 0.311,  Pk: 0.252,  WD: 0.255

=== Percentile = 50 ===
DEV   -> F1: 0.311,  Pk: 0.252,  WD: 0.255

=== Percentile = 60 ===
DEV   -> F1: 0.251,  Pk: 0.341,  WD: 0.345


_____

In [None]:
def embedding_text_tiling(embeddings, window_size=2, threshold=0.8):
    """
    Simple TextTiling variant using embedding similarity between adjacent windows.
    Returns boundary indices (sentence indices where a topic boundary is predicted).
    """
    num_sentences = embeddings.shape[0]
    boundaries = []

    for i in range(num_sentences - window_size):
        block1 = embeddings[i:i+window_size].mean(axis=0)
        block2 = embeddings[i+1:i+1+window_size].mean(axis=0)
        sim = cosine_similarity(block1.reshape(1, -1), block2.reshape(1, -1))[0][0]
        if sim < threshold:
            boundaries.append(i + window_size - 1)
    return boundaries


model = SentenceTransformer("all-mpnet-base-v2")

def evaluate_texttiling(df, window_sizes=[2,3], thresholds=[0.75,0.9,0.95,0.99]):
    """
    Evaluate embedding-based TextTiling on a dataset of journaling entries.
    Returns average Precision, Recall, F1, Pk, and WindowDiff for each parameter combo.
    """
    results = []

    for window_size in window_sizes:
        for threshold in thresholds:
            metrics = []

            for _, row in df.iterrows():
                sentences = split_sentences(row["synthetic_text"])
                if len(sentences) < window_size + 1:
                    continue

                embeddings = model.encode(sentences, batch_size=8, show_progress_bar=False)

                pred_boundary_indices = embedding_text_tiling(
                    embeddings,
                    window_size=window_size,
                    threshold=threshold
                )

                pred_boundaries = [1 if i in pred_boundary_indices else 0 for i in range(len(sentences) - 1)]

                gt_indices = [idx for idx in row["ground_truth_segments"] if idx != 1]
                gold_boundaries = boundaries_from_segments(gt_indices, len(sentences))

                p, r, f = boundary_f1(pred_boundaries, gold_boundaries)
                pk = pk_metric(pred_boundaries, gold_boundaries)
                wd = windowdiff(pred_boundaries, gold_boundaries)

                metrics.append((p, r, f, pk, wd))

            if not metrics:
                continue

            arr = np.array(metrics)
            results.append({
                "window_size": window_size,
                "threshold": threshold,
                "Precision": arr[:,0].mean(),
                "Recall": arr[:,1].mean(),
                "F1": arr[:,2].mean(),
                "Pk": arr[:,3].mean(),
                "WindowDiff": arr[:,4].mean(),
            })

    return results


results = evaluate_texttiling(df)

for res in results:
    print(f"\n=== window_size={res['window_size']} | threshold={res['threshold']:.2f} ===")
    print(f"F1: {res['F1']:.3f},  Pk: {res['Pk']:.3f},  WD: {res['WindowDiff']:.3f}")


=== window_size=2 | threshold=0.75 ===
F1: 0.424,  Pk: 0.252,  WD: 0.256

=== window_size=2 | threshold=0.90 ===
F1: 0.541,  Pk: 0.360,  WD: 0.364

=== window_size=2 | threshold=0.95 ===
F1: 0.543,  Pk: 0.360,  WD: 0.364

=== window_size=2 | threshold=0.99 ===
F1: 0.543,  Pk: 0.360,  WD: 0.364

=== window_size=3 | threshold=0.75 ===
F1: 0.004,  Pk: 0.149,  WD: 0.149

=== window_size=3 | threshold=0.90 ===
F1: 0.675,  Pk: 0.045,  WD: 0.045

=== window_size=3 | threshold=0.95 ===
F1: 0.705,  Pk: 0.040,  WD: 0.040

=== window_size=3 | threshold=0.99 ===
F1: 0.705,  Pk: 0.040,  WD: 0.040


In [None]:
def sequential_thresholding(embeddings, threshold=0.75):
    num_sentences = embeddings.shape[0]
    pred_segments = [0]  
    current_segment = 0

    for i in range(1, num_sentences):
        sim = cosine_similarity(
            embeddings[i-1].reshape(1,-1), embeddings[i].reshape(1,-1)
        )[0][0]
        if sim < threshold:
            current_segment += 1
        pred_segments.append(current_segment)

    return pred_segments

model = SentenceTransformer("all-mpnet-base-v2")

def evaluate_sequential_thresholding(df, thresholds=[0.2,0.5,0.8]):
    """
    Evaluate the sequential_thresholding segmentation method over a threshold grid.
    Returns metrics for each threshold.
    """
    results = []

    for threshold in thresholds:
        metrics = []

        for _, row in df.iterrows():
            sentences = split_sentences(row["synthetic_text"])
            if len(sentences) < 2:
                continue  

            embeddings = model.encode(sentences, batch_size=8, show_progress_bar=False)

            pred_segments = sequential_thresholding(embeddings, threshold=threshold)

            pred_boundaries = [1 if pred_segments[i] != pred_segments[i-1] else 0 for i in range(1, len(pred_segments))]

            gt_indices = [idx for idx in row["ground_truth_segments"] if idx != 1]
            gold_boundaries = boundaries_from_segments(gt_indices, len(sentences))

            p, r, f = boundary_f1(pred_boundaries, gold_boundaries)
            pk = pk_metric(pred_boundaries, gold_boundaries)
            wd = windowdiff(pred_boundaries, gold_boundaries)

            metrics.append((p, r, f, pk, wd))

        if not metrics:
            continue

        arr = np.array(metrics)
        results.append({
            "threshold": threshold,
            "Precision": arr[:,0].mean(),
            "Recall": arr[:,1].mean(),
            "F1": arr[:,2].mean(),
            "Pk": arr[:,3].mean(),
            "WindowDiff": arr[:,4].mean(),
        })

    return results

results = evaluate_sequential_thresholding(df)

for res in results:
    print(f"\n=== threshold={res['threshold']:.2f} ===")
    print(f"F1: {res['F1']:.3f},  Pk: {res['Pk']:.3f},  WD: {res['WindowDiff']:.3f}")


=== threshold=0.20 ===
F1: 0.128,  Pk: 0.248,  WD: 0.251

=== threshold=0.50 ===
F1: 0.430,  Pk: 0.802,  WD: 0.805

=== threshold=0.80 ===
F1: 0.416,  Pk: 0.869,  WD: 0.873


In [None]:
def adaptive_sequential_thresholding(embeddings, threshold):
    pred_segments = [0]
    current_segment = 0
    segment_vectors = [embeddings[0]]  

    for i in range(1, len(embeddings)):
        centroid = np.mean(segment_vectors, axis=0)
        sim = cosine_similarity(embeddings[i].reshape(1, -1), centroid.reshape(1, -1))[0][0]
        if sim < threshold:
            current_segment += 1
            segment_vectors = [embeddings[i]] 
        else:
            segment_vectors.append(embeddings[i])
        pred_segments.append(current_segment)

    return pred_segments

model = SentenceTransformer("all-mpnet-base-v2")

def evaluate_adaptive_sequential_thresholding(df, thresholds=[0.4,0.6,0.8]):
    """
    Evaluate adaptive_sequential_thresholding over a range of thresholds.
    Returns average metrics for each threshold.
    """
    results = []

    for threshold in thresholds:
        metrics = []

        for _, row in df.iterrows():
            sentences = split_sentences(row["synthetic_text"])
            if len(sentences) < 2:
                continue 

            embeddings = model.encode(sentences, batch_size=8, show_progress_bar=False)

            pred_segments = adaptive_sequential_thresholding(embeddings, threshold=threshold)

            pred_boundaries = [1 if pred_segments[i] != pred_segments[i-1] else 0 for i in range(1, len(pred_segments))]

            gt_indices = [idx for idx in row["ground_truth_segments"] if idx != 1]
            gold_boundaries = boundaries_from_segments(gt_indices, len(sentences))

            p, r, f = boundary_f1(pred_boundaries, gold_boundaries)
            pk = pk_metric(pred_boundaries, gold_boundaries)
            wd = windowdiff(pred_boundaries, gold_boundaries)

            metrics.append((p, r, f, pk, wd))

        if not metrics:
            continue

        arr = np.array(metrics)
        results.append({
            "threshold": threshold,
            "Precision": arr[:,0].mean(),
            "Recall": arr[:,1].mean(),
            "F1": arr[:,2].mean(),
            "Pk": arr[:,3].mean(),
            "WindowDiff": arr[:,4].mean(),
        })

    return results

results = evaluate_adaptive_sequential_thresholding(df)

for res in results:
    print(f"\n=== threshold={res['threshold']:.2f} ===")
    print(f"F1: {res['F1']:.3f},  Pk: {res['Pk']:.3f},  WD: {res['WindowDiff']:.3f}")


=== threshold=0.40 ===
F1: 0.409,  Pk: 0.746,  WD: 0.749

=== threshold=0.60 ===
F1: 0.420,  Pk: 0.851,  WD: 0.854

=== threshold=0.80 ===
F1: 0.416,  Pk: 0.869,  WD: 0.873


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralCoclustering

def bats_segmentation(sentences, n_topics=2, noise_thresh=0.0005, boost_factor=2.0):
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    M = vectorizer.fit_transform(sentences).toarray()  
    words = vectorizer.get_feature_names_out()

    col_vars = np.var(M, axis=0)
    keep_mask = col_vars > noise_thresh
    M = M[:, keep_mask]
    col_vars = col_vars[keep_mask]
    words = [w for w, k in zip(words, keep_mask) if k]

    if len(words) == 0:
        raise ValueError("No informative words left after pruning. Try lowering noise_thresh.")

    var_norm = col_vars / (col_vars.max() + 1e-9)
    boost = 1 + (boost_factor - 1) * var_norm
    M = M * boost[np.newaxis, :]

    model = SpectralCoclustering(n_clusters=n_topics, random_state=42)
    model.fit(M)
    sent_labels = model.row_labels_

    boundaries = []
    for i in range(1, len(sent_labels)):
        if sent_labels[i] != sent_labels[i-1]:
            boundaries.append(i-1)

    segments = [0] * len(sentences)
    current = 0
    for i in range(len(sentences)):
        segments[i] = current
        if i in boundaries:
            current += 1

    return segments, boundaries, sent_labels

def evaluate_bats_grid(df, n_topics_list=[2,3], boost_factors=[1.0,1.5,2.0,2.5]):
    results = []

    for n_topics in n_topics_list:
        for boost in boost_factors:
            metrics = []

            for _, row in df.iterrows():
                sentences = split_sentences(row["synthetic_text"])
                if len(sentences) < 2:
                    continue

                try:
                    pred_segments, boundaries, _ = bats_segmentation(
                        sentences, n_topics=n_topics, boost_factor=boost
                    )
                except ValueError:
                    continue

                # Convert predicted segments to 0/1 boundaries
                pred_boundaries = [1 if pred_segments[i] != pred_segments[i-1] else 0 for i in range(1, len(pred_segments))]

                # Ground truth
                gt_indices = [idx for idx in row["ground_truth_segments"] if idx != 1]
                gold_boundaries = boundaries_from_segments(gt_indices, len(sentences))

                # Compute metrics
                p, r, f = boundary_f1(pred_boundaries, gold_boundaries)
                pk = pk_metric(pred_boundaries, gold_boundaries)
                wd = windowdiff(pred_boundaries, gold_boundaries)

                metrics.append((p, r, f, pk, wd))

            if not metrics:
                continue

            arr = np.array(metrics)
            results.append({
                "n_topics": n_topics,
                "boost_factor": boost,
                "Precision": arr[:,0].mean(),
                "Recall": arr[:,1].mean(),
                "F1": arr[:,2].mean(),
                "Pk": arr[:,3].mean(),
                "WindowDiff": arr[:,4].mean(),
            })

    return results


grid_results = evaluate_bats_grid(df)

for res in grid_results:
    print(f"\nn_topics={res['n_topics']}, boost_factor={res['boost_factor']}")
    print(f"F1: {res['F1']:.3f},  Pk: {res['Pk']:.3f},  WD: {res['WindowDiff']:.3f}")


  row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
  an = row_diag[:, np.newaxis] * X * col_diag
  row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
  an = row_diag[:, np.newaxis] * X * col_diag
  row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
  an = row_diag[:, np.newaxis] * X * col_diag
  row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
  an = row_diag[:, np.newaxis] * X * col_diag
  row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
  an = row_diag[:, np.newaxis] * X * col_diag
  row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
  an = row_diag[:, np.newaxis] * X * col_diag
  row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
  an = row_diag[:, np.newaxis] * X * col_diag
  row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
  an = row_diag[:, np.newaxis] * X * col_diag



n_topics=2, boost_factor=1.0
F1: 0.238,  Pk: 0.598,  WD: 0.599

n_topics=2, boost_factor=1.5
F1: 0.305,  Pk: 0.481,  WD: 0.484

n_topics=2, boost_factor=2.0
F1: 0.251,  Pk: 0.588,  WD: 0.592

n_topics=2, boost_factor=2.5
F1: 0.220,  Pk: 0.596,  WD: 0.599

n_topics=3, boost_factor=1.0
F1: 0.350,  Pk: 0.791,  WD: 0.795

n_topics=3, boost_factor=1.5
F1: 0.391,  Pk: 0.746,  WD: 0.750

n_topics=3, boost_factor=2.0
F1: 0.402,  Pk: 0.738,  WD: 0.742

n_topics=3, boost_factor=2.5
F1: 0.352,  Pk: 0.809,  WD: 0.812
