In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from nltk.metrics.segmentation import pk, windowdiff
import re

def split_into_sentences(text):
    """Split ``text`` into sentences.

    The text is stripped, then cut at every run of whitespace that directly
    follows a ``.``, ``!`` or ``?``. The punctuation stays attached to the
    sentence it ends.

    Args:
        text: The string to split.

    Returns:
        A list of the non-empty pieces, in their original order.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    # An empty (or whitespace-only) input yields a single empty piece; drop it.
    return list(filter(None, pieces))

def segment_starts_to_boundaries(starts, n_sentences):
    """Turn 1-indexed segment start positions into a 0/1 boundary vector.

    Args:
        starts: Iterable of 1-indexed sentence positions at which a segment
            begins.
        n_sentences: Length of the vector to produce.

    Returns:
        A list of ``n_sentences`` ints: 1 at every position named in
        ``starts`` (after shifting to 0-indexing), 0 elsewhere. Starts that
        fall outside ``1..n_sentences`` contribute nothing.
    """
    # Shift to 0-indexing once so the membership test below is a set lookup.
    zero_based = set()
    for start in starts:
        zero_based.add(start - 1)

    boundaries = []
    for position in range(n_sentences):
        boundaries.append(int(position in zero_based))
    return boundaries

def adaptive_threshold_segmentation(
    embeddings,
    percentile=30,
    min_size=2
):
    """Predict segment boundaries from drops in adjacent-sentence similarity.

    The cosine similarity between each sentence embedding and the one before
    it is computed once. The ``percentile``-th percentile of those
    similarities becomes the threshold, and a boundary is placed at sentence
    ``i`` when its similarity to sentence ``i - 1`` is strictly below the
    threshold and at least ``min_size`` sentences have passed since the
    previous boundary.

    Args:
        embeddings: Sequence/array with one embedding per sentence. Each
            embedding is flattened to a single row before comparison.
        percentile: Percentile (0-100) of the adjacent similarities used as
            the boundary threshold.
        min_size: Minimum number of sentences between two boundaries.

    Returns:
        A list of ints (1 = a segment starts here, 0 = it does not) with one
        entry per embedding. The first entry is always 1. An empty input
        yields ``[]`` and a single embedding yields ``[1]``.
    """
    vectors = np.asarray(embeddings)
    n_sentences = len(vectors)

    # With fewer than two sentences there are no adjacent pairs, so there is
    # nothing to take a percentile of; answer directly instead of handing
    # np.percentile an empty array.
    if n_sentences == 0:
        return []
    if n_sentences == 1:
        return [1]

    vectors = vectors.reshape(n_sentences, -1)

    # Normalise every row once, then take the row-wise dot product of each
    # sentence with its predecessor. Zero-norm rows are divided by 1 so they
    # stay all-zero and score a similarity of 0 instead of NaN.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = vectors / norms
    sims = np.sum(unit[:-1] * unit[1:], axis=1)

    threshold = np.percentile(sims, percentile)

    boundaries = [1]  # the first sentence always opens a segment
    last_boundary = 0

    # sims[i - 1] compares sentence i with sentence i - 1.
    for i, sim in enumerate(sims, start=1):
        if sim < threshold and (i - last_boundary) >= min_size:
            boundaries.append(1)
            last_boundary = i
        else:
            boundaries.append(0)

    return boundaries

# Evaluate the percentile-threshold segmenter on the synthetic corpus and
# report mean F1 / Pk / WindowDiff over all documents with >= 2 sentences.
import ast  # cell-local: only needed to parse the ground-truth column safely

df = pd.read_csv("5.cleaned_synthetic_data.csv")

model = SentenceTransformer("all-mpnet-base-v2")

# Per-document scores, averaged after the loop.
all_f1 = []
all_pk = []
all_wd = []

for _, row in df.iterrows():
    text = row["synthetic_text"]
    # The column is parsed as a Python literal. literal_eval is used instead
    # of eval() so that content coming from the CSV can never execute code.
    gt_starts = ast.literal_eval(row["ground_truth_segments"])

    sentences = split_into_sentences(text)
    n = len(sentences)
    if n < 2:
        # A single sentence cannot be segmented; leave it out of the averages.
        continue

    embeddings = model.encode(sentences)

    gold_boundaries = segment_starts_to_boundaries(gt_starts, n)
    pred_boundaries = adaptive_threshold_segmentation(embeddings)

    gold_boundaries_int = [int(b) for b in gold_boundaries]
    pred_boundaries_int = [int(b) for b in pred_boundaries]

    f1 = f1_score(gold_boundaries_int, pred_boundaries_int)

    # pk / windowdiff are given the boundary vectors as '0'/'1' strings.
    # Build them from the int-normalised lists so every entry is exactly
    # one character.
    gold_str = ''.join(str(b) for b in gold_boundaries_int)
    pred_str = ''.join(str(b) for b in pred_boundaries_int)

    # Window size: mean gold segment length, at least 2, and clamped to
    # n - 1 so the window is always shorter than the document.
    # NOTE(review): nltk's default k is *half* the mean reference segment
    # length; this uses the full mean - confirm that is intended.
    n_segments = max(len(gt_starts), 1)
    avg_seg_len = max(2, int(n / n_segments))
    avg_seg_len = min(avg_seg_len, n - 1)

    pk_score = pk(gold_str, pred_str, k=avg_seg_len)
    wd_score = windowdiff(gold_str, pred_str, k=avg_seg_len)

    all_f1.append(f1)
    all_pk.append(pk_score)
    all_wd.append(wd_score)

print("\n--- Segmentation Evaluation ---")
print(f"Mean F1: {np.mean(all_f1):.4f}")
print(f"Mean Pk: {np.mean(all_pk):.4f}")
print(f"Mean WindowDiff: {np.mean(all_wd):.4f}")
print(f"Documents evaluated: {len(all_f1)}")


--- Segmentation Evaluation ---
Mean F1: 0.6791
Mean Pk: 0.3531
Mean WindowDiff: 0.4671
Documents evaluated: 278


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from nltk.metrics.segmentation import pk, windowdiff
import re

def split_into_sentences(text):
    """Break ``text`` into sentences at whitespace that follows ., ! or ?.

    Surrounding whitespace is stripped first; the terminating punctuation is
    kept with its sentence, and empty pieces are discarded.

    Args:
        text: The string to split.

    Returns:
        List of non-empty sentence strings in document order.
    """
    boundary = re.compile(r'(?<=[.!?])\s+')
    kept = []
    for chunk in boundary.split(text.strip()):
        if chunk:
            kept.append(chunk)
    return kept

def adaptive_threshold_segmentation(
    embeddings,
    percentile=30,
    min_size=2
):
    """Predict segment boundaries from drops in adjacent-sentence similarity.

    The cosine similarity between each sentence embedding and the one before
    it is computed once. The ``percentile``-th percentile of those
    similarities becomes the threshold, and a boundary is placed at sentence
    ``i`` when its similarity to sentence ``i - 1`` is strictly below the
    threshold and at least ``min_size`` sentences have passed since the
    previous boundary.

    Args:
        embeddings: Sequence/array with one embedding per sentence. Each
            embedding is flattened to a single row before comparison.
        percentile: Percentile (0-100) of the adjacent similarities used as
            the boundary threshold.
        min_size: Minimum number of sentences between two boundaries.

    Returns:
        A list of ints (1 = a segment starts here, 0 = it does not) with one
        entry per embedding. The first entry is always 1. An empty input
        yields ``[]`` and a single embedding yields ``[1]``.
    """
    vectors = np.asarray(embeddings)
    n_sentences = len(vectors)

    # With fewer than two sentences there are no adjacent pairs, so there is
    # nothing to take a percentile of; answer directly instead of handing
    # np.percentile an empty array.
    if n_sentences == 0:
        return []
    if n_sentences == 1:
        return [1]

    vectors = vectors.reshape(n_sentences, -1)

    # Normalise every row once, then take the row-wise dot product of each
    # sentence with its predecessor. Zero-norm rows are divided by 1 so they
    # stay all-zero and score a similarity of 0 instead of NaN.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = vectors / norms
    sims = np.sum(unit[:-1] * unit[1:], axis=1)

    threshold = np.percentile(sims, percentile)

    boundaries = [1]  # the first sentence always opens a segment
    last_boundary = 0

    # sims[i - 1] compares sentence i with sentence i - 1.
    for i, sim in enumerate(sims, start=1):
        if sim < threshold and (i - last_boundary) >= min_size:
            boundaries.append(1)
            last_boundary = i
        else:
            boundaries.append(0)

    return boundaries

# Evaluate the percentile-threshold segmenter on the journal/diary corpus and
# report mean F1 / Pk / WindowDiff. Rows whose gold vector does not line up
# with the sentence split are reported and skipped.
import ast  # cell-local: only needed to parse the gold-segments column safely

df = pd.read_csv("journal_diary_with_tags_fixed_segments.csv")

model = SentenceTransformer("all-mpnet-base-v2")

# Per-document scores, averaged after the loop.
all_f1 = []
all_pk = []
all_wd = []

for _, row in df.iterrows():
    text = row["body"]
    segments_str = row["segments"]

    sentences = split_into_sentences(text)
    n = len(sentences)
    if n < 2:
        # A single sentence cannot be segmented; leave it out of the averages.
        continue

    # The gold column is either a bracketed list literal or a bare
    # comma-separated list of integers.
    segments_str = segments_str.strip()
    if segments_str.startswith('['):
        # literal_eval instead of eval(): content coming from the CSV is
        # parsed as a literal and can never execute code.
        gold_boundaries = ast.literal_eval(segments_str)
    else:
        gold_boundaries = [int(x.strip()) for x in segments_str.split(',')]

    if len(gold_boundaries) != n:
        print(f"Skipping row: gold boundaries ({len(gold_boundaries)}) != sentences ({n})")
        continue

    embeddings = model.encode(sentences)
    pred_boundaries = adaptive_threshold_segmentation(embeddings)

    gold_boundaries_int = [int(b) for b in gold_boundaries]
    pred_boundaries_int = [int(b) for b in pred_boundaries]

    f1 = f1_score(gold_boundaries_int, pred_boundaries_int)

    # pk / windowdiff are given the boundary vectors as '0'/'1' strings.
    # Build them from the int-normalised lists so a parsed entry such as
    # 1.0 or True still becomes exactly one character.
    gold_str = ''.join(str(b) for b in gold_boundaries_int)
    pred_str = ''.join(str(b) for b in pred_boundaries_int)

    # Window size: mean gold segment length, at least 2, and clamped to
    # n - 1 so the window is always shorter than the document.
    # NOTE(review): nltk's default k is *half* the mean reference segment
    # length; this uses the full mean - confirm that is intended.
    n_segments = max(sum(gold_boundaries_int), 1)
    avg_seg_len = max(2, int(n / n_segments))
    avg_seg_len = min(avg_seg_len, n - 1)

    pk_score = pk(gold_str, pred_str, k=avg_seg_len)
    wd_score = windowdiff(gold_str, pred_str, k=avg_seg_len)

    all_f1.append(f1)
    all_pk.append(pk_score)
    all_wd.append(wd_score)

print("\n--- Segmentation Evaluation ---")
print(f"Mean F1: {np.mean(all_f1):.4f}")
print(f"Mean Pk: {np.mean(all_pk):.4f}")
print(f"Mean WindowDiff: {np.mean(all_wd):.4f}")
print(f"Documents evaluated: {len(all_f1)}")


--- Segmentation Evaluation ---
Mean F1: 0.5102
Mean Pk: 0.4238
Mean WindowDiff: 0.6311
Documents evaluated: 432
