In [5]:
import re
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import hdbscan
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [45]:
df = pd.read_csv("journal_diary_with_tags_fixed_segments.csv")
assert "body" in df.columns and "segments" in df.columns
df["segments"] = df["segments"].apply(lambda x: [int(v) for v in str(x).split(",")])


In [None]:
def split_sentences(entry):
    raw = re.split(r'(?<=[.!?])\s+', entry.strip())
    sents = [s.strip() for s in raw if s.strip()]
    return sents

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
df["sentences"] = df["body"].apply(split_sentences)

In [None]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
for idx, row in test_df.iterrows():
    sentences = row["sentences"]
    gold = row["segments"]

    if len(sentences) != len(gold):
        print(f"[SKIP] Entry {idx}: {len(sentences)} sentences but {len(gold)} labels")
        continue

In [49]:

train_sentences = [s for entry in train_df["sentences"] for s in entry]
train_embeddings = model.encode(train_sentences, show_progress_bar=True)

Batches: 100%|██████████| 132/132 [38:35<00:00, 17.54s/it]   


In [50]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=3,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
     prediction_data=True  
)
clusterer.fit(train_embeddings)

0,1,2
,min_cluster_size,3
,min_samples,5
,cluster_selection_epsilon,0.0
,max_cluster_size,0
,metric,'euclidean'
,alpha,1.0
,p,
,algorithm,'best'
,leaf_size,40
,memory,Memory(location=None)


In [51]:
def segment_by_topic(sentences, clusters):
    segments = []
    current_cluster = clusters[0]
    current_segment = [sentences[0]]

    for i in range(1, len(sentences)):
        if clusters[i] == current_cluster:
            current_segment.append(sentences[i])
        else:
            segments.append((current_cluster, current_segment))
            current_cluster = clusters[i]
            current_segment = [sentences[i]]

    segments.append((current_cluster, current_segment))
    return segments

In [52]:
def clusters_to_boundaries(cluster_labels):
    boundaries = [1] 
    for i in range(1, len(cluster_labels)):
        if cluster_labels[i] != cluster_labels[i-1]:
            boundaries.append(1)
        else:
            boundaries.append(0)
    return boundaries

In [None]:
def safe_window_size(ref):
    n = len(ref)
    true_segments = sum(ref)
    
    if n < 2:
        return 1
    
    if true_segments <= 1:
        return max(1, n // 2)

    avg_seg_len = n / true_segments
    k = int(avg_seg_len)

    k = max(1, min(k, n - 1))
    return k


def pk(ref, hyp):
    n = len(ref)
    k = safe_window_size(ref)

    total = n - k
    if total <= 0:
        return 0.0  

    errors = 0
    for i in range(total):
        ref_same = (ref[i] == ref[i+k])
        hyp_same = (hyp[i] == hyp[i+k])
        if ref_same != hyp_same:
            errors += 1

    return errors / total


def windowdiff(ref, hyp):
    n = len(ref)
    k = safe_window_size(ref)

    total = n - k
    if total <= 0:
        return 0.0

    errors = 0
    for i in range(total):
        ref_count = sum(ref[i:i+k])
        hyp_count = sum(hyp[i:i+k])
        if ref_count != hyp_count:
            errors += 1

    return errors / total


In [None]:
from hdbscan import approximate_predict

results = []

for idx, row in test_df.iterrows():

    sentences = row["sentences"]
    gold = row["segments"]

    if len(sentences) != len(gold):
        print(f"[SKIP] Entry {idx}: {len(sentences)} sentences but {len(gold)} labels")
        continue

    test_emb = model.encode(sentences)

    clusters, strengths = approximate_predict(clusterer, test_emb)

    if len(clusters) != len(sentences):
        print(f"[SKIP] Entry {idx}: predicted {len(clusters)} clusters but {len(sentences)} sentences")
        continue

    pred = clusters_to_boundaries(clusters)

    if len(pred) != len(gold):
        print(f"[SKIP] Entry {idx}: pred {len(pred)} vs gold {len(gold)}")
        continue


    f1 = f1_score(gold, pred, average="binary")
    pk_score = pk(gold, pred)
    wd_score = windowdiff(gold, pred)

    results.append({
        "entry_id": idx,
        "f1": f1,
        "pk": pk_score,
        "windowdiff": wd_score
    })


In [55]:
eval_df = pd.DataFrame(results)
eval_df

print("\n=== Average Scores ===")
print("F1:", eval_df["f1"].mean())
print("Pk:", eval_df["pk"].mean())
print("WindowDiff:", eval_df["windowdiff"].mean())



=== Average Scores ===
F1: 0.48851746427020304
Pk: 0.4777206525422305
WindowDiff: 0.6376138949878423


In [None]:
def embedding_text_tiling(sentences, embeddings, window_size=2, threshold=0.7):
  
    num_sentences = embeddings.shape[0]
    boundaries = []

    for i in range(num_sentences - window_size):
        block1 = embeddings[i:i+window_size].mean(axis=0)
        block2 = embeddings[i+1:i+1+window_size].mean(axis=0)
        sim = cosine_similarity(block1.reshape(1,-1), block2.reshape(1,-1))[0][0]
        if sim < threshold:
            boundaries.append(i + window_size - 1)

    segments = []
    start = 0
    for b in boundaries:
        segments.append(sentences[start:b+1])
        start = b+1
    segments.append(sentences[start:])  
    return segments, boundaries  


In [57]:
def boundaries_to_binary(boundaries, num_sentences):
    labels = [0] * num_sentences
    labels[0] = 1
    for b in boundaries:
        if b+1 < num_sentences:
            labels[b+1] = 1
    return labels


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

results = []

for idx, row in test_df.iterrows():
    
    sentences = row["sentences"]
    gold = row["segments"]


    if len(sentences) != len(gold):
        print(f"[SKIP] Entry {idx}: {len(sentences)} sentences but {len(gold)} labels")
        continue

    embeddings = model.encode(sentences)

    segments, boundaries = embedding_text_tiling(
        sentences,
        embeddings,
        window_size=2,
        threshold=0.7
    )

    pred = boundaries_to_binary(boundaries, len(sentences))


    if len(pred) != len(gold):
        print(f"[SKIP] Entry {idx}: pred length {len(pred)} vs gold length {len(gold)}")
        continue
    
    f1 = f1_score(gold, pred, average="binary")
    pk_score = pk(gold, pred)
    wd_score = windowdiff(gold, pred)

    results.append({
        "entry_id": idx,
        "f1": f1,
        "pk": pk_score,
        "windowdiff": wd_score
    })


In [59]:
eval_df = pd.DataFrame(results)
print(eval_df)

print("\n=== Averages ===")
print("F1:", eval_df["f1"].mean())
print("Pk:", eval_df["pk"].mean())
print("WindowDiff:", eval_df["windowdiff"].mean())


     entry_id        f1        pk  windowdiff
0         280  0.666667  0.500000    0.000000
1          78  0.307692  0.416667    1.000000
2         113  0.500000  0.250000    0.750000
3         253  0.363636  0.454545    0.727273
4         324  0.125000  0.285714    1.000000
..        ...       ...       ...         ...
126       290  0.222222  0.500000    0.666667
127       395  0.666667  0.250000    0.000000
128       108  0.285714  0.250000    1.000000
129       353  0.400000  0.800000    1.000000
130       406  0.727273  0.500000    0.333333

[131 rows x 4 columns]

=== Averages ===
F1: 0.4946929370138549
Pk: 0.4513549989226098
WindowDiff: 0.5881061786612243


In [None]:
def adaptive_threshold_segmentation(
    sentences,
    embeddings,
    method="percentile",
    min_size=2,
    std_factor=1.0,
    percentile=30
):
    num_sentences = embeddings.shape[0]
    sims = []

    for i in range(1, num_sentences):
        sim = cosine_similarity(
            embeddings[i-1].reshape(1,-1),
            embeddings[i].reshape(1,-1)
        )[0][0]
        sims.append(sim)

    sims = np.array(sims)

    if method == "std":
        threshold = sims.mean() - std_factor * sims.std()
    elif method == "percentile":
        threshold = np.percentile(sims, percentile)
    else:
        raise ValueError("method must be 'std' or 'percentile'")

    segments = []
    boundaries = []   
    current_segment = [sentences[0]]
    last_boundary = 0

    for i in range(1, num_sentences):
        sim = cosine_similarity(
            embeddings[i-1].reshape(1,-1),
            embeddings[i].reshape(1,-1)
        )[0][0]

        if sim < threshold and (i - last_boundary) >= min_size:
            segments.append(current_segment)
            boundaries.append(i - 1)  
            current_segment = [sentences[i]]
            last_boundary = i
        else:
            current_segment.append(sentences[i])

    segments.append(current_segment)

    return segments, boundaries, threshold


In [61]:
def boundaries_to_binary(boundaries, num_sentences):
    labels = [0] * num_sentences
    labels[0] = 1
    for b in boundaries:
        if b + 1 < num_sentences:
            labels[b + 1] = 1
    return labels


In [None]:
for idx, row in test_df.iterrows():

    sentences = row["sentences"]
    gold = row["segments"]

    
    if len(sentences) != len(gold):
        print(f"[SKIP] Entry {idx}: {len(sentences)} sentences but {len(gold)} labels")
        continue

    embeddings = model.encode(sentences)

   
    segments, boundaries, threshold = adaptive_threshold_segmentation(
        sentences,
        embeddings,
        method="std",
        min_size=2
    )

    pred = boundaries_to_binary(boundaries, len(sentences))

    if len(pred) != len(gold):
        print(f"[SKIP] Entry {idx}: pred length {len(pred)} vs gold {len(gold)}")
        continue

    
    f1 = f1_score(gold, pred, average="binary")
    pk_score = pk(gold, pred)
    wd_score = windowdiff(gold, pred)

    results.append({
        "entry_id": idx,
        "threshold": threshold,
        "f1": f1,
        "pk": pk_score,
        "windowdiff": wd_score
    })


  threshold = sims.mean() - std_factor * sims.std()
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [63]:
eval_df = pd.DataFrame(results)
print(eval_df)

print("\n=== Averages ===")
print("F1:", eval_df["f1"].mean())
print("Pk:", eval_df["pk"].mean())
print("WindowDiff:", eval_df["windowdiff"].mean())


     entry_id        f1        pk  windowdiff  threshold
0         280  0.666667  0.500000    0.000000        NaN
1          78  0.307692  0.416667    1.000000        NaN
2         113  0.500000  0.250000    0.750000        NaN
3         253  0.363636  0.454545    0.727273        NaN
4         324  0.125000  0.285714    1.000000        NaN
..        ...       ...       ...         ...        ...
257       290  0.400000  0.166667    1.000000   0.137281
258       395  0.666667  0.250000    0.000000   0.186970
259       108  0.666667  0.250000    0.750000   0.156616
260       353  0.666667  0.400000    0.800000   0.214291
261       406  0.500000  0.500000    0.500000   0.129297

[262 rows x 5 columns]

=== Averages ===
F1: 0.5267694628154179
Pk: 0.4086744268352946
WindowDiff: 0.533017794250857


In [64]:
def run_hdbscan_segmentation(sentences, embeddings):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        metric='euclidean',
        cluster_selection_method='eom',
        cluster_selection_epsilon=0.5
    )

    clusters = clusterer.fit_predict(embeddings)
    return clusters


In [None]:
def segment_by_topic(sentences, clusters):
    segments = []
    boundaries = []

    current_cluster = clusters[0]
    current_segment = [sentences[0]]

    for i in range(1, len(sentences)):
        if clusters[i] == current_cluster:
            current_segment.append(sentences[i])
        else:
            segments.append((current_cluster, current_segment))
            boundaries.append(i - 1)    
            current_cluster = clusters[i]
            current_segment = [sentences[i]]

    segments.append((current_cluster, current_segment))
    return segments, boundaries


In [66]:
def merge_small_segments(segments):
    merged = []
    for i, (cluster_id, seg) in enumerate(segments):
        if len(seg) == 1:
            if i == 0:
                next_cluster, next_seg = segments[i+1]
                merged_seg = seg + next_seg
                merged.append((next_cluster, merged_seg))
                segments[i+1] = (next_cluster, merged_seg)
            else:
                prev_cluster, prev_seg = merged[-1]
                prev_seg.extend(seg)
                merged[-1] = (prev_cluster, prev_seg)
        else:
            merged.append((cluster_id, seg))
    return merged


In [67]:
def boundaries_to_binary(boundaries, num_sentences):
    labels = [0] * num_sentences
    labels[0] = 1
    for b in boundaries:
        if 0 <= b+1 < num_sentences:
            labels[b+1] = 1
    return labels


In [68]:
import warnings
warnings.filterwarnings("ignore", message=".*force_all_finite.*", category=FutureWarning)


In [None]:
for idx, row in test_df.iterrows():

    sentences = row["sentences"]
    gold = row["segments"]

    if len(sentences) != len(gold):
        print(f"[SKIP] Entry {idx}: {len(sentences)} sentences but {len(gold)} labels")
        continue

    embeddings = model.encode(sentences)

    clusters = run_hdbscan_segmentation(sentences, embeddings)

    segments_raw, boundaries = segment_by_topic(sentences, clusters)

    segments_merged = merge_small_segments(segments_raw)

    pred = boundaries_to_binary(boundaries, len(sentences))

    if len(pred) != len(gold):
        print(f"[SKIP] Entry {idx}: pred length {len(pred)} vs gold length {len(gold)}")
        continue

    f1 = f1_score(gold, pred, average="binary")
    pk_score = pk(gold, pred)
    wd_score = windowdiff(gold, pred)

    results.append({
        "entry_id": idx,
        "f1": f1,
        "pk": pk_score,
        "windowdiff": wd_score
    })


ValueError: k must be less than or equal to the number of training points

In [None]:
eval_df = pd.DataFrame(results)
print(eval_df)

print("\n=== Averages ===")
print("F1:", eval_df["f1"].mean())
print("Pk:", eval_df["pk"].mean())
print("WindowDiff:", eval_df["windowdiff"].mean())


     entry_id        f1        pk  windowdiff  threshold
0         280  0.666667  0.500000    0.000000        NaN
1          78  0.307692  0.416667    1.000000        NaN
2         113  0.500000  0.250000    0.750000        NaN
3         253  0.363636  0.454545    0.727273        NaN
4         324  0.125000  0.285714    1.000000        NaN
..        ...       ...       ...         ...        ...
337       290  0.500000  0.333333    1.000000        NaN
338       395  1.000000  0.000000    0.000000        NaN
339       108  0.500000  0.500000    1.000000        NaN
340       353  0.285714  0.800000    1.000000        NaN
341       406  0.333333  0.666667    0.666667        NaN

[342 rows x 5 columns]

=== Averages ===
F1: 0.5290087835795693
Pk: 0.4233263944612927
WindowDiff: 0.5625216321393128


In [None]:
def build_pair_dataset(df, model):
    X = []
    y = []

    for idx, row in df.iterrows():
        sentences = row["sentences"]
        gold = row["segments"]

        if len(sentences) != len(gold):
            continue

        embeddings = model.encode(sentences)

        for i in range(len(sentences) - 1):
            e1 = embeddings[i]
            e2 = embeddings[i+1]

        
            feat = np.concatenate([
                e1,                   
                e2,                     
                np.abs(e1 - e2),        
                e1 * e2                 
            ])

            X.append(feat)
            y.append(gold[i+1])  

    return np.array(X), np.array(y)


In [71]:
X_train, y_train = build_pair_dataset(train_df, model)


In [72]:
from xgboost import XGBClassifier

clf = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1
)

clf.fit(X_train, y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
results = []

for idx, row in test_df.iterrows():

    sentences = row["sentences"]
    gold = row["segments"]

   
    if len(sentences) != len(gold):
        print(f"[SKIP] Entry {idx}: {len(sentences)} vs {len(gold)}")
        continue

  
    embeddings = model.encode(sentences)

    preds = [1]  

    for i in range(len(sentences) - 1):
        e1 = embeddings[i]
        e2 = embeddings[i+1]

        feat = np.concatenate([
            e1,
            e2,
            np.abs(e1 - e2),
            e1 * e2
        ]).reshape(1, -1)

        boundary = clf.predict(feat)[0]
        preds.append(boundary)

    pred = preds


    if len(pred) != len(gold):
        print(f"[SKIP] Entry {idx}: pred {len(pred)} vs gold {len(gold)}")
        continue

    f1 = f1_score(gold, pred, average="binary")
    pk_score = pk(gold, pred)
    wd_score = windowdiff(gold, pred)

    results.append({
        "entry_id": idx,
        "f1": f1,
        "pk": pk_score,
        "windowdiff": wd_score
    })


In [74]:
eval_df = pd.DataFrame(results)
print(eval_df)

print("\n=== Averages ===")
print("F1:", eval_df["f1"].mean())
print("Pk:", eval_df["pk"].mean())
print("WindowDiff:", eval_df["windowdiff"].mean())


     entry_id        f1        pk  windowdiff
0         280  0.666667  0.500000    0.000000
1          78  0.500000  0.166667    0.666667
2         113  0.500000  0.375000    0.750000
3         253  0.400000  0.363636    0.545455
4         324  0.500000  0.071429    0.428571
..        ...       ...       ...         ...
126       290  0.666667  0.166667    0.333333
127       395  1.000000  0.000000    0.000000
128       108  1.000000  0.000000    0.000000
129       353  1.000000  0.000000    0.000000
130       406  0.333333  0.666667    0.666667

[131 rows x 4 columns]

=== Averages ===
F1: 0.656805151903585
Pk: 0.2603706523061146
WindowDiff: 0.3742658711002914
