In [None]:
## 数据预处理
# https://github.com/fjiangAI/CPTS
def custom_CPTS(from_dir, to_dir):
    """Merge the raw CPTS train/test JSON files into one JSONL file.

    Reads ``train.json`` and ``test.json`` from ``from_dir``, tags every record
    with its split (``'train'`` / ``'test'``), derives two extra columns and
    writes the result to ``<to_dir>/CPTS.jsonl`` (one JSON record per line):

    * ``sentences`` - the ``text`` field of every entry in ``paragraph_list``.
    * ``masses`` - segment sizes derived from ``label_list``.

    Args:
        from_dir: Directory containing ``train.json`` and ``test.json``.
        to_dir: Output directory; created if it does not exist.
    """
    import pandas as pd
    import os

    def boundary_to_mass(boundary_list):
        """Convert a 0/1 boundary list into a list of segment sizes.

        A value of 1 at position ``i`` closes a segment that ends at ``i``
        (inclusive). Items after the last 1 form a final segment, so the
        returned sizes always sum to ``len(boundary_list)``.
        """
        closing_positions = [i for i, val in enumerate(boundary_list) if val == 1]
        last_position = len(boundary_list) - 1
        # Without this, items after the last flagged position would be dropped
        # and the masses would no longer cover the whole document.
        if last_position >= 0 and (not closing_positions or closing_positions[-1] != last_position):
            closing_positions.append(last_position)
        bound_idx = [-1] + closing_positions
        return [end - start for start, end in zip(bound_idx, bound_idx[1:])]

    train_data = pd.read_json(f"{from_dir}/train.json")
    test_data = pd.read_json(f"{from_dir}/test.json")
    # Merge both splits, remembering where each record came from.
    df = pd.concat(
        [train_data.assign(split='train'), test_data.assign(split='test')],
        ignore_index=True,
    )
    df['sentences'] = df['paragraph_list'].apply(
        lambda paragraphs: [paragraph['text'] for paragraph in paragraphs]
    )
    df['masses'] = df['label_list'].apply(boundary_to_mass)

    # Save
    os.makedirs(to_dir, exist_ok=True)
    df[['id', 'title', 'topic_list', 'sentences', 'masses', 'split', 'label_list']].to_json(
        os.path.join(to_dir, 'CPTS.jsonl'),
        orient='records',
        force_ascii=False,
        lines=True,
    )

In [None]:
## Configs
# Benchmark file, read as JSON Lines by the evaluation cells.
# NOTE(review): this path climbs one directory level while the model paths
# below climb two - confirm that the difference is intended.
testset_path = "../resources/data/CPTS.jsonl"

# Local model checkpoints used by init_spliter:
#   seq_model_path - token-classification model + tokenizer ("doc_seq_model_spliter")
#   ebd_model_path - FlagModel checkpoint ("cos_sim_spliter")
seq_model_path = '../../resources/open_models/nlp_bert_document-segmentation_chinese-base'
ebd_model_path = '../../resources/open_models/bge-large-zh-v1.5'

In [None]:
## 6种测试指标 https://blog.csdn.net/qq_35082030/article/details/105410478
## 开箱即用库 https://github.com/cfournie/segmentation.evaluation
## example
from segeval.window.pk import pk
from segeval.window.windowdiff import window_diff as WD
from segeval.similarity.boundary import boundary_similarity as B
from segeval.similarity.segmentation import segmentation_similarity as S

# Worked example: score several hypothesis segmentations against one reference.
# Each list holds segment sizes; every list here sums to 11 items.
gold = [2, 3, 6]
h_list = [[5, 6], [2, 2, 7], [2, 3, 3, 3], [1, 1, 3, 1, 5]]
for run_idx, hypothesis in enumerate(h_list):
    print("第%d次实验" % (run_idx + 1))
    scores = (
        pk(hypothesis, gold, one_minus=True),
        WD(hypothesis, gold, one_minus=True, window_size=2),
        B(hypothesis, gold),
        S(hypothesis, gold),
    )
    print("1-Pk=%.3f, 1-WD=%.3f, B=%.3f, S=%.3f" % scores)


In [None]:
import pandas as pd 
# Load the JSON Lines benchmark and keep only the rows whose split is 'test'.
# reset_index() renumbers the rows from 0 and, since drop is not set, keeps
# the previous row labels in a new "index" column.
df = (
    pd.read_json(testset_path, lines=True)
    .loc[lambda frame: frame['split'] == 'test']
    .reset_index()
)

In [None]:
import re
from tqdm.notebook import tqdm
from segeval.window.pk import pk
from segeval.window.windowdiff import window_diff as WD
from segeval.similarity.boundary import boundary_similarity as B
from segeval.similarity.segmentation import segmentation_similarity as S

import sys; sys.path.append("../..")
from utils.spliters import *

def init_spliter(method):
    """Build the spliter registered under ``method``.

    Supported values:

    * ``"cos_sim_spliter"`` - wraps a ``FlagModel`` loaded from
      ``ebd_model_path``.
    * ``"doc_seq_model_spliter"`` - wraps a token-classification model and its
      tokenizer, both loaded from ``seq_model_path``.

    Args:
        method: Name of the spliter to build.

    Returns:
        The object produced by calling the ``BaseSpliter`` subclass registered
        under ``method`` with the loaded model (and tokenizer).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Each backend library is imported inside its own branch, so only the
    # dependency of the selected method has to be importable.
    if method == "cos_sim_spliter":
        from FlagEmbedding import FlagModel
        model = FlagModel(
            ebd_model_path,
            query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章：",
            use_fp16=True,
        )
        return BaseSpliter.use_subclass("cos_sim_spliter")(model)

    if method == "doc_seq_model_spliter":
        from transformers import AutoModelForTokenClassification, AutoTokenizer
        model = AutoModelForTokenClassification.from_pretrained(seq_model_path)
        tokenizer = AutoTokenizer.from_pretrained(seq_model_path)
        return BaseSpliter.use_subclass("doc_seq_model_spliter")(model, tokenizer)

    # Fail loudly instead of hitting an UnboundLocalError on an unset local.
    raise ValueError(
        f"Unknown spliter method: {method!r}; "
        "expected 'cos_sim_spliter' or 'doc_seq_model_spliter'"
    )

In [None]:
subject = "cos_sim_spliter"
spliter = init_spliter(subject)
results = []
# iterrows() has no length, so pass total= to let tqdm show real progress.
for index, row in tqdm(df.iterrows(), total=len(df)):
    sentence_df = pd.DataFrame(row['sentences'], columns=['sentence'])
    # The document text is the plain concatenation of its sentences, so each
    # sentence's character span follows from the running total of lengths.
    # Searching the text for the sentence instead returns the first match,
    # which is the wrong span for a repeated sentence or one that also occurs
    # inside earlier text.
    sentence_lengths = sentence_df['sentence'].map(len)
    sentence_ends = sentence_lengths.cumsum()
    sentence_df['start_idx'] = sentence_ends - sentence_lengths
    sentence_df['end_idx'] = sentence_ends

    chunk_df = spliter.cluster(sentence_df)
    # Both sides are lists of segment sizes: predicted vs. reference.
    pred, gt = chunk_df['sentence_count'].tolist(), row['masses']
    pk_score = pk(pred, gt, one_minus=True)
    try:
        wd_score = WD(pred, gt, one_minus=True, window_size=2)
    except Exception as e:
        # Deliberate best effort: report the failing document and keep going,
        # leaving this document's WD score empty.
        print(f"WD calculation failed at index {index}: {e}")
        print(f"pred: {pred}, gt: {gt}")
        wd_score = None
    b_score = B(pred, gt)
    s_score = S(pred, gt)

    # Store this document's metrics in the results list.
    results.append({
        "index": index,
        "1-Pk": pk_score,
        "1-WD": wd_score,
        "B": b_score,
        "S": s_score
    })

# Turn the per-document records into a DataFrame.
results_df = pd.DataFrame(results)

# Append an "avg" row holding the mean of every metric column.
results_df.loc["avg"] = {
    "index": "Average",
    **{metric: results_df[metric].mean() for metric in ("1-Pk", "1-WD", "B", "S")},
}

In [None]:
import os

# to_excel does not create missing parent directories, so make sure the
# output folder exists before writing the per-method report.
os.makedirs("results", exist_ok=True)
results_df.to_excel(f"results/文档主题分割-{subject}.xlsx")