In [1]:
from evaluate import load
import csv
import pandas as pd
import numpy as np

# 文件路径
input_file = r'D:\liulanqi1\MindMap-main\MindMap-main\实验结果\output.csv'
output_file = r'D:\liulanqi1\MindMap-main\MindMap-main\实验结果\output_bert_score_multi_models_fixed.csv'

# 加载BERTScore评估器
bertscore = load("bertscore")

# 定义要使用的不同模型
models = [
    "distilbert-base-uncased",  # 原来使用的模型,       
    "xlm-roberta-large",             
    "roberta-large",          # 如果计算资源允许，可以取消注释使用这个更大的模型
]

def clean_text_for_csv(text):
    """清理文本，去除可能导致CSV解析问题的字符"""
    if not isinstance(text, str):
        return str(text)
    # 替换换行符为空格
    text = text.replace('\n', ' ')
    return text

# 使用pandas读取输入CSV文件
try:
    df = pd.read_csv(input_file)
    print(f"成功读取CSV文件，共有{len(df)}行")
    print(f"列名: {df.columns.tolist()}")
    
    # 为每个模型和每个输出创建新列
    for model_name in models:
        model_short_name = model_name.split("-")[0]  # 简化的模型名称
        for i in range(1, 7):
            df[f"{model_short_name}_output{i}_precision"] = np.nan
            df[f"{model_short_name}_output{i}_recall"] = np.nan
            df[f"{model_short_name}_output{i}_f1"] = np.nan
        # 为每个模型添加平均值列
        df[f"{model_short_name}_avg_precision"] = np.nan
        df[f"{model_short_name}_avg_recall"] = np.nan
        df[f"{model_short_name}_avg_f1"] = np.nan
    
    # 为每一行计算BERTScore
    for idx, row in df.iterrows():
        print(f"正在处理第 {idx+1}/{len(df)} 行...")
        reference = [clean_text_for_csv(row.iloc[1])]  # 假设参考文本在第2列
        
        # 为每个模型计算分数
        for model_name in models:
            model_short_name = model_name.split("-")[0]
            print(f"\n使用模型 {model_name} 计算BERTScore:")
            
            precision_values = []
            recall_values = []
            f1_values = []
            
            # 为每个输出计算BERTScore
            for i in range(1, 9):
                output_col_idx = i + 1  # 假设输出从第3列开始
                if output_col_idx < len(row):
                    output_text = [clean_text_for_csv(row.iloc[output_col_idx])]
                    
                    try:
                        results = bertscore.compute(
                            predictions=output_text, 
                            references=reference, 
                            model_type=model_name
                        )
                        
                        precision = results["precision"][0]
                        recall = results["recall"][0]
                        f1 = results["f1"][0]
                        
                        # 存储结果
                        df.at[idx, f"{model_short_name}_output{i}_precision"] = precision
                        df.at[idx, f"{model_short_name}_output{i}_recall"] = recall
                        df.at[idx, f"{model_short_name}_output{i}_f1"] = f1
                        
                        precision_values.append(precision)
                        recall_values.append(recall)
                        f1_values.append(f1)
                        
                        print(f"  Output{i} - P: {precision:.4f}, R: {recall:.4f}, F1: {f1:.4f}")
                    except Exception as e:
                        print(f"  计算Output{i}的BERTScore时出错: {e}")
                else:
                    print(f"  警告: 没有找到Output{i}的列")
            
            # 计算平均值
            if precision_values:
                avg_precision = np.mean(precision_values)
                avg_recall = np.mean(recall_values)
                avg_f1 = np.mean(f1_values)
                
                df.at[idx, f"{model_short_name}_avg_precision"] = avg_precision
                df.at[idx, f"{model_short_name}_avg_recall"] = avg_recall
                df.at[idx, f"{model_short_name}_avg_f1"] = avg_f1
                
                print(f"平均值 - P: {avg_precision:.4f}, R: {avg_recall:.4f}, F1: {avg_f1:.4f}")
            else:
                print("无法计算平均值，没有有效结果")
            
            print("-" * 50)
    
    # 保存结果到CSV文件
    df.to_csv(output_file, index=False, encoding='utf-8', quoting=csv.QUOTE_ALL)
    print(f"处理完成，结果已保存至 {output_file}")
    
except Exception as e:
    print(f"处理过程中出错: {e}")

成功读取CSV文件，共有29行
列名: ['Question', 'Label', 'MindMap', 'GPT3.5', 'BM25_retrieval', 'Embedding_retrieval', 'KG_retrieval', 'KG_self-consistency', 'KG_TreeOfThoughts', 'GPT4']
正在处理第 1/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:


  attn_output = torch.nn.functional.scaled_dot_product_attention(


  Output1 - P: 0.6351, R: 0.8641, F1: 0.7321
  Output2 - P: 0.8464, R: 0.8252, F1: 0.8356
  Output3 - P: 0.7548, R: 0.8423, F1: 0.7961
  Output4 - P: 0.7125, R: 0.8395, F1: 0.7708
  Output5 - P: 0.7009, R: 0.8173, F1: 0.7546
  Output6 - P: 0.8249, R: 0.8590, F1: 0.8416
  Output7 - P: 0.6582, R: 0.7995, F1: 0.7220
  Output8 - P: 0.7787, R: 0.8053, F1: 0.7918
平均值 - P: 0.7390, R: 0.8315, F1: 0.7806
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Output1 - P: 0.7770, R: 0.9210, F1: 0.8429
  Output2 - P: 0.8979, R: 0.8895, F1: 0.8937
  Output3 - P: 0.8455, R: 0.8983, F1: 0.8711
  Output4 - P: 0.8348, R: 0.8902, F1: 0.8616
  Output5 - P: 0.8224, R: 0.8856, F1: 0.8528
  Output6 - P: 0.8839, R: 0.9111, F1: 0.8973
  Output7 - P: 0.8002, R: 0.8780, F1: 0.8373
  Output8 - P: 0.8704, R: 0.8771, F1: 0.8737
平均值 - P: 0.8415, R: 0.8939, F1: 0.8663
--------------------------------------------------

使用模型 roberta-large 计算BERTScore:


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7504, R: 0.9041, F1: 0.8201
  Output2 - P: 0.8982, R: 0.8736, F1: 0.8857
  Output3 - P: 0.8329, R: 0.8870, F1: 0.8591
  Output4 - P: 0.8207, R: 0.8768, F1: 0.8479
  Output5 - P: 0.8079, R: 0.8615, F1: 0.8338
  Output6 - P: 0.8585, R: 0.8967, F1: 0.8772
  Output7 - P: 0.7770, R: 0.8511, F1: 0.8124
  Output8 - P: 0.8675, R: 0.8617, F1: 0.8645
平均值 - P: 0.8266, R: 0.8766, F1: 0.8501
--------------------------------------------------
正在处理第 2/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6710, R: 0.8355, F1: 0.7443
  Output2 - P: 0.7827, R: 0.8555, F1: 0.8175
  Output3 - P: 0.7066, R: 0.8188, F1: 0.7585
  Output4 - P: 0.7316, R: 0.8340, F1: 0.7795
  Output5 - P: 0.7148, R: 0.8276, F1: 0.7670
  Output6 - P: 0.8056, R: 0.8684, F1: 0.8358
  Output7 - P: 0.6819, R: 0.8297, F1: 0.7486
  Output8 - P: 0.7059, R: 0.8329, F1: 0.7641
平均值 - P: 0.7250, R: 0.8378, F1: 0.7769
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Ou

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7475, R: 0.8629, F1: 0.8011
  Output2 - P: 0.8580, R: 0.8761, F1: 0.8669
  Output3 - P: 0.8135, R: 0.8603, F1: 0.8362
  Output4 - P: 0.8282, R: 0.8679, F1: 0.8476
  Output5 - P: 0.8142, R: 0.8710, F1: 0.8416
  Output6 - P: 0.8594, R: 0.8929, F1: 0.8758
  Output7 - P: 0.7835, R: 0.8575, F1: 0.8189
  Output8 - P: 0.8181, R: 0.8658, F1: 0.8413
平均值 - P: 0.8153, R: 0.8693, F1: 0.8412
--------------------------------------------------
正在处理第 3/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.5546, R: 0.7444, F1: 0.6356
  Output2 - P: 0.7450, R: 0.8217, F1: 0.7815
  Output3 - P: 0.6872, R: 0.7633, F1: 0.7233
  Output4 - P: 0.7882, R: 0.8035, F1: 0.7958
  Output5 - P: 0.6372, R: 0.7855, F1: 0.7037
  Output6 - P: 0.7379, R: 0.7797, F1: 0.7582
  Output7 - P: 0.5910, R: 0.7614, F1: 0.6655
  Output8 - P: 0.6688, R: 0.8162, F1: 0.7352
平均值 - P: 0.6762, R: 0.7845, F1: 0.7248
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Ou

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7133, R: 0.8622, F1: 0.7807
  Output2 - P: 0.8424, R: 0.8862, F1: 0.8638
  Output3 - P: 0.8103, R: 0.8618, F1: 0.8352
  Output4 - P: 0.8675, R: 0.8818, F1: 0.8746
  Output5 - P: 0.7599, R: 0.8715, F1: 0.8119
  Output6 - P: 0.8374, R: 0.8854, F1: 0.8607
  Output7 - P: 0.7454, R: 0.8553, F1: 0.7966
  Output8 - P: 0.7925, R: 0.8736, F1: 0.8310
平均值 - P: 0.7961, R: 0.8722, F1: 0.8318
--------------------------------------------------
正在处理第 4/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.5181, R: 0.7842, F1: 0.6240
  Output2 - P: 0.6313, R: 0.7608, F1: 0.6900
  Output3 - P: 0.6204, R: 0.7688, F1: 0.6867
  Output4 - P: 0.6574, R: 0.7660, F1: 0.7075
  Output5 - P: 0.6411, R: 0.7502, F1: 0.6913
  Output6 - P: 0.6791, R: 0.7717, F1: 0.7224
  Output7 - P: 0.5584, R: 0.7603, F1: 0.6439
  Output8 - P: 0.6006, R: 0.7600, F1: 0.6709
平均值 - P: 0.6133, R: 0.7653, F1: 0.6796
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Ou

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7078, R: 0.8868, F1: 0.7872
  Output2 - P: 0.7858, R: 0.8432, F1: 0.8135
  Output3 - P: 0.7762, R: 0.8392, F1: 0.8065
  Output4 - P: 0.7973, R: 0.8453, F1: 0.8206
  Output5 - P: 0.7853, R: 0.8508, F1: 0.8168
  Output6 - P: 0.8162, R: 0.8556, F1: 0.8354
  Output7 - P: 0.7358, R: 0.8672, F1: 0.7962
  Output8 - P: 0.7671, R: 0.8465, F1: 0.8048
平均值 - P: 0.7714, R: 0.8543, F1: 0.8101
--------------------------------------------------
正在处理第 5/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6556, R: 0.8056, F1: 0.7229
  Output2 - P: 0.7416, R: 0.7816, F1: 0.7610
  Output3 - P: 0.7137, R: 0.8009, F1: 0.7548
  Output4 - P: 0.7320, R: 0.7980, F1: 0.7636
  Output5 - P: 0.7054, R: 0.8122, F1: 0.7551
  Output6 - P: 0.8109, R: 0.8472, F1: 0.8287
  Output7 - P: 0.6748, R: 0.8152, F1: 0.7384
  Output8 - P: 0.7048, R: 0.7908, F1: 0.7453
平均值 - P: 0.7174, R: 0.8064, F1: 0.7587
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Ou

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7312, R: 0.8480, F1: 0.7853
  Output2 - P: 0.8232, R: 0.8383, F1: 0.8307
  Output3 - P: 0.8020, R: 0.8489, F1: 0.8248
  Output4 - P: 0.8027, R: 0.8387, F1: 0.8203
  Output5 - P: 0.7970, R: 0.8464, F1: 0.8209
  Output6 - P: 0.8355, R: 0.8827, F1: 0.8584
  Output7 - P: 0.7791, R: 0.8467, F1: 0.8115
  Output8 - P: 0.8057, R: 0.8357, F1: 0.8204
平均值 - P: 0.7970, R: 0.8482, F1: 0.8215
--------------------------------------------------
正在处理第 6/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6605, R: 0.7947, F1: 0.7214
  Output2 - P: 0.8380, R: 0.8114, F1: 0.8245
  Output3 - P: 0.7516, R: 0.8293, F1: 0.7885
  Output4 - P: 0.7387, R: 0.8198, F1: 0.7771
  Output5 - P: 0.7056, R: 0.8207, F1: 0.7588
  Output6 - P: 0.8247, R: 0.8007, F1: 0.8125
  Output7 - P: 0.6941, R: 0.7915, F1: 0.7396
  Output8 - P: 0.7662, R: 0.8127, F1: 0.7888
平均值 - P: 0.7474, R: 0.8101, F1: 0.7764
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Ou

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7384, R: 0.8678, F1: 0.7979
  Output2 - P: 0.8896, R: 0.8706, F1: 0.8800
  Output3 - P: 0.8151, R: 0.8699, F1: 0.8416
  Output4 - P: 0.8088, R: 0.8655, F1: 0.8362
  Output5 - P: 0.7730, R: 0.8678, F1: 0.8176
  Output6 - P: 0.8547, R: 0.8713, F1: 0.8629
  Output7 - P: 0.7866, R: 0.8498, F1: 0.8170
  Output8 - P: 0.8400, R: 0.8640, F1: 0.8518
平均值 - P: 0.8133, R: 0.8658, F1: 0.8381
--------------------------------------------------
正在处理第 7/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6739, R: 0.8495, F1: 0.7516
  Output2 - P: 0.7698, R: 0.8213, F1: 0.7947
  Output3 - P: 0.7303, R: 0.8058, F1: 0.7662
  Output4 - P: 0.7453, R: 0.8288, F1: 0.7848
  Output5 - P: 0.7243, R: 0.8371, F1: 0.7766
  Output6 - P: 0.8479, R: 0.8886, F1: 0.8678
  Output7 - P: 0.6540, R: 0.8156, F1: 0.7259
  Output8 - P: 0.6647, R: 0.8059, F1: 0.7286
平均值 - P: 0.7263, R: 0.8316, F1: 0.7745
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Ou

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7528, R: 0.8975, F1: 0.8188
  Output2 - P: 0.8475, R: 0.8598, F1: 0.8536
  Output3 - P: 0.8185, R: 0.8672, F1: 0.8421
  Output4 - P: 0.8187, R: 0.8823, F1: 0.8493
  Output5 - P: 0.8165, R: 0.8773, F1: 0.8459
  Output6 - P: 0.8925, R: 0.9265, F1: 0.9092
  Output7 - P: 0.7744, R: 0.8627, F1: 0.8161
  Output8 - P: 0.7690, R: 0.8542, F1: 0.8094
平均值 - P: 0.8112, R: 0.8784, F1: 0.8430
--------------------------------------------------
正在处理第 8/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6697, R: 0.8034, F1: 0.7305
  Output2 - P: 0.7669, R: 0.7798, F1: 0.7733
  Output3 - P: 0.7268, R: 0.7796, F1: 0.7523
  Output4 - P: 0.7354, R: 0.7872, F1: 0.7604
  Output5 - P: 0.7824, R: 0.7787, F1: 0.7806
  Output6 - P: 0.8103, R: 0.8219, F1: 0.8160
  Output7 - P: 0.6711, R: 0.7826, F1: 0.7226
  Output8 - P: 0.7153, R: 0.7827, F1: 0.7475
平均值 - P: 0.7347, R: 0.7895, F1: 0.7604
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Ou

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7486, R: 0.8527, F1: 0.7973
  Output2 - P: 0.8433, R: 0.8451, F1: 0.8442
  Output3 - P: 0.8195, R: 0.8429, F1: 0.8310
  Output4 - P: 0.8196, R: 0.8423, F1: 0.8308
  Output5 - P: 0.8378, R: 0.8430, F1: 0.8404
  Output6 - P: 0.8565, R: 0.8699, F1: 0.8632
  Output7 - P: 0.7755, R: 0.8424, F1: 0.8076
  Output8 - P: 0.8074, R: 0.8388, F1: 0.8228
平均值 - P: 0.8135, R: 0.8472, F1: 0.8297
--------------------------------------------------
正在处理第 9/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6742, R: 0.8409, F1: 0.7484
  Output2 - P: 0.7741, R: 0.7855, F1: 0.7798
  Output3 - P: 0.7787, R: 0.8234, F1: 0.8004
  Output4 - P: 0.7547, R: 0.8223, F1: 0.7871
  Output5 - P: 0.7218, R: 0.8165, F1: 0.7663
  Output6 - P: 0.8048, R: 0.8448, F1: 0.8243
  Output7 - P: 0.6411, R: 0.8227, F1: 0.7207
  Output8 - P: 0.7558, R: 0.8010, F1: 0.7777
平均值 - P: 0.7382, R: 0.8196, F1: 0.7756
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  Ou

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7487, R: 0.8706, F1: 0.8050
  Output2 - P: 0.8531, R: 0.8218, F1: 0.8371
  Output3 - P: 0.8522, R: 0.8554, F1: 0.8538
  Output4 - P: 0.8374, R: 0.8646, F1: 0.8508
  Output5 - P: 0.8321, R: 0.8472, F1: 0.8396
  Output6 - P: 0.8403, R: 0.8771, F1: 0.8583
  Output7 - P: 0.7605, R: 0.8487, F1: 0.8022
  Output8 - P: 0.8341, R: 0.8349, F1: 0.8345
平均值 - P: 0.8198, R: 0.8525, F1: 0.8352
--------------------------------------------------
正在处理第 10/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6216, R: 0.7894, F1: 0.6955
  Output2 - P: 0.7583, R: 0.7619, F1: 0.7601
  Output3 - P: 0.7485, R: 0.8009, F1: 0.7738
  Output4 - P: 0.7612, R: 0.8178, F1: 0.7885
  Output5 - P: 0.7335, R: 0.7967, F1: 0.7638
  Output6 - P: 0.7687, R: 0.7736, F1: 0.7712
  Output7 - P: 0.6572, R: 0.7695, F1: 0.7089
  Output8 - P: 0.7180, R: 0.8134, F1: 0.7627
平均值 - P: 0.7209, R: 0.7904, F1: 0.7531
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7212, R: 0.8498, F1: 0.7802
  Output2 - P: 0.8571, R: 0.8420, F1: 0.8495
  Output3 - P: 0.8333, R: 0.8575, F1: 0.8452
  Output4 - P: 0.8373, R: 0.8768, F1: 0.8566
  Output5 - P: 0.8263, R: 0.8597, F1: 0.8427
  Output6 - P: 0.8329, R: 0.8521, F1: 0.8424
  Output7 - P: 0.7766, R: 0.8426, F1: 0.8082
  Output8 - P: 0.8141, R: 0.8542, F1: 0.8337
平均值 - P: 0.8124, R: 0.8543, F1: 0.8323
--------------------------------------------------
正在处理第 11/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6307, R: 0.8039, F1: 0.7068
  Output2 - P: 0.7738, R: 0.7922, F1: 0.7829
  Output3 - P: 0.7612, R: 0.8323, F1: 0.7952
  Output4 - P: 0.7444, R: 0.8307, F1: 0.7852
  Output5 - P: 0.7099, R: 0.8045, F1: 0.7543
  Output6 - P: 0.8138, R: 0.8242, F1: 0.8189
  Output7 - P: 0.6324, R: 0.7824, F1: 0.6994
  Output8 - P: 0.7027, R: 0.8141, F1: 0.7543
平均值 - P: 0.7211, R: 0.8105, F1: 0.7621
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7269, R: 0.8706, F1: 0.7923
  Output2 - P: 0.8630, R: 0.8514, F1: 0.8571
  Output3 - P: 0.8399, R: 0.8800, F1: 0.8595
  Output4 - P: 0.8236, R: 0.8803, F1: 0.8510
  Output5 - P: 0.8105, R: 0.8630, F1: 0.8359
  Output6 - P: 0.8636, R: 0.8850, F1: 0.8742
  Output7 - P: 0.7540, R: 0.8434, F1: 0.7962
  Output8 - P: 0.8058, R: 0.8524, F1: 0.8285
平均值 - P: 0.8109, R: 0.8658, F1: 0.8368
--------------------------------------------------
正在处理第 12/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6446, R: 0.8379, F1: 0.7286
  Output2 - P: 0.7585, R: 0.8147, F1: 0.7856
  Output3 - P: 0.7341, R: 0.8161, F1: 0.7729
  Output4 - P: 0.7332, R: 0.8440, F1: 0.7847
  Output5 - P: 0.7328, R: 0.8395, F1: 0.7825
  Output6 - P: 0.8271, R: 0.8409, F1: 0.8340
  Output7 - P: 0.6792, R: 0.8181, F1: 0.7422
  Output8 - P: 0.7338, R: 0.8187, F1: 0.7739
平均值 - P: 0.7304, R: 0.8287, F1: 0.7756
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7453, R: 0.8754, F1: 0.8052
  Output2 - P: 0.8405, R: 0.8708, F1: 0.8554
  Output3 - P: 0.8174, R: 0.8614, F1: 0.8388
  Output4 - P: 0.8132, R: 0.8708, F1: 0.8410
  Output5 - P: 0.8194, R: 0.8695, F1: 0.8437
  Output6 - P: 0.8625, R: 0.8922, F1: 0.8771
  Output7 - P: 0.7825, R: 0.8579, F1: 0.8185
  Output8 - P: 0.8226, R: 0.8565, F1: 0.8392
平均值 - P: 0.8129, R: 0.8693, F1: 0.8399
--------------------------------------------------
正在处理第 13/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6718, R: 0.8335, F1: 0.7440
  Output2 - P: 0.7358, R: 0.7625, F1: 0.7489
  Output3 - P: 0.6928, R: 0.7736, F1: 0.7310
  Output4 - P: 0.6680, R: 0.7674, F1: 0.7143
  Output5 - P: 0.6832, R: 0.7790, F1: 0.7279
  Output6 - P: 0.7949, R: 0.8439, F1: 0.8187
  Output7 - P: 0.6360, R: 0.7780, F1: 0.6999
  Output8 - P: 0.7227, R: 0.7785, F1: 0.7496
平均值 - P: 0.7007, R: 0.7895, F1: 0.7418
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7428, R: 0.8529, F1: 0.7941
  Output2 - P: 0.8359, R: 0.8043, F1: 0.8198
  Output3 - P: 0.7967, R: 0.8168, F1: 0.8066
  Output4 - P: 0.7849, R: 0.8066, F1: 0.7956
  Output5 - P: 0.7863, R: 0.8068, F1: 0.7964
  Output6 - P: 0.8128, R: 0.8566, F1: 0.8341
  Output7 - P: 0.7662, R: 0.8104, F1: 0.7877
  Output8 - P: 0.8211, R: 0.8105, F1: 0.8158
平均值 - P: 0.7933, R: 0.8206, F1: 0.8063
--------------------------------------------------
正在处理第 14/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.5939, R: 0.7948, F1: 0.6798
  Output2 - P: 0.7651, R: 0.7881, F1: 0.7764
  Output3 - P: 0.7254, R: 0.6970, F1: 0.7109
  Output4 - P: 0.7604, R: 0.7981, F1: 0.7788
  Output5 - P: 0.6721, R: 0.8072, F1: 0.7335
  Output6 - P: 0.7657, R: 0.7943, F1: 0.7798
  Output7 - P: 0.6294, R: 0.7922, F1: 0.7015
  Output8 - P: 0.7131, R: 0.7944, F1: 0.7516
平均值 - P: 0.7031, R: 0.7833, F1: 0.7390
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7177, R: 0.8714, F1: 0.7872
  Output2 - P: 0.8682, R: 0.8696, F1: 0.8689
  Output3 - P: 0.8436, R: 0.8329, F1: 0.8382
  Output4 - P: 0.8420, R: 0.8737, F1: 0.8575
  Output5 - P: 0.7787, R: 0.8627, F1: 0.8186
  Output6 - P: 0.8449, R: 0.8768, F1: 0.8606
  Output7 - P: 0.7545, R: 0.8564, F1: 0.8022
  Output8 - P: 0.8235, R: 0.8629, F1: 0.8427
平均值 - P: 0.8091, R: 0.8633, F1: 0.8345
--------------------------------------------------
正在处理第 15/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6247, R: 0.7727, F1: 0.6909
  Output2 - P: 0.7763, R: 0.7936, F1: 0.7848
  Output3 - P: 0.7361, R: 0.7868, F1: 0.7606
  Output4 - P: 0.7397, R: 0.8105, F1: 0.7735
  Output5 - P: 0.7440, R: 0.8096, F1: 0.7754
  Output6 - P: 0.7860, R: 0.7966, F1: 0.7913
  Output7 - P: 0.6682, R: 0.7739, F1: 0.7172
  Output8 - P: 0.7049, R: 0.7786, F1: 0.7399
平均值 - P: 0.7225, R: 0.7903, F1: 0.7542
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7186, R: 0.8490, F1: 0.7784
  Output2 - P: 0.8498, R: 0.8508, F1: 0.8503
  Output3 - P: 0.8172, R: 0.8570, F1: 0.8366
  Output4 - P: 0.8140, R: 0.8639, F1: 0.8382
  Output5 - P: 0.8224, R: 0.8616, F1: 0.8416
  Output6 - P: 0.8374, R: 0.8637, F1: 0.8503
  Output7 - P: 0.7768, R: 0.8393, F1: 0.8068
  Output8 - P: 0.8078, R: 0.8418, F1: 0.8245
平均值 - P: 0.8055, R: 0.8534, F1: 0.8283
--------------------------------------------------
正在处理第 16/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6505, R: 0.8116, F1: 0.7222
  Output2 - P: 0.7414, R: 0.7607, F1: 0.7509
  Output3 - P: 0.7202, R: 0.8051, F1: 0.7603
  Output4 - P: 0.7624, R: 0.8248, F1: 0.7924
  Output5 - P: 0.7041, R: 0.8063, F1: 0.7518
  Output6 - P: 0.7944, R: 0.8347, F1: 0.8141
  Output7 - P: 0.6565, R: 0.7836, F1: 0.7144
  Output8 - P: 0.7262, R: 0.8326, F1: 0.7757
平均值 - P: 0.7195, R: 0.8074, F1: 0.7602
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7137, R: 0.8681, F1: 0.7834
  Output2 - P: 0.8345, R: 0.8241, F1: 0.8293
  Output3 - P: 0.7985, R: 0.8562, F1: 0.8263
  Output4 - P: 0.8320, R: 0.8589, F1: 0.8452
  Output5 - P: 0.7976, R: 0.8590, F1: 0.8272
  Output6 - P: 0.8457, R: 0.8750, F1: 0.8601
  Output7 - P: 0.7717, R: 0.8441, F1: 0.8063
  Output8 - P: 0.8101, R: 0.8582, F1: 0.8335
平均值 - P: 0.8005, R: 0.8554, F1: 0.8264
--------------------------------------------------
正在处理第 17/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.5837, R: 0.8353, F1: 0.6872
  Output2 - P: 0.7523, R: 0.8356, F1: 0.7918
  Output3 - P: 0.7050, R: 0.8195, F1: 0.7579
  Output4 - P: 0.7007, R: 0.8204, F1: 0.7558
  Output5 - P: 0.6759, R: 0.8288, F1: 0.7446
  Output6 - P: 0.7570, R: 0.8457, F1: 0.7989
  Output7 - P: 0.6287, R: 0.8200, F1: 0.7117
  Output8 - P: 0.7230, R: 0.8324, F1: 0.7738
平均值 - P: 0.6908, R: 0.8297, F1: 0.7527
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7089, R: 0.8689, F1: 0.7808
  Output2 - P: 0.8415, R: 0.8635, F1: 0.8524
  Output3 - P: 0.7840, R: 0.8647, F1: 0.8224
  Output4 - P: 0.7973, R: 0.8654, F1: 0.8300
  Output5 - P: 0.7703, R: 0.8701, F1: 0.8171
  Output6 - P: 0.8209, R: 0.8858, F1: 0.8521
  Output7 - P: 0.7673, R: 0.8616, F1: 0.8118
  Output8 - P: 0.8133, R: 0.8626, F1: 0.8372
平均值 - P: 0.7879, R: 0.8678, F1: 0.8255
--------------------------------------------------
正在处理第 18/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.5812, R: 0.7585, F1: 0.6581
  Output2 - P: 0.7655, R: 0.7909, F1: 0.7780
  Output3 - P: 0.7142, R: 0.7651, F1: 0.7388
  Output4 - P: 0.6858, R: 0.7778, F1: 0.7289
  Output5 - P: 0.6430, R: 0.7319, F1: 0.6846
  Output6 - P: 0.7250, R: 0.7564, F1: 0.7404
  Output7 - P: 0.6260, R: 0.7345, F1: 0.6759
  Output8 - P: 0.6903, R: 0.7988, F1: 0.7406
平均值 - P: 0.6789, R: 0.7642, F1: 0.7182
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7241, R: 0.8493, F1: 0.7817
  Output2 - P: 0.8494, R: 0.8450, F1: 0.8472
  Output3 - P: 0.8127, R: 0.8473, F1: 0.8296
  Output4 - P: 0.8003, R: 0.8387, F1: 0.8191
  Output5 - P: 0.7790, R: 0.8312, F1: 0.8043
  Output6 - P: 0.7973, R: 0.8509, F1: 0.8232
  Output7 - P: 0.7467, R: 0.8279, F1: 0.7852
  Output8 - P: 0.8010, R: 0.8440, F1: 0.8219
平均值 - P: 0.7888, R: 0.8418, F1: 0.8140
--------------------------------------------------
正在处理第 19/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.5671, R: 0.7631, F1: 0.6506
  Output2 - P: 0.7201, R: 0.7684, F1: 0.7434
  Output3 - P: 0.6457, R: 0.7699, F1: 0.7023
  Output4 - P: 0.6585, R: 0.7753, F1: 0.7122
  Output5 - P: 0.6707, R: 0.7848, F1: 0.7233
  Output6 - P: 0.7040, R: 0.7765, F1: 0.7385
  Output7 - P: 0.5947, R: 0.7673, F1: 0.6701
  Output8 - P: 0.6181, R: 0.7370, F1: 0.6723
平均值 - P: 0.6474, R: 0.7678, F1: 0.7016
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7267, R: 0.8482, F1: 0.7828
  Output2 - P: 0.8151, R: 0.8425, F1: 0.8286
  Output3 - P: 0.7835, R: 0.8347, F1: 0.8083
  Output4 - P: 0.7906, R: 0.8508, F1: 0.8196
  Output5 - P: 0.7951, R: 0.8494, F1: 0.8214
  Output6 - P: 0.8131, R: 0.8494, F1: 0.8308
  Output7 - P: 0.7546, R: 0.8423, F1: 0.7961
  Output8 - P: 0.7728, R: 0.8184, F1: 0.7949
平均值 - P: 0.7814, R: 0.8420, F1: 0.8103
--------------------------------------------------
正在处理第 20/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6223, R: 0.8285, F1: 0.7107
  Output2 - P: 0.7584, R: 0.7845, F1: 0.7712
  Output3 - P: 0.6934, R: 0.8087, F1: 0.7467
  Output4 - P: 0.7391, R: 0.8241, F1: 0.7793
  Output5 - P: 0.7276, R: 0.8025, F1: 0.7632
  Output6 - P: 0.8132, R: 0.8275, F1: 0.8203
  Output7 - P: 0.6298, R: 0.7738, F1: 0.6944
  Output8 - P: 0.7095, R: 0.8096, F1: 0.7562
平均值 - P: 0.7117, R: 0.8074, F1: 0.7553
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7304, R: 0.8687, F1: 0.7935
  Output2 - P: 0.8498, R: 0.8493, F1: 0.8495
  Output3 - P: 0.8038, R: 0.8650, F1: 0.8333
  Output4 - P: 0.8195, R: 0.8640, F1: 0.8412
  Output5 - P: 0.8098, R: 0.8525, F1: 0.8306
  Output6 - P: 0.8570, R: 0.8726, F1: 0.8647
  Output7 - P: 0.7536, R: 0.8418, F1: 0.7953
  Output8 - P: 0.8024, R: 0.8522, F1: 0.8266
平均值 - P: 0.8033, R: 0.8583, F1: 0.8293
--------------------------------------------------
正在处理第 21/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6351, R: 0.7597, F1: 0.6918
  Output2 - P: 0.7560, R: 0.7550, F1: 0.7555
  Output3 - P: 0.7238, R: 0.7800, F1: 0.7508
  Output4 - P: 0.7239, R: 0.7758, F1: 0.7490
  Output5 - P: 0.6859, R: 0.7706, F1: 0.7258
  Output6 - P: 0.7371, R: 0.7701, F1: 0.7532
  Output7 - P: 0.6398, R: 0.7730, F1: 0.7001
  Output8 - P: 0.6993, R: 0.7761, F1: 0.7357
平均值 - P: 0.7001, R: 0.7700, F1: 0.7327
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7480, R: 0.8293, F1: 0.7865
  Output2 - P: 0.8396, R: 0.8250, F1: 0.8323
  Output3 - P: 0.8085, R: 0.8415, F1: 0.8247
  Output4 - P: 0.8118, R: 0.8388, F1: 0.8251
  Output5 - P: 0.7940, R: 0.8367, F1: 0.8148
  Output6 - P: 0.8238, R: 0.8349, F1: 0.8293
  Output7 - P: 0.7594, R: 0.8305, F1: 0.7934
  Output8 - P: 0.7914, R: 0.8196, F1: 0.8052
平均值 - P: 0.7971, R: 0.8320, F1: 0.8139
--------------------------------------------------
正在处理第 22/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.5968, R: 0.7199, F1: 0.6526
  Output2 - P: 0.7579, R: 0.7816, F1: 0.7696
  Output3 - P: 0.6719, R: 0.7669, F1: 0.7163
  Output4 - P: 0.6798, R: 0.7714, F1: 0.7227
  Output5 - P: 0.7231, R: 0.7852, F1: 0.7529
  Output6 - P: 0.7523, R: 0.7677, F1: 0.7599
  Output7 - P: 0.6257, R: 0.7618, F1: 0.6871
  Output8 - P: 0.6941, R: 0.7959, F1: 0.7415
平均值 - P: 0.6877, R: 0.7688, F1: 0.7253
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7139, R: 0.8232, F1: 0.7647
  Output2 - P: 0.8340, R: 0.8512, F1: 0.8425
  Output3 - P: 0.7877, R: 0.8447, F1: 0.8152
  Output4 - P: 0.7953, R: 0.8467, F1: 0.8202
  Output5 - P: 0.8139, R: 0.8509, F1: 0.8320
  Output6 - P: 0.8214, R: 0.8460, F1: 0.8335
  Output7 - P: 0.7599, R: 0.8368, F1: 0.7965
  Output8 - P: 0.7972, R: 0.8487, F1: 0.8222
平均值 - P: 0.7904, R: 0.8435, F1: 0.8158
--------------------------------------------------
正在处理第 23/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6165, R: 0.7510, F1: 0.6772
  Output2 - P: 0.8032, R: 0.8069, F1: 0.8051
  Output3 - P: 0.7464, R: 0.8145, F1: 0.7790
  Output4 - P: 0.7670, R: 0.8043, F1: 0.7852
  Output5 - P: 0.7913, R: 0.8288, F1: 0.8096
  Output6 - P: 0.8261, R: 0.7817, F1: 0.8033
  Output7 - P: 0.6669, R: 0.7716, F1: 0.7154
  Output8 - P: 0.7238, R: 0.8024, F1: 0.7611
平均值 - P: 0.7427, R: 0.7952, F1: 0.7670
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7159, R: 0.8462, F1: 0.7756
  Output2 - P: 0.8776, R: 0.8647, F1: 0.8711
  Output3 - P: 0.8289, R: 0.8602, F1: 0.8442
  Output4 - P: 0.8309, R: 0.8585, F1: 0.8445
  Output5 - P: 0.8433, R: 0.8748, F1: 0.8588
  Output6 - P: 0.8827, R: 0.8629, F1: 0.8727
  Output7 - P: 0.7853, R: 0.8412, F1: 0.8123
  Output8 - P: 0.8173, R: 0.8526, F1: 0.8346
平均值 - P: 0.8227, R: 0.8576, F1: 0.8392
--------------------------------------------------
正在处理第 24/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6290, R: 0.7501, F1: 0.6843
  Output2 - P: 0.7637, R: 0.7534, F1: 0.7586
  Output3 - P: 0.7389, R: 0.7786, F1: 0.7582
  Output4 - P: 0.6986, R: 0.7813, F1: 0.7376
  Output5 - P: 0.7588, R: 0.7674, F1: 0.7631
  Output6 - P: 0.7574, R: 0.7613, F1: 0.7593
  Output7 - P: 0.6480, R: 0.7581, F1: 0.6988
  Output8 - P: 0.7202, R: 0.7855, F1: 0.7515
平均值 - P: 0.7143, R: 0.7670, F1: 0.7389
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7216, R: 0.8375, F1: 0.7752
  Output2 - P: 0.8479, R: 0.8280, F1: 0.8378
  Output3 - P: 0.8212, R: 0.8463, F1: 0.8336
  Output4 - P: 0.7980, R: 0.8445, F1: 0.8206
  Output5 - P: 0.8313, R: 0.8452, F1: 0.8382
  Output6 - P: 0.8227, R: 0.8461, F1: 0.8342
  Output7 - P: 0.7702, R: 0.8328, F1: 0.8003
  Output8 - P: 0.8143, R: 0.8380, F1: 0.8260
平均值 - P: 0.8034, R: 0.8398, F1: 0.8207
--------------------------------------------------
正在处理第 25/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6458, R: 0.7965, F1: 0.7133
  Output2 - P: 0.7980, R: 0.8006, F1: 0.7993
  Output3 - P: 0.7443, R: 0.8150, F1: 0.7780
  Output4 - P: 0.6923, R: 0.7836, F1: 0.7351
  Output5 - P: 0.7202, R: 0.8049, F1: 0.7602
  Output6 - P: 0.7865, R: 0.7878, F1: 0.7872
  Output7 - P: 0.6589, R: 0.7851, F1: 0.7165
  Output8 - P: 0.6816, R: 0.8066, F1: 0.7389
平均值 - P: 0.7160, R: 0.7975, F1: 0.7536
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7413, R: 0.8649, F1: 0.7984
  Output2 - P: 0.8788, R: 0.8678, F1: 0.8733
  Output3 - P: 0.8160, R: 0.8641, F1: 0.8394
  Output4 - P: 0.7996, R: 0.8488, F1: 0.8235
  Output5 - P: 0.8229, R: 0.8639, F1: 0.8429
  Output6 - P: 0.8292, R: 0.8598, F1: 0.8442
  Output7 - P: 0.7753, R: 0.8514, F1: 0.8116
  Output8 - P: 0.7871, R: 0.8509, F1: 0.8178
平均值 - P: 0.8063, R: 0.8590, F1: 0.8314
--------------------------------------------------
正在处理第 26/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6457, R: 0.7767, F1: 0.7052
  Output2 - P: 0.7838, R: 0.7770, F1: 0.7804
  Output3 - P: 0.7510, R: 0.8183, F1: 0.7832
  Output4 - P: 0.7564, R: 0.7958, F1: 0.7756
  Output5 - P: 0.7227, R: 0.8059, F1: 0.7621
  Output6 - P: 0.8241, R: 0.7941, F1: 0.8088
  Output7 - P: 0.6553, R: 0.7902, F1: 0.7164
  Output8 - P: 0.7343, R: 0.8070, F1: 0.7690
平均值 - P: 0.7342, R: 0.7956, F1: 0.7626
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7282, R: 0.8486, F1: 0.7838
  Output2 - P: 0.8686, R: 0.8510, F1: 0.8597
  Output3 - P: 0.8352, R: 0.8662, F1: 0.8504
  Output4 - P: 0.8334, R: 0.8587, F1: 0.8459
  Output5 - P: 0.8142, R: 0.8636, F1: 0.8382
  Output6 - P: 0.8785, R: 0.8666, F1: 0.8725
  Output7 - P: 0.7771, R: 0.8477, F1: 0.8109
  Output8 - P: 0.8292, R: 0.8583, F1: 0.8435
平均值 - P: 0.8206, R: 0.8576, F1: 0.8381
--------------------------------------------------
正在处理第 27/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6250, R: 0.8299, F1: 0.7131
  Output2 - P: 0.7871, R: 0.8222, F1: 0.8043
  Output3 - P: 0.7318, R: 0.8217, F1: 0.7741
  Output4 - P: 0.7163, R: 0.8284, F1: 0.7682
  Output5 - P: 0.7422, R: 0.8289, F1: 0.7832
  Output6 - P: 0.8254, R: 0.8592, F1: 0.8420
  Output7 - P: 0.6408, R: 0.8077, F1: 0.7146
  Output8 - P: 0.7627, R: 0.8292, F1: 0.7946
平均值 - P: 0.7289, R: 0.8284, F1: 0.7743
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7461, R: 0.8678, F1: 0.8023
  Output2 - P: 0.8595, R: 0.8579, F1: 0.8587
  Output3 - P: 0.8174, R: 0.8618, F1: 0.8391
  Output4 - P: 0.8076, R: 0.8678, F1: 0.8366
  Output5 - P: 0.8329, R: 0.8668, F1: 0.8495
  Output6 - P: 0.8690, R: 0.8935, F1: 0.8811
  Output7 - P: 0.7689, R: 0.8622, F1: 0.8129
  Output8 - P: 0.8468, R: 0.8586, F1: 0.8526
平均值 - P: 0.8185, R: 0.8670, F1: 0.8416
--------------------------------------------------
正在处理第 28/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6004, R: 0.8242, F1: 0.6947
  Output2 - P: 0.7798, R: 0.8270, F1: 0.8027
  Output3 - P: 0.7431, R: 0.8286, F1: 0.7835
  Output4 - P: 0.7292, R: 0.8144, F1: 0.7694
  Output5 - P: 0.6890, R: 0.8182, F1: 0.7481
  Output6 - P: 0.7707, R: 0.8457, F1: 0.8065
  Output7 - P: 0.6516, R: 0.8187, F1: 0.7257
  Output8 - P: 0.6715, R: 0.8116, F1: 0.7350
平均值 - P: 0.7044, R: 0.8236, F1: 0.7582
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7275, R: 0.8762, F1: 0.7949
  Output2 - P: 0.8590, R: 0.8674, F1: 0.8631
  Output3 - P: 0.8319, R: 0.8748, F1: 0.8528
  Output4 - P: 0.8304, R: 0.8767, F1: 0.8529
  Output5 - P: 0.8023, R: 0.8710, F1: 0.8352
  Output6 - P: 0.8429, R: 0.9035, F1: 0.8721
  Output7 - P: 0.7759, R: 0.8723, F1: 0.8213
  Output8 - P: 0.7964, R: 0.8616, F1: 0.8277
平均值 - P: 0.8083, R: 0.8754, F1: 0.8400
--------------------------------------------------
正在处理第 29/29 行...

使用模型 distilbert-base-uncased 计算BERTScore:
  Output1 - P: 0.6460, R: 0.7790, F1: 0.7063
  Output2 - P: 0.7863, R: 0.7872, F1: 0.7868
  Output3 - P: 0.7316, R: 0.7940, F1: 0.7615
  Output4 - P: 0.7395, R: 0.7798, F1: 0.7592
  Output5 - P: 0.7192, R: 0.7828, F1: 0.7496
  Output6 - P: 0.8031, R: 0.8001, F1: 0.8016
  Output7 - P: 0.6468, R: 0.8014, F1: 0.7158
  Output8 - P: 0.7051, R: 0.7910, F1: 0.7456
平均值 - P: 0.7222, R: 0.7894, F1: 0.7533
--------------------------------------------------

使用模型 xlm-roberta-large 计算BERTScore:
  O

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Output1 - P: 0.7384, R: 0.8615, F1: 0.7952
  Output2 - P: 0.8580, R: 0.8671, F1: 0.8625
  Output3 - P: 0.8205, R: 0.8653, F1: 0.8423
  Output4 - P: 0.8363, R: 0.8615, F1: 0.8487
  Output5 - P: 0.8146, R: 0.8676, F1: 0.8403
  Output6 - P: 0.8642, R: 0.8806, F1: 0.8723
  Output7 - P: 0.7764, R: 0.8614, F1: 0.8167
  Output8 - P: 0.8143, R: 0.8560, F1: 0.8346
平均值 - P: 0.8153, R: 0.8651, F1: 0.8391
--------------------------------------------------
处理完成，结果已保存至 D:\liulanqi1\MindMap-main\MindMap-main\实验结果\output_bert_score_multi_models_fixed.csv


In [2]:
import pandas as pd
import numpy as np

input_file = r'D:\liulanqi1\MindMap-main\MindMap-main\实验结果\output_bert_score_multi_models_fixed.csv'
output_file = r'D:\liulanqi1\MindMap-main\MindMap-main\实验结果\bertscore_output_averages.csv'

# 定义 output 对应的真实名称
output_name_map = {
    'output1': 'MindMap',
    'output2': 'GPT3.5',
    'output3': 'BM25_retrieval',
    'output4': 'Embedding_retrieval',
    'output5': 'KG_retrieval',
    'output6': 'KG_self-consistency',
    'output7': 'KG_TreeOfThoughts',
    'output8': 'GPT4'
}

try:
    df = pd.read_csv(input_file)
    print(f"成功读取CSV文件，共有{len(df)}行")

    # Detect model prefixes from column names
    model_prefixes = set()
    for col in df.columns:
        if '_output' in col and ('_precision' in col or '_recall' in col or '_f1' in col):
            prefix = col.split('_output', 1)[0]
            model_prefixes.add(prefix)

    print(f"检测到的模型: {sorted(list(model_prefixes))}")

    # 1. Calculate average score ACROSS MODELS for each output
    results = []
    for i in range(1, 9):
        output_metrics = {'output_number': f'output{i}', 'avg_precision': 0.0, 'avg_recall': 0.0, 'avg_f1': 0.0, 'count': 0}
        for prefix in model_prefixes:
            precision_col = f"{prefix}_output{i}_precision"
            recall_col = f"{prefix}_output{i}_recall"
            f1_col = f"{prefix}_output{i}_f1"

            if all(col in df.columns for col in [precision_col, recall_col, f1_col]):
                avg_precision = df[precision_col].mean(skipna=True)
                avg_recall = df[recall_col].mean(skipna=True)
                avg_f1 = df[f1_col].mean(skipna=True)

                if not np.isnan(avg_precision) and not np.isnan(avg_recall) and not np.isnan(avg_f1):
                    output_metrics['avg_precision'] += avg_precision
                    output_metrics['avg_recall'] += avg_recall
                    output_metrics['avg_f1'] += avg_f1
                    output_metrics['count'] += 1

        if output_metrics['count'] > 0:
            for metric in ['avg_precision', 'avg_recall', 'avg_f1']:
                output_metrics[metric] /= output_metrics['count']
            results.append(output_metrics)

    # 2. Calculate OVERALL average across all models and outputs
    if results:
        overall_avg = {
            'output_number': 'overall',
            'avg_precision': np.mean([r['avg_precision'] for r in results]),
            'avg_recall': np.mean([r['avg_recall'] for r in results]),
            'avg_f1': np.mean([r['avg_f1'] for r in results])
        }
        results.append(overall_avg)

    # 3. Per-model averages + detailed per-output per-model
    model_results = []
    detailed_model_output_results = []

    for prefix in sorted(list(model_prefixes)):
        model_metrics = {'model': prefix, 'avg_precision': 0.0, 'avg_recall': 0.0, 'avg_f1': 0.0, 'output_count': 0}
        for i in range(1, 9):
            precision_col = f"{prefix}_output{i}_precision"
            recall_col = f"{prefix}_output{i}_recall"
            f1_col = f"{prefix}_output{i}_f1"

            if all(col in df.columns for col in [precision_col, recall_col, f1_col]):
                avg_precision = df[precision_col].mean(skipna=True)
                avg_recall = df[recall_col].mean(skipna=True)
                avg_f1 = df[f1_col].mean(skipna=True)

                if not np.isnan(avg_precision) and not np.isnan(avg_recall) and not np.isnan(avg_f1):
                    detailed_model_output_results.append({
                        'model': prefix,
                        'output_number': f'output{i}',
                        'avg_precision': avg_precision,
                        'avg_recall': avg_recall,
                        'avg_f1': avg_f1
                    })

                    model_metrics['avg_precision'] += avg_precision
                    model_metrics['avg_recall'] += avg_recall
                    model_metrics['avg_f1'] += avg_f1
                    model_metrics['output_count'] += 1

        if model_metrics['output_count'] > 0:
            for metric in ['avg_precision', 'avg_recall', 'avg_f1']:
                model_metrics[metric] /= model_metrics['output_count']
            del model_metrics['output_count']
            model_results.append(model_metrics)

    # Save combined results
    final_df = pd.concat([pd.DataFrame(results), pd.DataFrame(model_results)], ignore_index=True)
    final_df.to_csv(output_file, index=False, float_format='%.6f')
    print(f"\n聚合平均结果已保存到 {output_file}")

    # --- Printing Tables ---

    # Table 1: Sorted by avg_f1
    print("\n各输出的平均BERTScore (按F1从高到低排序):")
    print("=" * 80)
    print(f"{'输出名称':^25} | {'平均Precision':^15} | {'平均Recall':^15} | {'平均F1':^15}")
    print("-" * 80)

    sorted_results = sorted(
        [r for r in results if r['output_number'] != 'overall'],
        key=lambda x: x['avg_f1'],
        reverse=True
    )

    for r in sorted_results:
        output_number = r.get('output_number', 'N/A')
        display_name = output_name_map.get(output_number, output_number) + f" ({output_number})"
        print(f"{display_name:<25} | {r.get('avg_precision', 0):^15.4f} | {r.get('avg_recall', 0):^15.4f} | {r.get('avg_f1', 0):^15.4f}")

    # Print overall row
    overall_row = next((r for r in results if r['output_number'] == 'overall'), None)
    if overall_row:
        print("-" * 80)
        print(f"{'OVERALL':<25} | {overall_row.get('avg_precision', 0):^15.4f} | {overall_row.get('avg_recall', 0):^15.4f} | {overall_row.get('avg_f1', 0):^15.4f}")
    print("-" * 80)

    # Table 2: Average per model
    print("\n各模型的平均BERTScore (所有输出平均):")
    print("=" * 65)
    print(f"{'模型':^20} | {'平均Precision':^15} | {'平均Recall':^15} | {'平均F1':^15}")
    print("-" * 65)
    model_results_print_df = pd.DataFrame(model_results).fillna(0)
    for index, m in model_results_print_df.iterrows():
        print(f"{m.get('model', 'N/A'):<20} | {m.get('avg_precision', 0):^15.4f} | {m.get('avg_recall', 0):^15.4f} | {m.get('avg_f1', 0):^15.4f}")
    print("-" * 65)

    # Table 3: Detailed per model and output, sorted by F1
    print("\n各模型在各输出上的详细平均BERTScore (按F1从高到低排序):")
    print("=" * 100)
    print(f"{'模型':^20} | {'输出名称':^25} | {'平均Precision':^15} | {'平均Recall':^15} | {'平均F1':^15}")
    print("-" * 100)
    detailed_df = pd.DataFrame(detailed_model_output_results)
    if not detailed_df.empty:
        detailed_df = detailed_df.fillna(0)
        # 按 avg_f1 从高到低排序
        detailed_df = detailed_df.sort_values(by=['avg_f1'], ascending=False)
        for _, d in detailed_df.iterrows():
            output_number = d.get('output_number', 'N/A')
            display_name = output_name_map.get(output_number, output_number) + f" ({output_number})"
            print(f"{d.get('model', 'N/A'):<20} | {display_name:<25} | {d.get('avg_precision', 0):^15.4f} | {d.get('avg_recall', 0):^15.4f} | {d.get('avg_f1', 0):^15.4f}")
        print("-" * 100)
    else:
        print("⚠ 没有可用的详细模型输出数据，跳过详细表格打印。")

except FileNotFoundError:
    print(f"错误: 输入文件 '{input_file}' 未找到。")
except Exception as e:
    print(f"处理过程中发生错误: {e}")
    import traceback
    traceback.print_exc()


成功读取CSV文件，共有29行
检测到的模型: ['distilbert', 'roberta', 'xlm']

聚合平均结果已保存到 D:\liulanqi1\MindMap-main\MindMap-main\实验结果\bertscore_output_averages.csv

各输出的平均BERTScore (按F1从高到低排序):
          输出名称            |   平均Precision   |    平均Recall     |      平均F1      
--------------------------------------------------------------------------------
KG_self-consistency (output6) |     0.8316      |     0.8577      |     0.8443     
GPT3.5 (output2)          |     0.8263      |     0.8387      |     0.8321     
Embedding_retrieval (output4) |     0.7930      |     0.8467      |     0.8187     
BM25_retrieval (output3)  |     0.7900      |     0.8429      |     0.8153     
KG_retrieval (output5)    |     0.7814      |     0.8451      |     0.8116     
GPT4 (output8)            |     0.7814      |     0.8407      |     0.8095     
KG_TreeOfThoughts (output7) |     0.7363      |     0.8341      |     0.7815     
MindMap (output1)         |     0.7075      |     0.8463      |     0.7701     
----------------

In [3]:
import pandas as pd
import numpy as np

input_file = r'D:\liulanqi1\MindMap-main\MindMap-main\实验结果\output_bert_score_multi_models_fixed.csv'
output_file_base = r'D:\liulanqi1\MindMap-main\MindMap-main\实验结果'

# 定义 output 对应的真实名称
output_name_map = {
    'output1': 'MindMap',
    'output2': 'GPT3.5',
    'output3': 'BM25_retrieval',
    'output4': 'Embedding_retrieval',
    'output5': 'KG_retrieval',
    'output6': 'KG_self-consistency',
    'output7': 'KG_TreeOfThoughts',
    'output8': 'GPT4'
}

try:
    df = pd.read_csv(input_file)
    print(f"成功读取CSV文件，共有{len(df)}行")

    model_prefixes = set()
    for col in df.columns:
        if '_output' in col and ('_precision' in col or '_recall' in col or '_f1' in col):
            prefix = col.split('_output', 1)[0]
            model_prefixes.add(prefix)

    print(f"检测到的模型: {sorted(list(model_prefixes))}")

    # 1. Calculate average score ACROSS MODELS for each output
    results = []
    for i in range(1, 9):
        output_metrics = {'output_number': f'output{i}', 'output_name': output_name_map.get(f'output{i}', f'output{i}'),
                          'avg_precision': 0.0, 'avg_recall': 0.0, 'avg_f1': 0.0, 'count': 0}
        for prefix in model_prefixes:
            precision_col = f"{prefix}_output{i}_precision"
            recall_col = f"{prefix}_output{i}_recall"
            f1_col = f"{prefix}_output{i}_f1"

            if all(col in df.columns for col in [precision_col, recall_col, f1_col]):
                avg_precision = df[precision_col].mean(skipna=True)
                avg_recall = df[recall_col].mean(skipna=True)
                avg_f1 = df[f1_col].mean(skipna=True)

                if not np.isnan(avg_precision) and not np.isnan(avg_recall) and not np.isnan(avg_f1):
                    output_metrics['avg_precision'] += avg_precision
                    output_metrics['avg_recall'] += avg_recall
                    output_metrics['avg_f1'] += avg_f1
                    output_metrics['count'] += 1

        if output_metrics['count'] > 0:
            for metric in ['avg_precision', 'avg_recall', 'avg_f1']:
                output_metrics[metric] /= output_metrics['count']
            results.append(output_metrics)

    # 2. Calculate OVERALL average
    if results:
        overall_avg = {
            'output_number': 'overall',
            'output_name': 'Overall',
            'avg_precision': np.mean([r['avg_precision'] for r in results]),
            'avg_recall': np.mean([r['avg_recall'] for r in results]),
            'avg_f1': np.mean([r['avg_f1'] for r in results])
        }
        results.append(overall_avg)

    # 3. Per-model averages + detailed per-output per-model
    model_results = []
    detailed_model_output_results = []

    for prefix in sorted(list(model_prefixes)):
        model_metrics = {'model': prefix, 'avg_precision': 0.0, 'avg_recall': 0.0, 'avg_f1': 0.0, 'output_count': 0}
        for i in range(1, 9):
            precision_col = f"{prefix}_output{i}_precision"
            recall_col = f"{prefix}_output{i}_recall"
            f1_col = f"{prefix}_output{i}_f1"

            if all(col in df.columns for col in [precision_col, recall_col, f1_col]):
                avg_precision = df[precision_col].mean(skipna=True)
                avg_recall = df[recall_col].mean(skipna=True)
                avg_f1 = df[f1_col].mean(skipna=True)

                if not np.isnan(avg_precision) and not np.isnan(avg_recall) and not np.isnan(avg_f1):
                    detailed_model_output_results.append({
                        'model': prefix,
                        'output_number': f'output{i}',
                        'output_name': output_name_map.get(f'output{i}', f'output{i}'),
                        'avg_precision': avg_precision,
                        'avg_recall': avg_recall,
                        'avg_f1': avg_f1
                    })

                    model_metrics['avg_precision'] += avg_precision
                    model_metrics['avg_recall'] += avg_recall
                    model_metrics['avg_f1'] += avg_f1
                    model_metrics['output_count'] += 1

        if model_metrics['output_count'] > 0:
            for metric in ['avg_precision', 'avg_recall', 'avg_f1']:
                model_metrics[metric] /= model_metrics['output_count']
            del model_metrics['output_count']
            model_results.append(model_metrics)

    # === Save sorted outputs (by F1) ===
    outputs_df = pd.DataFrame([r for r in results if r['output_number'] != 'overall'])
    outputs_df_sorted = outputs_df.sort_values(by='avg_f1', ascending=False)
    outputs_df_sorted.to_csv(f"{output_file_base}\\outputs_sorted.csv", index=False, float_format='%.6f')
    print(f"已保存按F1排序的输出到 {output_file_base}\\outputs_sorted.csv")

    # === Save overall model averages ===
    model_df = pd.DataFrame(model_results)
    model_df.to_csv(f"{output_file_base}\\models_average.csv", index=False, float_format='%.6f')
    print(f"已保存模型平均值到 {output_file_base}\\models_average.csv")

    # === Save detailed per-model per-output sorted by F1 ===
    detailed_df = pd.DataFrame(detailed_model_output_results)
    detailed_df_sorted = detailed_df.sort_values(by='avg_f1', ascending=False)
    detailed_df_sorted.to_csv(f"{output_file_base}\\detailed_sorted.csv", index=False, float_format='%.6f')
    print(f"已保存按F1排序的详细表到 {output_file_base}\\detailed_sorted.csv")

except FileNotFoundError:
    print(f"错误: 输入文件 '{input_file}' 未找到。")
except Exception as e:
    print(f"处理过程中发生错误: {e}")
    import traceback
    traceback.print_exc()


成功读取CSV文件，共有29行
检测到的模型: ['distilbert', 'roberta', 'xlm']
已保存按F1排序的输出到 D:\liulanqi1\MindMap-main\MindMap-main\实验结果\outputs_sorted.csv
已保存模型平均值到 D:\liulanqi1\MindMap-main\MindMap-main\实验结果\models_average.csv
已保存按F1排序的详细表到 D:\liulanqi1\MindMap-main\MindMap-main\实验结果\detailed_sorted.csv
