In [1]:
# coding=utf-8
import json
import sys
import os
from nltk.translate.bleu_score import corpus_bleu,sentence_bleu
from rouge import Rouge
from sentence_transformers import SentenceTransformer, util
sys.setrecursionlimit(2000)

class DatasizeError(Exception):
    def __init__(self, message) :
        super().__init__(message)
        self.message=message

class SampleError(Exception):
    def __init__(self, message) :
        super().__init__(message)
        self.message=message

class CaseidError(Exception):
    def __init__(self, message) :
        super().__init__(message)
        self.message=message

error_msg={
    1: "Wrong data size",
    2: "Wrong sample format",
    3: "Wrong case id"
}

def dump_2_json(info, path):
    with open(path, 'w') as output_json_file:
        json.dump(info, output_json_file)

def report_error_msg(detail, showMsg, out_p):
    error_dict=dict()
    error_dict['errorDetail']=detail
    error_dict['errorMsg']=showMsg
    error_dict['score']=0
    error_dict['scoreJson']={}
    error_dict['success']=False
    dump_2_json(error_dict,out_p)

def report_score(score, out_p):
    result = dict()
    result['success']=True
    total_score = score['Edit_acc']['final_score'] * 0.2 + score['portability']['final_score'] * 0.35 + score['locality']['final_score']  * 0.35 + score['fluency'] * 0.1
    result['score'] = total_score
    result['scoreJson'] = {'score': total_score, 'Edit_acc':score['Edit_acc']['final_score'], 'portability':score['portability']['final_score'], 'locality':score['locality']['final_score'], 'fluency':score['fluency']}
    print(result['scoreJson'])
    dump_2_json(result,out_p)

def sample_format(sample_list):
    tag=True
    for x in sample_list:                                                          
        list1 = x.keys()
        # list2 = x['pre'].keys()
        list3 = x['requested_rewrite'].keys()
        list4 = x['post'].keys()
        # if(list(list1)!=['pre', 'case_id', 'requested_rewrite', 'post']):
        if(list(list1)!=['case_id', 'requested_rewrite', 'post']):
            tag=False
            break
        # elif(list(list2)!=['rewrite_ans','rephrase_ans','portability_ans'] and list(list2)!=['rewrite_ans','rephrase_ans','locality_ans','portability_ans']):
        #     tag=False
        #     break
        elif('target_new' not in list3 or 'portability' not in list3 or 'locality' not in list3):
            tag=False
            break
        # fluency_new
        # elif(list(list4)!=['rewrite_ans','rephrase_ans','locality_ans','portability_ans','fluency', 'fluency_new'] and \
        #       list(list4)!=['rewrite_ans','rephrase_ans','portability_ans','fluency', 'fluency_new']) or \
        #       (list(list4)!=['rewrite_ans','rephrase_ans','locality_ans','portability_ans','fluency'] and \
        #       list(list4)!=['rewrite_ans','rephrase_ans','portability_ans','fluency']):
        #     tag=False
        #     break  
    return tag

def test_case_id(sample_list):
    tag =True
    for x in range(len(sample_list)-1):
        if(sample_list[x+1]['case_id']!=sample_list[x]['case_id']+1):
            tag = False
            break
    return tag

def check_format(submit_p):
    with open(submit_p, 'r',encoding='utf-8') as file:
        if 'log' in submit_p:
            lines = file.readlines()
            submit_file = []
            for line in lines:
                submit_file.append(json.loads(line))
        else:
            submit_file=json.load(file)
    # if len(submit_file)!=700:
    #     raise DatasizeError("Wrong data size")
    if (not sample_format(submit_file)):
        raise SampleError("Wrong sample format")
    if (not test_case_id(submit_file)):
        raise CaseidError("Wrong case id")

def compute_acc(answers,outputs):
    # model_path = './paraphrase-multilingual-MiniLM-L12-v2'
    model_path = '/share/huggingface/paraphrase-multilingual-MiniLM-L12-v2'
    bleu_scores = []
    rouge1s=[]
    rouge2s=[]
    rougels=[]
    rouge = Rouge()
    for an,ou in zip(answers,outputs):
        score = sentence_bleu([an], ou)
        bleu_scores.append(score)
        scores = rouge.get_scores(ou,an)
        rouge1s.append(scores[0]['rouge-1']['r'])
        rouge2s.append(scores[0]['rouge-2']['r'])
        rougels.append(scores[0]['rouge-l']['r'])

    temp_metrics = {}
    temp_metrics['BLEU SCORE'] = sum(bleu_scores) / len(bleu_scores)
    temp_metrics['ROUGE-1'] = sum(rouge1s) / len(rouge1s)
    temp_metrics['ROUGE-2'] = sum(rouge2s) / len(rouge2s)
    temp_metrics['ROUGE-L'] = sum(rougels) / len(rougels)

    model = SentenceTransformer(model_path, device="cpu")

    embeddings1 = model.encode(answers, convert_to_tensor=True)
    embeddings2 = model.encode(outputs, convert_to_tensor=True)

    # Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    temp_metrics['Bert Score'] = cosine_scores.diagonal().mean().item()
    temp_metrics['final_score'] = (temp_metrics['ROUGE-L']+temp_metrics['Bert Score'])/2
    temp_metrics['final_score'] = temp_metrics['final_score']*100
    
    return temp_metrics

def eval_score(result_path):
    with open(result_path, 'r', encoding='utf-8') as file:
        if 'log' in result_path:
            lines = file.readlines()
            print(f'data size: {len(lines)}')
            data = []
            for line in lines:
                data.append(json.loads(line))
        else:
            data=json.load(file)
    metrics = {}

    #evaluate Edit_acc
    rewrite_answer = [i['requested_rewrite']['target_new'] for i in data]
    rewrite_outputs = [i['post']['rewrite_ans'] for i in data]
    metrics['Edit_acc'] = compute_acc(rewrite_answer,rewrite_outputs)

    #evaluate portability
    portability_answer = []
    portability_outputs = []
    for item in data:
        for an in item['requested_rewrite']['portability']['por_hop']['ground_truth']:
            portability_answer.append(an)
        for ou in item['post']['portability_ans']:
            portability_outputs.append(ou)
    metrics['portability'] = compute_acc(portability_answer,portability_outputs)

    #evaluate locality
    locality_answer = []
    locality_outputs = []
    for item in data:
        if ('locality_ans' not in item['post'].keys() or len(item['requested_rewrite']['locality']['loc_hop']['prompt'])==0):
            continue
        for an in item['requested_rewrite']['locality']['loc_hop']['ground_truth']:
            locality_answer.append(an)
        for ou in item['post']['locality_ans']:
            locality_outputs.append(ou)
    if len(locality_answer)!=0 and len(locality_outputs)!=0:
        metrics['locality'] = compute_acc(locality_answer,locality_outputs)
    else:
        metrics['locality'] = {'final_score': 0}

     #evaluate fluency
    fluencys = [i['post']['fluency']['ngram_entropy'] for i in data]
    metrics['fluency'] = sum(fluencys) / len(fluencys) *10

    return metrics

In [2]:
root_dir = '../ccks2024_output/type5_70/result'

file_names = os.listdir(root_dir)
print(file_names)

for file in file_names:
    if '_tt' not in file:
        continue
    try:
        submit_path = os.path.join(root_dir, file)
        out_path = os.path.join('./metrics_results', file)
        check_format(submit_path)
        score = eval_score(submit_path)
        report_score(score, out_path)
        print(file)
    except Exception as e:
        print(e)

['0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.w1-knb_dict-mean-bs1_100_p99.75_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0.json', '0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0.json', '0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt15.json', '0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.w1-knb_dict-mean-bs1_100_p99.55_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0.json', '0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt10.json', '0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.w1-knb_dict-mean-bs1_100_p99.55_rsTrue_a1_pd0.1_bias_none_t_loss0.001_wd0.json', '0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.05_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', '0,70_type5_70-Qwen-1_8B-Ch

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

{'score': 26.651116374222767, 'Edit_acc': 28.541632039206366, 'portability': 23.22760969400406, 'locality': 25.80854594707489, 'fluency': 37.801354920038605}
0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt15.json
{'score': 25.246312032951554, 'Edit_acc': 29.500752772603718, 'portability': 22.24491387605667, 'locality': 23.17572832107544, 'fluency': 34.48936709434574}
0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt10.json
{'score': 27.435813945075687, 'Edit_acc': 35.37228115967342, 'portability': 25.351692807106748, 'locality': 22.471266984939575, 'fluency': 36.233217859247894}
0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.05_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json
{'score': 26.856243665946177, 'Edit_acc': 32.72997753960746, 'portability': 23.613665997982025, 'locality'

{'score': 27.435813945075687, 'Edit_acc': 35.37228115967342, 'portability': 25.351692807106748, 'locality': 22.471266984939575, 'fluency': 36.233217859247894}
0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.05_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json

0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt4.json
{'Edit_acc': {'BLEU SCORE': 0.018000606663713756, 'ROUGE-1': 0.014285714285714285, 'ROUGE-2': 0.0, 'ROUGE-L': 0.014285714285714285, 'Bert Score': 0.633683443069458, 'final_score': 32.39845786775861}, 'portability': {'BLEU SCORE': 0.0028442105782886216, 'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-L': 0.0, 'Bert Score': 0.45265546441078186, 'final_score': 22.632773220539093}, 'locality': {'BLEU SCORE': 0.0006666877137198298, 'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-L': 0.0, 'Bert Score': 0.47480714321136475, 'final_score': 23.740357160568237}, 'fluency': 37.723844043301035}

In [3]:
root_dir = '../ccks2024_output/type2_80/result'

file_names = os.listdir(root_dir)
print(file_names)

for file in file_names:
    if '_tt' not in file:
        continue
    try:
        submit_path = os.path.join(root_dir, file)
        out_path = os.path.join('./metrics_results', file)
        check_format(submit_path)
        score = eval_score(submit_path)
        report_score(score, out_path)
        print(file)
    except Exception as e:
        print(e)

['0,80_type2_80-Qwen-1_8B-Chat-CKnowEdit-0-24-mlp.c_proj-knb_dict-mean-bs10_120_p99.25_rsTrue_a1_pd0.1_bias_knb_only_t_loss0.001.json', 'type2_80-Qwen-1_8B-Chat-CKnowEdit-0-24-mlp.c_proj-knb_dict-mean-bs20_120_p99.05_rsTrue_a1_pd0_bias_knb_only_t_loss0.001.json', 'type2_80-Qwen-1_8B-Chat-CKnowEdit-0-24-mlp.c_proj-knb_dict-mean-bs20_120_p99.95_rsTrue_a1_pd0_bias_knb_only_t_loss0.001.json', 'type2_80-Qwen-1_8B-Chat-CKnowEdit-0-24-mlp.c_proj-knb_dict-max-bs20_120_p95.0_rsTrue_a1_pd0_bias_knb_only_t_loss0.001.json', '0,80_type2_80-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.05_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', 'type2_80-Qwen-1_8B-Chat-CKnowEdit-0-24-mlp.c_proj-knb_dict-max-bs20_120_p90.0_rsTrue_a1_pd0_bias_knb_only_t_loss0.001.json', '0,80_type2_80-Qwen-1_8B-Chat-CKnowEdit-0-24-mlp.c_proj-knb_dict-mean-bs10_120_p90.0_rsTrue_a1_pd0.1_bias_knb_only_t_loss0.001.json', '0,80_type2_80-Qwen-1_8B-Chat-CKnowEdit-0-24-mlp.c_proj-knb_dict-mean-bs10_120_

In [4]:
root_dir = '../ccks2024_output/type3_40/result'

file_names = os.listdir(root_dir)
print(file_names)

for file in file_names:
    if '_tt' not in file:
        continue
    try:
        submit_path = os.path.join(root_dir, file)
        out_path = os.path.join('./metrics_results', file)
        check_format(submit_path)
        score = eval_score(submit_path)
        report_score(score, out_path)
        print(file)
    except Exception as e:
        print(e)

['0,40_type3_40-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.95_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', '0,40_type3_40-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.05_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt2.json', '0,40_type3_40-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt2.json', '0,40_type3_40-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.05_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', '0,40_type3_40-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', '0,40_type3_40-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.65_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json']
Hypothesis is empty.
{'score': 19.682116829905024, 'Edit_acc': 32.19741880893707, 'portability': 25.776910135544927, 'locality': 0, 'fluency': 

{'score': 20.610242258505274, 'Edit_acc': 35.02366900444031, 'portability': 26.348072290420532, 'locality': 0, 'fluency': 43.83683155970026}
0,40_type3_40-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.65_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json

In [5]:
root_dir = '../ccks2024_output/type4_50/result'

file_names = os.listdir(root_dir)
print(file_names)

for file in file_names:
    if '_tt' not in file:
        continue
    try:
        submit_path = os.path.join(root_dir, file)
        out_path = os.path.join('./metrics_results', file)
        check_format(submit_path)
        score = eval_score(submit_path)
        report_score(score, out_path)
        print(file)
    except Exception as e:
        print(e)

['0,50_type4_50-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt2.json', '0,50_type4_50-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.05_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', '0,50_type4_50-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.05_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt2.json', '0,50_type4_50-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.95_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', '0,50_type4_50-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.65_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', '0,50_type4_50-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.95_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0_tt2.json', '0,50_type4_50-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json', '

{'score': 40.22284491820564, 'Edit_acc': 49.98349289099375, 'portability': 36.114828250347045, 'locality': 37.92186390470575, 'fluency': 43.133040857384124}
0,50_type4_50-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.65_rsTrue_a1_pd0.1_bias_none_t_loss0.01_wd0_tt2.json

{'Edit_acc': {'BLEU SCORE': 0.04142679868537507, 'ROUGE-1': 0.02857142857142857, 'ROUGE-2': 0.0, 'ROUGE-L': 0.02857142857142857, 'Bert Score': 0.7325635552406311, 'final_score': 38.056749190602986}, 'portability': {'BLEU SCORE': 0.0009528046358681467, 'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-L': 0.0, 'Bert Score': 0.48874256014823914, 'final_score': 24.437128007411957}, 'locality': {'BLEU SCORE': 0.0050658566077895545, 'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-L': 0.0, 'Bert Score': 0.5525553226470947, 'final_score': 27.627766132354736}, 'fluency': 37.969656644401056}
0,70_type5_70-Qwen-1_8B-Chat-CKnowEdit-layer-0-24-mlp.c_proj-knb_dict-mean-bs1_100_p99.35_rsTrue_a1_pd0.1_bias_none_t_loss0.1_wd0.json