In [None]:
import os
import json

In [None]:
import copy

In [None]:
import numpy as np

In [None]:
from rouge import Rouge 
from nltk.translate.bleu_score import sentence_bleu
from nltk import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate import meteor
import nltk

In [None]:
import pandas as pd

# Load results

In [None]:
# JSON file of inference results that the next cell loads into `results`.
RESULT_PATH = "output_inference/deep_api_all_100/results_computed.json"

In [None]:
# JSON text is UTF-8 by specification, so do not depend on the platform's
# default text encoding when reading it.
with open(RESULT_PATH, 'r', encoding='utf-8') as f:
    results = json.load(f)

# Preview the first few records only; echoing the whole list floods the cell
# output and bloats the saved notebook.
results[:3]

# compute largest common subset

In [None]:
# Single scorer instance shared by every calculate_rouge call.
rouge = Rouge()


def calculate_rouge(prediction, ground_truth):
    """Score ``prediction`` against ``ground_truth`` with the shared ``rouge`` scorer.

    Both arguments are forwarded unchanged to ``rouge.get_scores`` (prediction
    first, ground truth second) and whatever it returns is handed back as-is.
    ``compute_score`` in this notebook reads the result as
    ``score[0]['rouge-1']['r']``.
    """
    return rouge.get_scores(prediction, ground_truth)

# Smoothing helper shared by every BLEU computation in this notebook.
chencherry = SmoothingFunction()


def calculate_bleu(prediction, ground_truth):
    """Return the smoothed sentence-level BLEU score of ``prediction``.

    ``prediction`` is a string and is split on whitespace here.
    ``ground_truth`` is passed through untouched as the only reference.
    The n-gram weights are ``(1, 0)`` and smoothing is ``chencherry.method1``.
    """
    hypothesis_tokens = prediction.split()
    return sentence_bleu(
        [ground_truth],
        hypothesis_tokens,
        weights=(1, 0),
        smoothing_function=chencherry.method1,
    )

def calculate_meteor(prediction, ground_truth):
    """Return the METEOR score of ``prediction``, rounded to 4 decimals.

    ``prediction`` is a string and is split on whitespace here.
    ``ground_truth`` is passed through untouched as the only reference.
    """
    hypothesis_tokens = prediction.split()
    raw_score = meteor([ground_truth], hypothesis_tokens)
    return round(raw_score, 4)

def get_n_common_elements(result_dict):
    """Count, for each prediction, the distinct tokens it shares with the ground truth.

    Parameters
    ----------
    result_dict : dict
        Record with a ``'ground_truth'`` collection of tokens and a ``'preds'``
        list of prediction strings. The record is only read, never modified,
        so no defensive copy is taken.

    Returns
    -------
    list of int
        One count per non-empty prediction, in the original prediction order.
        Each count is the size of the set intersection between the
        ground-truth tokens and the whitespace-split prediction tokens, so
        repeated tokens are counted once.

    Raises
    ------
    AssertionError
        If the ground truth is missing/empty, or no non-empty prediction exists.
    """
    ground_truth = result_dict.get('ground_truth', [])
    assert len(ground_truth) > 0, "result_dict has no 'ground_truth' tokens"

    # Empty-string predictions are dropped before counting, so positions in
    # the returned list refer to the remaining predictions only.
    preds = [pred for pred in result_dict.get('preds', []) if pred != '']
    assert len(preds) > 0, "result_dict has no non-empty 'preds'"

    # The reference token set is identical for every prediction: build it once.
    reference_tokens = set(ground_truth)
    return [len(reference_tokens & set(pred.split())) for pred in preds]

def compute_score(result_dict, score_type="rouge"):
    """Score every non-empty prediction of one record against its ground truth.

    Parameters
    ----------
    result_dict : dict
        Record with a ``'ground_truth'`` collection of tokens and a ``'preds'``
        list of prediction strings. The record is only read, never modified,
        so no defensive copy is taken.
    score_type : str, default "rouge"
        ``"rouge"`` (ROUGE-1 recall taken from ``calculate_rouge``),
        ``"bleu"`` (``calculate_bleu``) or ``"meteor"`` (``calculate_meteor``).

    Returns
    -------
    list
        One score per non-empty prediction, in the original prediction order.

    Raises
    ------
    AssertionError
        If the ground truth is missing/empty, or no non-empty prediction exists.
    ValueError
        If ``score_type`` is not one of the supported names.
    """
    ground_truth = result_dict.get('ground_truth', [])
    assert len(ground_truth) > 0, "result_dict has no 'ground_truth' tokens"

    # Empty-string predictions are dropped before scoring.
    preds = [pred for pred in result_dict.get('preds', []) if pred != '']
    assert len(preds) > 0, "result_dict has no non-empty 'preds'"

    # The metric is chosen once per call rather than once per prediction.
    if score_type == "rouge":
        # calculate_rouge receives the reference as one space-joined string;
        # the joined text is the same for every prediction, so build it once.
        reference_text = ' '.join(ground_truth)
        return [
            calculate_rouge(pred, reference_text)[0]['rouge-1']['r']
            for pred in preds
        ]
    if score_type == "bleu":
        return [calculate_bleu(pred, ground_truth) for pred in preds]
    if score_type == "meteor":
        return [calculate_meteor(pred, ground_truth) for pred in preds]
    raise ValueError("Invalid score type")

In [None]:
# Token-overlap counts for the first loaded record, kept for the checks below.
test_n_common_elements = get_n_common_elements(results[0])

In [None]:
def compute_relevancy_scores(n_common_elements, len_ground_truth):
    """Divide every overlap count by the ground-truth length.

    Returns a new list with one quotient per entry of ``n_common_elements``,
    in the same order. ``len_ground_truth`` must be non-zero whenever there is
    at least one count to divide.
    """
    relevancy = []
    for count in n_common_elements:
        relevancy.append(count / len_ground_truth)
    return relevancy

In [None]:
# Normalise the first record's overlap counts by its ground-truth length,
# then display the resulting relevancy scores.
test_relevancy_scores = compute_relevancy_scores(test_n_common_elements, len(results[0].get('ground_truth')))
test_relevancy_scores

In [None]:
import math

In [None]:
# def compute_dcg(relevancy_scores):
#     dcg = []
#     for idx, val in enumerate(relevancy_scores): 
#         # relevance score
#         numerator = math.pow(2, val)-1
#         # numerator = val
#         # add 2 because python 0-index
#         denominator =  np.log2(idx + 2) 
#         score = numerator/denominator
#         dcg.append(score)
#     return sum(dcg)
def compute_dcg(relevance, alternate=True):
    """Discounted cumulative gain of a ranked list of relevance values.

    Parameters
    ----------
    relevance : sequence of numbers or None
        Relevance of each result, best-ranked first.
    alternate : bool, default True
        When true, use the exponential-gain form
        ``sum((2**rel_i - 1) / log2(i + 1))`` over ranks ``i = 1..p``.
        When false, use ``rel_1 + sum(rel_i / log2(i))`` over ranks ``i = 2..p``.

    Returns
    -------
    float
        The DCG value; ``0.0`` when ``relevance`` is ``None`` or empty.
    """
    if relevance is None or len(relevance) < 1:
        return 0.0

    gains = np.asarray(relevance)
    n_items = len(gains)

    if not alternate:
        # Rank 1 is undiscounted; ranks 2..p are divided by log2(rank).
        discounts = np.log2(range(2, n_items + 1))
        return gains[0] + (gains[1:] / discounts).sum()

    # Exponential gain puts stronger emphasis on highly relevant results;
    # every rank i (1-based) is discounted by log2(i + 1).
    positions = np.asarray(range(1, n_items + 1))
    discounts = np.log2(positions + 1)
    return ((np.power(2, gains) - 1) / discounts).sum()

In [None]:
# DCG (default alternate formulation) of the first record's relevancy scores.
compute_dcg(test_relevancy_scores)

In [None]:
# Sanity check of the non-alternate formula on a small hand-written relevance list.
compute_dcg([3,2,3,0,1,2], False)

In [None]:
def lcs_length(a, b):
    """Return the length of the longest common subsequence of ``a`` and ``b``.

    Elements are compared with ``==``. Either input being empty gives ``0``.
    """
    # grid[i][j] holds the LCS length of the first i items of `a` and the
    # first j items of `b`; row 0 and column 0 stand for the empty prefix.
    grid = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for row, item_a in enumerate(a, start=1):
        above = grid[row - 1]
        current = grid[row]
        for col, item_b in enumerate(b, start=1):
            if item_a == item_b:
                # A match extends the best subsequence of both shorter prefixes.
                current[col] = above[col - 1] + 1
            else:
                # Otherwise carry over the better result of dropping one item.
                current[col] = max(current[col - 1], above[col])
    return grid[-1][-1]

In [None]:
# Inspect the ground-truth entry of the second record.
results[1]['ground_truth']

In [None]:
# Whitespace-split tokens of the second prediction of the second record.
results[1]['preds'][1].split()

In [None]:
# LCS length between the first record's ground truth and the tokens of its last prediction.
lcs_length(results[0]['ground_truth'], results[0]['preds'][-1].split())

In [None]:
def get_common_subseq(result_dict):
    """Longest-common-subsequence length between each prediction and the ground truth.

    Parameters
    ----------
    result_dict : dict
        Record with a ``'ground_truth'`` sequence of tokens and a ``'preds'``
        list of prediction strings. The record is only read, never modified,
        so no defensive copy is taken.

    Returns
    -------
    list of int
        One ``lcs_length`` value per non-empty prediction, in the original
        prediction order. Each prediction is split on whitespace first.

    Raises
    ------
    AssertionError
        If the ground truth is missing/empty, or no non-empty prediction exists.
    """
    ground_truth = result_dict.get('ground_truth', [])
    assert len(ground_truth) > 0, "result_dict has no 'ground_truth' tokens"

    # Empty-string predictions are dropped before comparison.
    preds = [pred for pred in result_dict.get('preds', []) if pred != '']
    assert len(preds) > 0, "result_dict has no non-empty 'preds'"

    return [lcs_length(pred.split(), ground_truth) for pred in preds]

In [None]:
# Normalise by the record's actual ground-truth length rather than a hand-typed constant.
compute_relevancy_scores(get_common_subseq(results[1]), len(results[1]['ground_truth']))

In [None]:
# DCG of the second record's LCS-based relevancy scores, normalised by the
# record's actual ground-truth length rather than a hand-typed constant.
compute_dcg(compute_relevancy_scores(get_common_subseq(results[1]), len(results[1]['ground_truth'])))

In [None]:
def ndcg(results, topk, score_type):
    """Mean normalised DCG over ``results`` for the first ``topk`` predictions.

    Parameters
    ----------
    results : iterable of dict
        Records with a ``'ground_truth'`` token list and a ``'preds'`` list.
    topk : int
        Number of leading (non-empty) predictions per record to score; >= 1.
    score_type : str
        ``"common-intersection"`` or ``"common-subsequence"`` (overlap count
        divided by the ground-truth length), or ``"rouge"``, ``"bleu"``,
        ``"meteor"`` (relevance taken from ``compute_score``).

    Returns
    -------
    float
        Average over all records of ``DCG(relevance[:topk]) / IDCG``.

    Raises
    ------
    ValueError
        If ``score_type`` is unknown, ``topk`` is below 1, or ``results`` is empty.
    AssertionError
        If a record has no ground truth (or, via the helpers, no usable prediction).

    Notes
    -----
    The normaliser is the DCG of ``topk`` results that all have relevance 1.
    It does not depend on the record, so a record with fewer than ``topk``
    usable predictions cannot reach a score of 1.0.
    """
    supported = ("common-intersection", "common-subsequence",
                 "rouge", "bleu", "meteor")
    # Reject bad arguments up front; an unknown score type previously fell
    # through every branch and failed on an unassigned local variable.
    if score_type not in supported:
        raise ValueError(
            f"Invalid score type: {score_type!r}; expected one of {supported}")
    if topk < 1:
        raise ValueError(f"topk must be at least 1, got {topk!r}")

    # The ideal DCG is the same for every record: compute it once.
    idcg_score = compute_dcg([1] * topk)

    ndcg_list = []
    for result_dict in results:
        len_ground_truth = len(result_dict.get('ground_truth', []))
        assert len_ground_truth > 0, "result_dict has no 'ground_truth' tokens"

        if score_type == "common-intersection":
            counts = get_n_common_elements(result_dict)[:topk]
            relevancy_scores = compute_relevancy_scores(counts, len_ground_truth)
        elif score_type == "common-subsequence":
            counts = get_common_subseq(result_dict)[:topk]
            relevancy_scores = compute_relevancy_scores(counts, len_ground_truth)
        else:
            # "rouge", "bleu" and "meteor" are already usable as relevance.
            relevancy_scores = compute_score(result_dict, score_type=score_type)[:topk]

        ndcg_list.append(compute_dcg(relevancy_scores) / idcg_score)

    if not ndcg_list:
        raise ValueError("results is empty; cannot average NDCG over no records")
    return sum(ndcg_list) / len(ndcg_list)

In [None]:
def ndcg_topk(results, topk_list, score_type):
    """Evaluate ``ndcg`` at several cut-offs.

    Returns a dict that maps each ``k`` in ``topk_list`` to
    ``ndcg(results, k, score_type)`` rounded to 3 decimals.
    """
    return {k: round(ndcg(results, k, score_type), 3) for k in topk_list}

In [None]:
# NDCG at cut-offs 1, 5 and 10 with longest-common-subsequence relevance.
topk_list = [1, 5, 10]
ndcg_topk(results, topk_list, "common-subsequence")

In [None]:
# NDCG at cut-offs 1, 5 and 10 with BLEU as the relevance score.
topk_list = [1, 5, 10]
ndcg_topk(results, topk_list, "bleu")

# Compute for all models

In [None]:
def print_scores(dcg_dict):
    """Print a nested ``{model: {score_type: {k: score}}}`` mapping as text.

    For each model: its name, then for each score type the type name, a dashed
    rule, one ``k_scoretype:score`` line per cut-off and a blank line.
    Three blank lines follow each model.
    """
    rule = "-------"*5
    for model_name, per_type in dcg_dict.items():
        print(model_name)
        for score_type, by_cutoff in per_type.items():
            print(score_type)
            print(rule)
            for k, score in by_cutoff.items():
                print(f"{k}_{score_type}:{score}")
            print()
        # Two newlines plus print's own terminator give three blank lines.
        print("\n\n")

In [None]:
# Results file to evaluate for each model; the keys become the model labels
# used in `ndcg_dict` by the evaluation loop.
model_to_path = {
    'deep_api': 'output_inference/deep_api_all_100/results_ndcg.json',
    'codebert': 'output_inference/codebert2codebert_all_100/results_ndcg.json',
    'codet5': 'output_inference/codet5_all_100/results_ndcg.json',
    'plbart': 'output_inference/plbart_all_100/results_ndcg.json'
}

In [None]:
ndcg_dict = {}
topk = (1, 5, 10)
score_types = ("rouge", "bleu", "meteor", "common-subsequence")

# Models form the outer loop so each results file is read and parsed once,
# not once per score type. The records go into `model_results` so the
# `results` list loaded earlier in the notebook is not silently replaced.
for model, path in model_to_path.items():
    with open(path, 'r', encoding='utf-8') as f:
        model_results = json.load(f)

    # {score_type: {k: mean NDCG@k}} for this model, in `score_types` order.
    ndcg_dict[model] = {
        score_type: ndcg_topk(model_results, topk, score_type)
        for score_type in score_types
    }

In [None]:
def generate_df(ndcg_dict, topk=(1, 5, 10)):
    """Flatten nested NDCG results into a table with one row per (model, score type).

    Parameters
    ----------
    ndcg_dict : dict
        ``{model: {score_type: {k: score}}}`` mapping.
    topk : iterable of int, default (1, 5, 10)
        Cut-offs to export. Each ``k`` must be a key of every innermost dict
        and yields a column named ``k_<k>``, in the given order.

    Returns
    -------
    pandas.DataFrame
        Columns ``model``, ``score_type`` and then one ``k_<k>`` column per cut-off.

    Raises
    ------
    KeyError
        If a requested cut-off is missing for some model/score type.
    """
    cutoffs = tuple(topk)

    columns = {'model': [], 'score_type': []}
    for k in cutoffs:
        columns[f'k_{k}'] = []

    for model, score_types in ndcg_dict.items():
        for score_type, dcg_scores in score_types.items():
            # Look up every cut-off before appending anything, so a missing
            # key cannot leave the columns with unequal lengths.
            row_scores = [dcg_scores[k] for k in cutoffs]
            columns['model'].append(model)
            columns['score_type'].append(score_type)
            for k, value in zip(cutoffs, row_scores):
                columns[f'k_{k}'].append(value)

    return pd.DataFrame(columns)
            

In [None]:
# Flatten the nested ndcg_dict into one table row per (model, score_type) pair.
df = generate_df(ndcg_dict)

In [None]:
# Write the table to results_ndcg.csv without the index column.
df.to_csv("results_ndcg.csv", index=False)

In [None]:
# Display the resulting table.
df