# Evaluation of predicted summaries against reference (true) summaries

### Imports

In [1]:
import torch
from pathlib import Path
from tqdm import tqdm
from enum import Enum
from evaluate import load  # https://huggingface.co/evaluate-metric
import pandas as pd
import json
import requests
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
from sentence_transformers import util
from huggingface_hub import configure_http_backend
def backend_factory() -> requests.Session:
    """Build the `requests.Session` used by `huggingface_hub` for HTTP calls.

    The HTTPS proxy URL (which may embed a username and password) is read from the
    `HTTPS_PROXY` / `https_proxy` environment variable instead of being hard-coded,
    so that credentials are never stored in the notebook.

    Returns:
        requests.Session: a fresh session; its `proxies` are set only when a proxy
        URL is configured in the environment.
    """
    import os  # local import: keeps this cell self-contained

    session = requests.Session()
    proxy_url = os.environ.get("HTTPS_PROXY") or os.environ.get("https_proxy")
    if proxy_url:
        session.proxies = {"https": proxy_url}
    # NOTE(review): proxy credentials that were previously committed in this cell
    # should be treated as leaked and rotated.
    return session


# Register the factory so huggingface_hub creates its sessions through it.
configure_http_backend(backend_factory=backend_factory)

2024-05-28 13:27:45.696000: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-28 13:27:45.749800: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-28 13:27:45.749859: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-28 13:27:45.751371: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-28 13:27:45.761520: I tensorflow/core/platform/cpu_feature_guar

In [None]:
# Load the tokenizer and encoder stored in the local "../models/side" directory
# and move the encoder to the first CUDA device.
# NOTE(review): the same three objects are created again in the cell that defines
# `compute_side`; presumably only one of the two loads is needed — confirm.
side_model_dir = Path("../models/side")
side_tokenizer = AutoTokenizer.from_pretrained(side_model_dir)
side_model = AutoModel.from_pretrained(side_model_dir).to("cuda:0")

### Define constants

In [4]:
# Directory layout: every model writes its predictions below DATA_DIR / "predicted".
DATA_DIR = Path("../data")
PREDICTED_DATA_DIR = DATA_DIR.joinpath("predicted")

# Hugging Face identifiers of the summarization models under evaluation.
model_names = [
    "SEBIS/code_trans_t5_large_source_code_summarization_python_multitask_finetune",
    "SEBIS/code_trans_t5_large_code_documentation_generation_python_multitask_finetune",
    "Salesforce/codet5-base-multi-sum",
    "Paul-B98/codet5p_220m_py_sum",
    "lintang/pile-t5-large-codexglue",
    "deepseek-ai/deepseek-coder-6.7b-instruct",
    "gradientai/Llama-3-8B-Instruct-Gradient-1048k",
]

# The model evaluated in this run (currently the last entry of the list).
MODEL_NAME = model_names[-1]
# Results live in a folder named after the part of the identifier following the last "/".
MODEL_RESULTS_DIR = PREDICTED_DATA_DIR / MODEL_NAME.rpartition("/")[2]

# Selectors used to build the prediction / metrics file names.
LEVEL = "method"
DATASET = "mce"

### Print all results

In [5]:
class Metrics(Enum):
    """Metric labels; each member's value is used as a DataFrame column name.

    Iteration order is the declaration order below, which determines the order in
    which code iterating over `Metrics` visits the metrics.
    """

    # Scores comparing the predicted summary with the reference summary.
    ROUGE = "ROUGE-L"
    BLEU = "BLEU-4"
    METEOR = "METEOR"
    BERTScore = "BERTScore"
    BLEURT = "BLEURT"
    # SIDE scores comparing the code with the reference / predicted summary.
    SIDE_TRUE = "SIDE_true"
    SIDE_PRED = "SIDE_pred"

In [None]:
# Gather every "*eval.json" file below PREDICTED_DATA_DIR into one table with a row
# per evaluation result, print it as Markdown and store it as CSV.
records = []
# sorted(): glob order is arbitrary, sorting keeps the resulting table reproducible.
for model_dir in sorted(PREDICTED_DATA_DIR.glob('*')):
    if not model_dir.is_dir():
        continue  # e.g. a previously written results.csv
    for file_path in sorted(model_dir.glob('*eval.json')):
        with file_path.open('r', encoding='utf-8') as file:
            data = json.load(file)

        # "<model dir>/<file name>", built from path components so that it does not
        # depend on the platform's path separator.
        name = file_path.relative_to(PREDICTED_DATA_DIR).as_posix()

        if 'mcsn' in file_path.name:
            # mcsn eval files hold one metrics dict per key ("total" and one per
            # repository); emit one row per key.
            records.extend(
                {**sub_data, 'Name': f"{name}/{key}"} for key, sub_data in data.items()
            )
        else:
            records.append({**data, 'Name': name})

if not records:
    raise ValueError(f"No '*eval.json' files found under {PREDICTED_DATA_DIR}")

# Build the frame once from all records instead of concatenating one-row frames.
combined_df = pd.DataFrame(records).set_index('Name')
markdown_table = combined_df.to_markdown()
print(markdown_table)

combined_df.to_csv(PREDICTED_DATA_DIR / 'results.csv', sep=',', index=True)

### Read data

In [None]:
# Read the prediction file of the selected model / level / dataset.
# The file is parsed as JSON Lines (one record per line) into `df`.
file_path = MODEL_RESULTS_DIR / f"{LEVEL}-level-{DATASET}-pred.jsonl"
print(file_path)
df = pd.read_json(file_path, lines=True)
# Preview the first rows.
df.head()

../data/predicted/code_trans_t5_large_source_code_summarization_python_multitask_finetune/method-level-mcsn-pred.jsonl


Unnamed: 0,repo_name,method_name,method_code,method_summary,pred_summary
0,apache/airflow,HttpHook.run,"def run(self, endpoint, data=None, headers=Non...",Performs the request,Sending a HTTP request to a Pylons service
1,apache/airflow,HttpHook.check_response,"def check_response(self, response):\n t...",Checks the status code and raise an AirflowExc...,Raise AirflowException on non 200 status codes
2,apache/airflow,HttpHook.run_and_check,"def run_and_check(self, session, prepped_reque...",Grabs extra options like timeout and actually ...,Deploying a service in Tenacity
3,apache/airflow,create_session,def create_session():\n session = settings....,Contextmanager that will create and teardown a...,A context manager for SQLAlchemy with a simple...
4,apache/airflow,resetdb,def resetdb():\n from airflow import models...,Clear out the database,Drop tables that exist in the database


In [None]:
# Read the "sample ... few-shot-0" prediction file of the selected model.
# NOTE: this rebinds `file_path` and `df`, replacing whatever the previous read
# cell loaded; presumably only one of the two read cells is meant to be run.
file_path = MODEL_RESULTS_DIR / f"{LEVEL}-level-sample-{DATASET}-few-shot-0-pred.jsonl"
print(file_path)
df = pd.read_json(file_path, lines=True)
# Preview the first rows.
df.head()

../data/predicted/Llama-3-8B-Instruct-Gradient-1048k/method-level-sample-mce-few-shot-0-pred.jsonl


Unnamed: 0,index,class_id,class_code,skeleton,method_code,method_summary,pred_summary
0,0,ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,import logging\nimport datetime\n\nclass Acces...,"def filter(self, request):\n request_ur...",Filter the incoming request based on certain r...,The `filter` method takes in a request object ...
1,1,ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,import logging\nimport datetime\n\nclass Acces...,"def is_start_with(self, request_uri):\n ...",Check if the request URI starts with certain p...,The `is_start_with` method takes a `request_ur...
2,2,ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,import logging\nimport datetime\n\nclass Acces...,"def get_jwt_user(self, request):\n toke...",Get the user information from the JWT token in...,The code retrieves the user from the JWT token...
3,3,ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,import logging\nimport datetime\n\nclass Acces...,"def set_current_user_info_and_log(self, user):...",Set the current user information and log the a...,The code sets the current user information and...
4,4,ClassEval_1_sum,"import math\n\n\nclass AreaCalculator:\n """"...","import math\nclass AreaCalculator:\n """"""\n ...",def calculate_circle_area(self):\n retu...,calculate the area of circle based on self.radius,The Python code defines a method called `calcu...


#### Extract summaries

In [None]:
# Names of the `df` columns holding the reference and the predicted summaries.
true_column_name = 'method_summary'
pred_column_name = 'pred_summary'

In [None]:
# Pull the reference and predicted summaries out of `df` as plain Python lists,
# in row order, for the metrics that score the whole corpus in one call.
true_summaries, pred_summaries = (
    df[column_name].to_list() for column_name in (true_column_name, pred_column_name)
)
assert len(pred_summaries) == len(true_summaries)

### Load metrics

In [None]:
# Create one (initially empty) column per metric; the values are filled in by the
# "Compute metrics" cells.
for metric in Metrics:
    df[metric.value] = None

In [None]:
def compute_side(tokenizer, model, code, summary):
    """Return the cosine similarity between the embeddings of `code` and `summary`.

    Both texts are tokenized together as one two-element batch, passed through
    `model`, mean-pooled over the non-padding tokens, L2-normalized and compared.

    Args:
        tokenizer: tokenizer called with `padding=True, truncation=True,
            return_tensors='pt'`; its result must support `.to(device)`.
        model: encoder module; `model_output[0]` is used as the per-token embeddings.
        code: source code text.
        summary: summary text to compare against `code`.

    Returns:
        float: cosine similarity of the two pooled embeddings.
    """
    def _mean_pooling(model_output, attention_mask):
        # Average the token embeddings, weighting by the attention mask so that
        # padding positions do not contribute.
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp() guards against division by zero for an all-padding row.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    # Put the inputs on the device the model actually lives on, rather than assuming
    # the default CUDA device; avoids a device mismatch when the model was moved to
    # another GPU (or is on the CPU).
    device = next(model.parameters()).device

    pair = [code, summary]
    encoded_input = tokenizer(pair, padding=True, truncation=True, return_tensors='pt').to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = _mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    # Row 0 is the code embedding, row 1 the summary embedding.
    sim = util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1]).item()
    return sim

def compute_side_partial(args):
    """Call `compute_side` with the notebook-level `side_tokenizer` and `side_model`.

    `args` is unpacked positionally into the remaining `code` and `summary`
    parameters, so it must yield exactly those two values in that order.
    """
    return compute_side(side_tokenizer, side_model, *args)

# Load the tokenizer and encoder from the local "../models/side" directory and
# move the encoder to the default CUDA device.
# NOTE(review): these three names are also assigned in a cell near the top of the
# notebook (there with "cuda:0"); running both loads the model twice — presumably
# one of the two cells can be removed, confirm.
side_model_dir = Path("../models/side")
side_tokenizer = AutoTokenizer.from_pretrained(side_model_dir)
side_model = AutoModel.from_pretrained(side_model_dir).to('cuda')

In [None]:
# Load the BLEURT metric with the 'BLEURT-20' configuration via `evaluate.load`.
bleurt = load('bleurt', 'BLEURT-20', module_type="metric")

INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/BLEURT-20/downloads/extracted/cd1c38739d180ae53192201859a058307621534b704c20700072eca17d748c58/BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: /root/.cache/huggingface/metrics/bleurt/BLEURT-20/downloads/extracted/cd1c38739d180ae53192201859a058307621534b704c20700072eca17d748c58/BLEURT-20/sent_piece.model.
INFO:tensorflow:SentencePiece tokenizer cr

2024-05-12 00:38:45.375621: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 11779 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:17:00.0, compute capability: 8.0
2024-05-12 00:38:45.376919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 76523 MB memory:  -> device: 1, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:31:00.0, compute capability: 8.0
2024-05-12 00:38:45.378074: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 76523 MB memory:  -> device: 2, name: NVIDIA A100 80GB PCIe, pci bus id: 0000:b1:00.0, compute capability: 8.0
2024-05-12 00:38:45.379219: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 75813 MB memory:  -> device: 3, name: NVIDIA A100 80GB PCIe, pci bu

INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


In [None]:
# Load the BERTScore metric via `evaluate.load`; the scoring model is chosen later,
# when `compute` is called.
bertscore = load('bertscore')

In [None]:
# Load the BLEU, ROUGE and METEOR metrics via `evaluate.load`.
bleu = load('bleu')
rouge = load('rouge')
meteor = load('meteor')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Compute metrics

In [None]:
# Register `progress_apply` on pandas objects so the row-wise scoring shows a progress bar.
tqdm.pandas()

# Score each method's code against its reference summary, then against its predicted summary.
for side_metric, summary_column in (
    (Metrics.SIDE_TRUE, true_column_name),
    (Metrics.SIDE_PRED, pred_column_name),
):
    code_and_summary = df[['method_code', summary_column]]
    df[side_metric.value] = code_and_summary.progress_apply(compute_side_partial, axis=1)

100%|██████████| 400/400 [00:18<00:00, 21.39it/s]
100%|██████████| 400/400 [00:27<00:00, 14.71it/s]


In [None]:
# Score all predictions against their references in one call; the 'scores' entry
# of the result is stored as the BLEURT column.
bleurt_res = bleurt.compute(predictions=pred_summaries, references=true_summaries)
df[Metrics.BLEURT.value] = bleurt_res['scores']

In [None]:
# Score all predictions against their references with BERTScore, using the
# 'microsoft/deberta-xlarge-mnli' model; the 'f1' entry is stored as the BERTScore column.
# NOTE(review): the device is pinned to "cuda:1", so this cell needs at least two
# GPUs — presumably chosen to keep this model off the first GPU; confirm.
bertscore_res = bertscore.compute(
    predictions=pred_summaries,
    references=true_summaries,
    model_type='microsoft/deberta-xlarge-mnli',
    device=torch.device("cuda:1")
)
df[Metrics.BERTScore.value] = bertscore_res['f1']

In [None]:
# Compute ROUGE-L, BLEU and METEOR separately for every (reference, prediction) pair.
# Scores are collected in lists and assigned as whole columns afterwards, so the
# columns end up with a numeric dtype instead of the object dtype left behind by
# cell-by-cell writes into None-initialised columns.
rouge_scores, bleu_scores, meteor_scores = [], [], []

summary_pairs = zip(df[true_column_name], df[pred_column_name])
for true_summary, pred_summary in tqdm(summary_pairs, total=df.shape[0]):
    try:
        bleu_res = bleu.compute(predictions=[pred_summary], references=[[true_summary]])
        bleu_score = bleu_res['bleu']
    except ZeroDivisionError:
        # The BLEU implementation divides by the token counts, so an empty
        # prediction or reference raises; score such a pair as 0 instead of
        # aborting the whole run.
        bleu_score = 0.0

    rouge_res = rouge.compute(predictions=[pred_summary], references=[true_summary])
    meteor_res = meteor.compute(predictions=[pred_summary], references=[true_summary])

    rouge_scores.append(rouge_res['rougeL'])
    bleu_scores.append(bleu_score)
    meteor_scores.append(meteor_res['meteor'])

df[Metrics.ROUGE.value] = rouge_scores
df[Metrics.BLEU.value] = bleu_scores
df[Metrics.METEOR.value] = meteor_scores

100%|██████████| 400/400 [00:41<00:00,  9.54it/s]


In [None]:
# Preview the first row to check that the metric columns have been filled in.
df.head(1)

Unnamed: 0,index,class_id,class_code,skeleton,method_code,method_summary,pred_summary,ROUGE-L,BLEU-4,METEOR,BERTScore,BLEURT,SIDE_true,SIDE_pred
0,0,ClassEval_0_sum,import logging\nimport datetime\n\n\nclass Acc...,import logging\nimport datetime\n\nclass Acces...,"def filter(self, request):\n request_ur...",Filter the incoming request based on certain r...,The `filter` method takes in a request object ...,0.106383,0.0,0.153846,0.541831,0.500012,0.101102,0.769831


### Save scores

In [None]:
# Persist the per-row scores as JSON Lines.
# The frame is written as-is instead of rebinding `df = df.reset_index()`: that
# rebinding adds another `index` / `level_0` column to `df` every time it runs and
# raises ValueError once `level_0` already exists, so the cell could not be re-run.
# `orient='records'` never writes the index, so no reset is needed.
df.to_json(MODEL_RESULTS_DIR / f"{LEVEL}-level-{DATASET}-pred-metrics.jsonl", orient='records', lines=True)

In [None]:
# Persist the per-row scores of the few-shot sample run as JSON Lines.
# The frame is written as-is instead of rebinding `df = df.reset_index()`: that
# rebinding adds another `index` / `level_0` column to `df` every time it runs and
# raises ValueError once `level_0` already exists (e.g. when the loaded data already
# has an `index` column and a reset has been applied before).
# `orient='records'` never writes the index, so no reset is needed.
df.to_json(MODEL_RESULTS_DIR / f"{LEVEL}-level-sample-{DATASET}-few-shot-0-pred-metrics.jsonl", orient='records', lines=True)

In [None]:
def _mean_per_metric(frame):
    """Return {metric label: mean of that column in `frame`} for every member of Metrics."""
    return {metric.value: frame[metric.value].mean() for metric in Metrics}


if DATASET == "mcsn":
    # mcsn: report the overall averages under "total" plus one entry per repository.
    total_avg_metrics = {"total": _mean_per_metric(df)}
    repo_avg_metrics = {}
    for repo_name in df["repo_name"].unique():
        repo_rows = df.loc[df["repo_name"] == repo_name]
        repo_avg_metrics[repo_name] = _mean_per_metric(repo_rows)
    total_avg_metrics.update(repo_avg_metrics)
else:
    # Any other dataset: a flat mapping of metric label to overall average.
    total_avg_metrics = _mean_per_metric(df)


# Write the averages next to the model's predictions as indented JSON.
eval_output_file_path = MODEL_RESULTS_DIR / f"{LEVEL}-level-sample-{DATASET}-few-shot-0-eval.json"
with eval_output_file_path.open("w") as file:
    json.dump(total_avg_metrics, file, indent=4)