# Regular

## Importing Packages

In [1]:
import torch
import transformers
import difflib
import code_bert_score
import ast
import pandas as pd
import sys
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AutoTokenizer, AutoModelForCausalLM
from codebleu import calc_codebleu
import numpy as np


## Model Initialization

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, 
    torch_dtype=torch.float16,  
    device_map="auto"
).to(device)


## Conversion Functions

In [5]:
def deepseek_generate(prompt, max_tokens=500, temperature=0.8):
    """
    Generates text using DeepSeek Coder with token length handling.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2000, padding=True).to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,  # ✅ Fixing the argument name
            do_sample=True,
            num_return_sequences=1,
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Forward Pass: Code → 3 Natural Language Descriptions
def code_to_explanations(doc, code_snippet):
    """
    Generates 3 natural language explanations from a given code snippet.
    """
    prompt = (
        f"Doc string: {doc}\n"
        f"Code snippet: {code_snippet}\n"
        "Instruction: Provide a concise explanation of what the above doc and code mean. "
        "Generate strictly less than 100 words in total.\n"
        "Answer:\n"
    )
    generated_exp = deepseek_generate(prompt, max_tokens=128, temperature=0.8)
    cleaned_exp = generated_exp.strip().replace(prompt, "").strip()
    cleaned_exp = clean_output(cleaned_exp,"Answer:")
    cleaned_exp = clean_output(cleaned_exp,"</think>")
    return cleaned_exp

# Backward Pass: Each Explanation → Code
def explanation_to_code(description):
    """
    Generates Python code from a cleaned natural language description.
    """
    prompt = (
        "Write only the Python function corresponding to the following description. "
        "Do not provide explanations, comments, markdown, parameter descriptions, or return values. "
        "Ensure that the function name and structure exactly match the description.\n\n"
        f"Description:\n{description}\n\nPython Code:\n"
    )
    cleaned_code = description.strip().replace(prompt, "").strip()
    generated_code = deepseek_generate(prompt, max_tokens=512, temperature=0.8)
    cleaned_code = generated_code.strip().replace(prompt, "").strip()
    cleaned_code = clean_output(cleaned_code,"Answer:")
    cleaned_code = clean_output(cleaned_code,"</think>")
    return cleaned_code
 
def clean_output(text,keyword):
    # keyword = "Answer:"
    index = text.rfind(keyword) 
    if index != -1:
        return text[index + len(keyword):].strip()  
    return text


## Evaluation Functions

In [6]:
# Normalizing Code for Better Comparison
def normalize_code(code):
    """
    Normalize Python code by parsing it into an AST and standardizing the format.
    """
    try:
        return ast.dump(ast.parse(code))
    except SyntaxError:
        return None

# Compute Correct Generation Count
def correct_generation(sim_scores):
    return sum(1 for score in sim_scores if score > 0.7)

# Evaluate Metrics
def evaluate_metrics(original_code, generated_code):
    """
    Evaluates RTC correctness using similarity metrics.
    """
    exact_match = original_code.strip() == generated_code.strip()
    similarity = code_bert_score_func(original_code, generated_code)
    
    return exact_match, similarity

# Pass@1 Computation
def pass_at_1(n: int, c: int, k: int) -> float:
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# CodeBERT Similarity Score
def code_bert_score_func(x: str, x_hat: str) -> float:
    P, R, F1, _ = code_bert_score.score(cands=[x_hat], refs=[x], lang='python')
    return F1.mean().item()

# CodeBLEU Similarity Score
def codebleu_func(x: str, x_hat: str) -> float:
    return calc_codebleu([x], [x_hat], lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)

# Compute RTC
def compute_rtc(sim_scores):
    if not sim_scores:
        return 0.0
    return sum(sim_scores) / len(sim_scores)

# Compute LPass
def evaluate_lpass(codes, original_code):
    return 1 if any(code_bert_score_func(original_code, code) > 0.75 for code in codes) else 0


## Generating for one code snippet

In [7]:
input_csv = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/data/deepseek_single_exp_split/split_part_0.csv"
df = pd.read_csv(input_csv)

# Select Only First Row
row = df.iloc[0]  # Only one row is selected
results = []
original_code = str(row["code"]).strip()
doc = str(row["doc"]).strip()

codes, sim_scores, explanations, matches = [], [], [], []

for _ in range(3):
    explanation = code_to_explanations(doc, original_code)
    generated_code = explanation_to_code(explanation)
    exact_match, similarity_score = evaluate_metrics(original_code, generated_code)

    codes.append(generated_code)
    sim_scores.append(similarity_score)
    explanations.append(explanation)
    matches.append(exact_match)

true_count = correct_generation(sim_scores)
final_rtcpass = compute_rtc(sim_scores)
pass_score = pass_at_1(3, true_count, 1)

results.append({
    "Original Code": original_code,
    "Generated Code1": codes[0],
    "Generated Code2": codes[1],
    "Generated Code3": codes[2],
    "Explanation1": explanations[0],
    "Explanation2": explanations[1],
    "Explanation3": explanations[2],
    "Exact Match": matches,
    "CodeBERTScore": sim_scores,
    "RTCPass": final_rtcpass,
    "Pass@1": pass_score
})

In [8]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Original Code,Generated Code1,Generated Code2,Generated Code3,Explanation1,Explanation2,Explanation3,Exact Match,CodeBERTScore,RTCPass,Pass@1
0,"def writeBoolean(self, n):\n """"""\n ...",def convert_to_one(data):\n if isinstance(d...,def write_bool(bool_value):\n # TODO: Write...,def write_to_stream():\n # Write code here\...,The doc and code are meant to write a boolean ...,The doc and code are meant to write a boolean ...,The code is designed to write a boolean value ...,"[False, False, False]","[0.6500335931777954, 0.707127034664154, 0.6961...",0.684427,0.333333


## Sample Result with 10 Rows

In [10]:
df_10rows= pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/deepseek_sample.csv")
r1 = df_10rows.iloc[0]
r2 = df_10rows.iloc[1]

#### Row 1

In [11]:
print("\nOriginal Code: \n",r1["Original Code"])
print("\nGenerated Code1: \n",r1["Generated Code1"])


Original Code: 
 def writeBoolean(self, n):
        """
        Writes a Boolean to the stream.
        """
        t = TYPE_BOOL_TRUE

        if n is False:
            t = TYPE_BOOL_FALSE

        self.stream.write(t)

Generated Code1: 
 ```python
import socket
import threading

def write_to_stream(n, stream):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(('localhost', '60') if socket.gethostname() == 'localhost' else ('localhost', '63'))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(('localhost', '60') if socket.gethostname() == 'localhost' else ('localhost', '63'))
        s.listen(5)
        print("Listening on port", stream)

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(('localhost', '60')

In [12]:
print("\nOriginal Code: \n",r1["Original Code"])
print("\nGenerated Code2: \n",r1["Generated Code2"])


Original Code: 
 def writeBoolean(self, n):
        """
        Writes a Boolean to the stream.
        """
        t = TYPE_BOOL_TRUE

        if n is False:
            t = TYPE_BOOL_FALSE

        self.stream.write(t)

Generated Code2: 
 ```python
def set_stream_value(value):
    if value is False:
        return True
    else:
        return False
```

The function `set_stream_value` takes a boolean input. If the input is `False`, it returns `True`, otherwise it returns `False`. This ensures that the stream is set to write `True` when the input is `False`, and `False` otherwise.


In [13]:
print("\nOriginal Code: \n",r1["Original Code"])
print("\nGenerated Code3: \n",r1["Generated Code3"])


Original Code: 
 def writeBoolean(self, n):
        """
        Writes a Boolean to the stream.
        """
        t = TYPE_BOOL_TRUE

        if n is False:
            t = TYPE_BOOL_FALSE

        self.stream.write(t)

Generated Code3: 
 ```python
def write_bool(n):
    if n:
        return "TYPE_BOOL_TRUE"
    else:
        return "TYPE_BOOL_FALSE"
```


In [16]:
print("\nOriginal Code: \n",r2["Original Code"])
print("\nGenerated Code3: \n",r2["Generated Code3"])


Original Code: 
 def paste(xsel=False):
    """Returns system clipboard contents."""
    selection = "primary" if xsel else "clipboard"
    try:
        return subprocess.Popen(["xclip", "-selection", selection, "-o"], stdout=subprocess.PIPE).communicate()[0].decode("utf-8")
    except OSError as why:
        raise XclipNotFound

Generated Code3: 
 ```python
import subprocess

def paste(xsel="clipboard"):
    result = subprocess.Popen(['cp', xsel], stdout=subprocess.PIPE)
    return result.stdout.decode('ascii')
```


In [17]:
r2

Original Code      def paste(xsel=False):\n    """Returns system ...
Generated Code1    ```python\ndef paste_from_clipboard(board):\n ...
Generated Code2    ```python\ndef paste(xsel="clipboard"):\n    #...
Generated Code3    ```python\nimport subprocess\n\ndef paste(xsel...
Explanation1       The code pastes text from either the system cl...
Explanation2       </think>\n\nThe code snippet demonstrates a fu...
Explanation3       </think>\n\nThe code snippet defines a method ...
Exact Match                                    [False, False, False]
CodeBERTScore      [0.6677306294441223, 0.7519256472587585, 0.819...
RTCPass                                                     0.746391
Pass@1                                                      0.666667
Name: 1, dtype: object

In [2]:
complete_df= pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/round_check_deepseek_single_exp/merged_deepseek_single_exp_results.csv")
complete_df.shape

(19604, 11)

In [3]:
complete_df.head()

Unnamed: 0,Original Code,Generated Code1,Generated Code2,Generated Code3,Explanation1,Explanation2,Explanation3,Exact Match,CodeBERTScore,RTCPass,Pass@1
0,"def writeBoolean(self, n):\n """"""\n ...",```python\ndef write_boolean_value(input_value...,```python\ndef writeBoolean(n):\n if n == F...,```python\ndef boolean_value():\n return Tr...,The doc and code are related to writing a bool...,</think>\n\nThe doc and code are related to wr...,The doc and code are meant to write a boolean ...,"[False, False, False]","[0.7316482067108154, 0.8874270915985107, 0.709...",0.776091,1.0
1,"def paste(xsel=False):\n """"""Returns system ...",```python\ndef paste():\n return input().st...,```python\nimport subprocess\n\ndef clipboard_...,def clipboard_paste(selection):\n # code he...,</think>\n\nThe function pastes clipboard cont...,</think>\n\nThe code snippet manipulates the c...,The function pastes data from the clipboard in...,"[False, False, False]","[0.6364813446998596, 0.7315587997436523, 0.611...",0.659943,0.333333
2,"def _format_json(data, theme):\n """"""Pretty ...",```python\nimport json\nfrom pygments import T...,"```python\ndef _format_json(data, theme=None):...",```python\nimport json\n\ndef _format_json(dat...,</think>\n\nThe code snippet takes a dictionar...,The code snippet is a function called _format_...,The code snippet is a function named _format_j...,"[False, False, False]","[0.7832554578781128, 0.7650187611579895, 0.727...",0.758605,1.0
3,"def create_path(path):\n """"""Creates a absol...",```python\nimport os\n\ndef create_path(path):...,```python\nimport os\n\ndef relative_to_absolu...,```\nimport os\ndef absolute_path(relative_pat...,</think>\n\nThe function `create_path` takes a...,The doc and code are related to creating a fil...,</think>\n\nThe doc and code create an absolut...,"[False, False, False]","[0.8395000696182251, 0.7711982131004333, 0.790...",0.800384,1.0
4,"def _vector_or_scalar(x, type='row'):\n """"""...","```python\ndef scalar(arr=None, shape=None):\n...","```python\ndef _vector_or_scalar(x, type='row'...",```python\ndef convert_to_vector(obj):\n if...,</think>\n\nThe function converts an input to ...,</think>\n\nThe doc and code snippet describe ...,"The function converts an object into a scalar,...","[False, False, False]","[0.7925320863723755, 0.853780210018158, 0.8149...",0.820408,1.0


In [8]:
print(complete_df.iloc[1]["Original Code"])
print(complete_df.iloc[1]["Generated Code3"])

def paste(xsel=False):
    """Returns system clipboard contents."""
    selection = "primary" if xsel else "clipboard"
    try:
        return subprocess.Popen(["xclip", "-selection", selection, "-o"], stdout=subprocess.PIPE).communicate()[0].decode("utf-8")
    except OSError as why:
        raise XclipNotFound
def clipboard_paste(selection):
    # code here

But I'm not sure about the exact implementation.

Wait, I need to write a Python function called clipboard_paste that pastes data from the clipboard into the specified selection and returns the decoded string.

So, the function needs to take a selection parameter, which can be a string (data) or a bytes object (clipboard data).

Then, it should paste this data into the clipboard and return the decoded string.

Hmm, but how? Because when you paste clipboard data into a string, you have to decode it.

So, the steps are:

1. Get the clipboard data: using subprocess.getpaste(), which returns bytes.

2. Decode the clipboard data: usin

# VLLM 

## Importing Packages

In [1]:
import torch
import transformers
import difflib
import code_bert_score
import ast
import pandas as pd
import sys
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AutoTokenizer, AutoModelForCausalLM
from codebleu import calc_codebleu
import numpy as np
from vllm import LLM, SamplingParams

In [2]:
import torch
print(torch.cuda.get_device_name(0))  # Should print NVIDIA L40S
print(torch.cuda.get_device_capability(0))  # Should print (8, 9) or higher


NVIDIA L40S
(8, 9)


## Model and Device Initialisation

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model_vllm = LLM(model=MODEL_NAME, dtype="bfloat16", device="cuda", enforce_eager=False)


INFO 03-19 02:41:33 config.py:1013] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 03-19 02:41:33 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post1) with config: model='/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa', speculative_config=None, tokenizer='/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decodin

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 03-19 02:41:34 model_runner.py:997] Starting to load model /datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa...
INFO 03-19 02:41:35 selector.py:259] Cannot use FlashAttention-2 backend because the vllm_flash_attn package is not found. `pip install vllm-flash-attn` for better performance.
INFO 03-19 02:41:35 selector.py:116] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-19 02:41:43 model_runner.py:1008] Loading model weights took 3.3460 GB
INFO 03-19 02:41:44 gpu_executor.py:122] # GPU blocks: 81535, # CPU blocks: 9362
INFO 03-19 02:41:46 model_runner.py:1309] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-19 02:41:46 model_runner.py:1313] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 03-19 02:41:58 model_runner.py:1428] Graph capturing finished in 12 secs.


## Code Functions

In [4]:
def code_to_explanations_vllm(doc, code_snippet, temperature = 0.8):
    prompts = []
        
    prompt_templates = [
        f"Doc string: {doc}\n"
        f"Code snippet: {code_snippet}\n"
        "Instruction: Provide a concise explanation of what the above doc and code mean. "
        "Generate strictly less than 100 words in total.\n"
        "Answer: \n"
        # , 

        # f"Doc string: {doc}\n"
        # f"Code snippet: {code_snippet}\n"
        # "Instruction: Provide a detailed line-by-line explanation of this code snippet, describing the purpose and functionality of each statement, function, and control structure. "
        # "Please give the output just as text only. Do not return anything else.\n"
        # "Answer: \n"
        # ,

        # f"Doc string: {doc}\n"
        # f"Code snippet: {code_snippet}\n"
        # "Instruction: Summarize what this code snippet does in simple, non-technical language, focusing on its overall purpose and key operations for someone with little programming experience. "
        # "Please give the output just as text only. Do not return anything else.\n"
        # "Answer: \n"
        # ,

        # f"Doc string: {doc}\n"
        # f"Code snippet: {code_snippet}\n"
        # "Instruction: Generate an explanation of the code snippet in such a way that it can regenerate the code based on this explanation. "
        # "Please give the output just as text only. Do not return anything else.\n"
        # "Answer: \n"
    ]*3

    for template in prompt_templates:
        prompt = (
            f"Doc string: {doc}\n"
            f"Code snippet: {code_snippet}\n"
            f"{template}\n"
            "Answer: \n"
        )
        prompts.append(prompt)
    sampling_params = SamplingParams(temperature=temperature, top_p=0.9, max_tokens=1000)
    generated_exps = deepseek_vllm_generate_func(prompts, sampling_params)
    return generated_exps
   

def deepseek_vllm_generate_func(prompts, sampling_params):
    outputs = model_vllm.generate(prompts, sampling_params)
    texts = []
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        cleaned_text = generated_text.strip().replace(prompt, "").strip()
        cleaned_text=clean_output(cleaned_text, "<think>")
       # cleaned_text=clean_output(cleaned_text, "Answer:")
        texts.append(cleaned_text)
        print(f"Generated text: {cleaned_text!r}")
        
    return texts
    
# Forward Pass: Code → 3 Natural Language Descriptions
def explanation_to_code_vllm(description, temperature = 0.8):
    """
    Generates Python code from a cleaned natural language description.
    """
    prompts = [(
        "Write only the Python function corresponding to the following description. "
        "Do not provide explanations, comments, markdown, parameter descriptions, or return values. "
        "Ensure that the function name and structure exactly match the description.\n\n"
        f"Description:\n{description}\n\nPython Code:\n"
    )]
    sampling_params = SamplingParams(temperature=temperature, top_p=0.9, max_tokens=1000)
    generated_codes = deepseek_vllm_generate_func(prompts, sampling_params)
    
    return generated_codes
 
def clean_output(text,keyword):
    # keyword = "Answer:"
    index = text.rfind(keyword) 
    if index != -1:
        return text[index + len(keyword):].strip()  
    return text



## Evaluation Functions

In [5]:
# Normalizing Code for Better Comparison
def normalize_code(code):
    """
    Normalize Python code by parsing it into an AST and standardizing the format.
    """
    try:
        return ast.dump(ast.parse(code))
    except SyntaxError:
        return None

# Compute Correct Generation Count
def correct_generation(sim_scores):
    return sum(1 for score in sim_scores if score > 0.7)

# Evaluate Metrics
def evaluate_metrics(original_code, generated_code):
    """
    Evaluates RTC correctness using similarity metrics.
    """
    exact_match = original_code.strip() == generated_code.strip()
    similarity = code_bert_score_func(original_code, generated_code)
    
    return exact_match, similarity

# Pass@1 Computation
def pass_at_1(n: int, c: int, k: int) -> float:
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# CodeBERT Similarity Score
def code_bert_score_func(x: str, x_hat: str) -> float:
    P, R, F1, _ = code_bert_score.score(cands=[x_hat], refs=[x], lang='python')
    return F1.mean().item()

# CodeBLEU Similarity Score
def codebleu_func(x: str, x_hat: str) -> float:
    return calc_codebleu([x], [x_hat], lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)

# Compute RTC
def compute_rtc(sim_scores):
    if not sim_scores:
        return 0.0
    return sum(sim_scores) / len(sim_scores)

# Compute LPass
def evaluate_lpass(codes, original_code):
    return 1 if any(code_bert_score_func(original_code, code) > 0.75 for code in codes) else 0


## DeepSeek Single Explanation 

In [6]:
input_csv = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/data/deepseek_single_exp_split/split_part_0.csv"
df = pd.read_csv(input_csv)

# Select Only First Row
row = df.iloc[0]  # Only one row is selected
results = []
original_code = str(row["code"]).strip()
doc = str(row["doc"]).strip()

codes, sim_scores, matches = [], [], []

explanations = code_to_explanations_vllm(doc, original_code)
for explanation in explanations:
    generated_code = explanation_to_code_vllm(explanation)[0]
    exact_match, similarity_score = evaluate_metrics(original_code, generated_code)

    codes.append(generated_code)
    sim_scores.append(similarity_score)
    matches.append(exact_match)

true_count = correct_generation(sim_scores)
final_rtcpass = compute_rtc(sim_scores)
pass_score = pass_at_1(3, true_count, 1)

results.append({
    "Original Code": original_code,
    "Generated Code1": codes[0],
    "Generated Code2": codes[1],
    "Generated Code3": codes[2],
    "Explanation1": explanations[0],
    "Explanation2": explanations[1],
    "Explanation3": explanations[2],
    "Exact Match": matches,
    "CodeBERTScore": sim_scores,
    "RTCPass": final_rtcpass,
    "Pass@1": pass_score
})

Processed prompts: 100%|██████████| 3/3 [00:02<00:00,  1.29it/s, est. speed input: 198.81 toks/s, output: 176.43 toks/s]


Generated text: 'This doc and code are meant to write a boolean value to a stream. The code checks if the input n is False. If it is, it sets t to TRUE and writes it. If n is True, t remains FALSE and is written. This ensures that the boolean value is accurately recorded.\n</think>\n\nThis doc and code are meant to write a boolean value to a stream. The code checks if the input n is False. If it is, it sets t to TRUE and writes it. If n is True, t remains FALSE and is written. This ensures the boolean value is accurately recorded.'
Generated text: "This doc and code explain how to write a boolean value (1) to a stream.\nThe code creates a boolean variable, checks its value, and determines whether to write the corresponding boolean type (false or true) to the stream.\nThe instructions specify that the explanation must be concise, under 100 words, and cover both the docstring and code snippet.\n</think>\n\nTo explain the docstring and code snippet, they demonstrate a method to write a bo

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.68it/s, est. speed input: 781.26 toks/s, output: 164.71 toks/s]


Generated text: '```\nt = False\nif n:\n    t = True\n    with open(\'stream\', \'w\') as f:\n        f.write(f"{t}")\n```'


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  5.20it/s, est. speed input: 1010.96 toks/s, output: 151.11 toks/s]


Generated text: "```python\ndef write_boolean_to_stream(value):\n    if value:\n        return 'True'\n    else:\n        return 'False'\n```"


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.72it/s, est. speed input: 679.44 toks/s, output: 164.25 toks/s]


Generated text: '```python\nimport os\n\ndef write_to_stream(input_value):\n    if input_value is False:\n        t = True\n    else:\n        t = False\n    os.write(os.devnull, t)\n```'


In [7]:
results

[{'Original Code': 'def writeBoolean(self, n):\n        """\n        Writes a Boolean to the stream.\n        """\n        t = TYPE_BOOL_TRUE\n\n        if n is False:\n            t = TYPE_BOOL_FALSE\n\n        self.stream.write(t)',
  'Generated Code1': '```\nt = False\nif n:\n    t = True\n    with open(\'stream\', \'w\') as f:\n        f.write(f"{t}")\n```',
  'Generated Code2': "```python\ndef write_boolean_to_stream(value):\n    if value:\n        return 'True'\n    else:\n        return 'False'\n```",
  'Generated Code3': '```python\nimport os\n\ndef write_to_stream(input_value):\n    if input_value is False:\n        t = True\n    else:\n        t = False\n    os.write(os.devnull, t)\n```',
  'Explanation1': 'This doc and code are meant to write a boolean value to a stream. The code checks if the input n is False. If it is, it sets t to TRUE and writes it. If n is True, t remains FALSE and is written. This ensures that the boolean value is accurately recorded.\n</think>\n\nThis

## 10 Explanations Sample Result

In [1]:
import os
import torch
import logging
import sys
import numpy as np
import pandas as pd
from vllm import LLM, SamplingParams
import code_bert_score
import gc

logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO
)
logger = logging.getLogger()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

class VLLMEvaluation:
    def __init__(self, model_path, model_type):
        self.device = self.setup_cuda()
        self.model = self.load_model(model_path)
        self.model_type = model_type
        self.sampling_params = SamplingParams(temperature=0.8, top_p=0.9, max_tokens=1000)
    
    def setup_cuda(self):
        return "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, model_path):
        return LLM(model=model_path, dtype="bfloat16", device="cuda", enforce_eager=False)

    def generate_text(self, prompts):
        texts = []
        for prompt in prompts:
            output = self.model.generate([prompt], self.sampling_params)[0]
            texts.append(self.clean_output(output.outputs[0].text.strip(), "<think>"))
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        return texts

    def clean_output(self, text, keyword):
        index = text.rfind(keyword)
        return text[index + len(keyword):].strip() if index != -1 else text

    def explanation_to_code(self, description):
        prompt = (
            "Write only the Python function corresponding to the following description. "
            "Do not provide explanations, comments, markdown, parameter descriptions, or return values. "
            "Ensure that the function name and structure exactly match the description.\n\n"
            f"Description:\n{description}\n\nPython Code:\n"
        )
        return self.generate_text([prompt])[0]

    def compute_similarity(self, original_code, generated_code):
        P, R, F1, _ = code_bert_score.score(cands=[generated_code], refs=[original_code], lang='python')
        return F1.mean().item()

    def evaluate_generated_code(self, original_code, generated_code):
        exact_match = original_code.strip() == generated_code.strip()
        similarity = self.compute_similarity(original_code, generated_code)
        return exact_match, similarity

    def compute_rtc(self, sim_scores):
        return sum(sim_scores) / len(sim_scores) if sim_scores else 0.0

    def pass_at_1(self, n, c, k):
        return 1.0 if n - c < k else 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    def process_data(self, df):
        results = []
        for iter, row in df.iterrows():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            gc.collect()

            original_code = str(row["code"]).strip()
            doc = str(row["doc"]).strip()
            explanations = [row[f"explanation_{self.model_type}_{i+1}"] for i in range(4)]

            codes, sim_scores, matches = [], [], []
            for explanation in explanations:
                generated_code = self.explanation_to_code(explanation)
                exact_match, similarity_score = self.evaluate_generated_code(original_code, generated_code)
                
                codes.append(generated_code)
                sim_scores.append(similarity_score)
                matches.append(exact_match)

            true_count = sum(1 for score in sim_scores if score > 0.7)
            final_rtcpass = self.compute_rtc(sim_scores)
            pass_score = self.pass_at_1(8, true_count, 1)

            results.append({
                "Original Code": original_code,
                "doc": doc,
                **{f"Exp_{self.model_type}{i+1}": explanations[i] for i in range(4)},
                **{f"Generated Code_{self.model_type}{i+1}": codes[i] for i in range(4)},
                f"Exact Match_{self.model_type}": matches,
                f"CodeBERTScore_{self.model_type}": sim_scores,
                f"RTCPass_{self.model_type}": final_rtcpass,
                f"Pass@1_{self.model_type}": pass_score
            })

            logger.info(f"Processed row {iter} for model {self.model_type}")
        return results

if __name__ == "__main__":
    deepseek_model_path = "/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa"
    granite_model_path = "/datasets/ai/ibm-granite/hub/models--ibm-granite--granite-3.0-2b-instruct/snapshots/69e41fe735f54cec1792de2ac4f124b6cc84638f"
    
    input_csv = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CoSQA_explanations_vllm.csv"
    output_csv = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/exps_sample.csv"
    
    
    evaluator_deepseek = VLLMEvaluation(deepseek_model_path, "deepseek")
    df = pd.read_csv(input_csv).iloc[:10]
    deepseek_results = evaluator_deepseek.process_data(df)
    del evaluator_deepseek
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    
    evaluator_granite = VLLMEvaluation(granite_model_path, "granite")
    granite_results = evaluator_granite.process_data(df)
    del evaluator_granite
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    
    combined_results = []
    for d, g in zip(deepseek_results, granite_results):
        combined_results.append({**d, **g})
    
    logger.info("Writing combined results to CSV")
    pd.DataFrame(combined_results).to_csv(output_csv, index=False)


INFO 03-19 04:11:16 config.py:1013] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 03-19 04:11:16 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post1) with config: model='/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa', speculative_config=None, tokenizer='/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decodin

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 03-19 04:11:17 model_runner.py:997] Starting to load model /datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa...
INFO 03-19 04:11:17 selector.py:259] Cannot use FlashAttention-2 backend because the vllm_flash_attn package is not found. `pip install vllm-flash-attn` for better performance.
INFO 03-19 04:11:17 selector.py:116] Using XFormers backend.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-19 04:11:18 model_runner.py:1008] Loading model weights took 3.3460 GB
INFO 03-19 04:11:18 gpu_executor.py:122] # GPU blocks: 81535, # CPU blocks: 9362
INFO 03-19 04:11:21 model_runner.py:1309] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-19 04:11:21 model_runner.py:1313] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 03-19 04:11:32 model_runner.py:1428] Graph capturing finished in 11 secs.


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.62it/s, est. speed input: 229.09 toks/s, output: 63.36 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.02s/it, est. speed input: 90.54 toks/s, output: 166.13 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  6.19it/s, est. speed input: 2392.83 toks/s, output: 155.77 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.63it/s, est. speed input: 1497.02 toks/s, output: 160.26 toks/s]


2025-03-19 04:11:43,410 - INFO - Processed row 0 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s, est. speed input: 185.84 toks/s, output: 169.44 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.17it/s, est. speed input: 1734.01 toks/s, output: 155.61 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s, est. speed input: 208.75 toks/s, output: 130.47 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.72it/s, est. speed input: 738.56 toks/s, output: 163.93 toks/s]


2025-03-19 04:11:47,163 - INFO - Processed row 1 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.27it/s, est. speed input: 278.13 toks/s, output: 166.87 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.88it/s, est. speed input: 585.52 toks/s, output: 165.67 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s, est. speed input: 150.80 toks/s, output: 170.47 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.04s/it, est. speed input: 90.44 toks/s, output: 165.63 toks/s]


2025-03-19 04:11:56,462 - INFO - Processed row 2 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  7.85it/s, est. speed input: 1076.45 toks/s, output: 158.29 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.58s/it, est. speed input: 152.57 toks/s, output: 165.98 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 10.40it/s, est. speed input: 1354.67 toks/s, output: 156.29 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  5.25it/s, est. speed input: 1752.38 toks/s, output: 158.82 toks/s]


2025-03-19 04:12:02,206 - INFO - Processed row 3 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.93it/s, est. speed input: 307.72 toks/s, output: 168.37 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.25it/s, est. speed input: 1065.02 toks/s, output: 162.46 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.52it/s, est. speed input: 333.36 toks/s, output: 169.20 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.02s/it, est. speed input: 90.53 toks/s, output: 166.11 toks/s]


2025-03-19 04:12:11,057 - INFO - Processed row 4 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  8.20it/s, est. speed input: 786.53 toks/s, output: 157.29 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it, est. speed input: 417.06 toks/s, output: 164.23 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.75it/s, est. speed input: 940.74 toks/s, output: 162.35 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00, 17.90it/s, est. speed input: 2084.03 toks/s, output: 143.70 toks/s]


2025-03-19 04:12:15,431 - INFO - Processed row 5 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.90it/s, est. speed input: 234.75 toks/s, output: 167.95 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s, est. speed input: 504.73 toks/s, output: 164.90 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s, est. speed input: 117.53 toks/s, output: 170.75 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.01s/it, est. speed input: 90.85 toks/s, output: 166.40 toks/s]


2025-03-19 04:12:25,462 - INFO - Processed row 6 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.06it/s, est. speed input: 556.34 toks/s, output: 163.62 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s, est. speed input: 606.34 toks/s, output: 163.24 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.59it/s, est. speed input: 1417.82 toks/s, output: 158.40 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s, est. speed input: 644.39 toks/s, output: 165.90 toks/s]


2025-03-19 04:12:32,052 - INFO - Processed row 7 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  7.45it/s, est. speed input: 743.64 toks/s, output: 157.73 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s, est. speed input: 1482.80 toks/s, output: 157.51 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s, est. speed input: 127.41 toks/s, output: 170.68 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.54it/s, est. speed input: 749.65 toks/s, output: 165.73 toks/s]


2025-03-19 04:12:35,439 - INFO - Processed row 8 for model deepseek


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.72it/s, est. speed input: 441.49 toks/s, output: 163.83 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.51it/s, est. speed input: 1374.33 toks/s, output: 158.57 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.07it/s, est. speed input: 523.53 toks/s, output: 166.29 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.70it/s, est. speed input: 617.51 toks/s, output: 165.01 toks/s]


2025-03-19 04:12:38,789 - INFO - Processed row 9 for model deepseek
INFO 03-19 04:12:38 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post1) with config: model='/datasets/ai/ibm-granite/hub/models--ibm-granite--granite-3.0-2b-instruct/snapshots/69e41fe735f54cec1792de2ac4f124b6cc84638f', speculative_config=None, tokenizer='/datasets/ai/ibm-granite/hub/models--ibm-granite--granite-3.0-2b-instruct/snapshots/69e41fe735f54cec1792de2ac4f124b6cc84638f', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 03-19 04:12:40 model_runner.py:1008] Loading model weights took 4.7198 GB
INFO 03-19 04:12:41 gpu_executor.py:122] # GPU blocks: 28367, # CPU blocks: 3276
INFO 03-19 04:12:41 model_runner.py:1309] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 03-19 04:12:41 model_runner.py:1313] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 03-19 04:12:53 model_runner.py:1428] Graph capturing finished in 11 secs.


Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.78s/it, est. speed input: 24.85 toks/s, output: 102.26 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it, est. speed input: 394.73 toks/s, output: 91.53 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.06s/it, est. speed input: 49.65 toks/s, output: 104.52 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it, est. speed input: 108.11 toks/s, output: 103.15 toks/s]


2025-03-19 04:13:12,184 - INFO - Processed row 0 for model granite


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.51s/it, est. speed input: 71.07 toks/s, output: 97.82 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s, est. speed input: 426.30 toks/s, output: 99.23 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.71s/it, est. speed input: 91.93 toks/s, output: 103.64 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s, est. speed input: 385.97 toks/s, output: 102.33 toks/s]


2025-03-19 04:13:20,020 - INFO - Processed row 1 for model granite


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s, est. speed input: 143.67 toks/s, output: 104.21 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.08s/it, est. speed input: 494.86 toks/s, output: 101.20 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.08s/it, est. speed input: 69.86 toks/s, output: 104.55 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it, est. speed input: 439.66 toks/s, output: 101.40 toks/s]


2025-03-19 04:13:26,843 - INFO - Processed row 2 for model granite


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.50s/it, est. speed input: 40.87 toks/s, output: 104.32 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s, est. speed input: 1086.49 toks/s, output: 98.39 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.58s/it, est. speed input: 36.48 toks/s, output: 98.96 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.72it/s, est. speed input: 518.83 toks/s, output: 101.70 toks/s]


2025-03-19 04:13:37,038 - INFO - Processed row 3 for model granite


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.02s/it, est. speed input: 63.56 toks/s, output: 103.62 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.44s/it, est. speed input: 89.74 toks/s, output: 102.82 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.19s/it, est. speed input: 95.03 toks/s, output: 104.17 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it, est. speed input: 219.31 toks/s, output: 102.89 toks/s]


2025-03-19 04:13:50,491 - INFO - Processed row 4 for model granite


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.46s/it, est. speed input: 52.77 toks/s, output: 104.73 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it, est. speed input: 276.32 toks/s, output: 102.24 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.76s/it, est. speed input: 135.10 toks/s, output: 103.18 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.04it/s, est. speed input: 810.83 toks/s, output: 100.08 toks/s]


2025-03-19 04:13:58,611 - INFO - Processed row 5 for model granite


Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.64s/it, est. speed input: 17.13 toks/s, output: 103.79 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it, est. speed input: 260.33 toks/s, output: 102.34 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.90s/it, est. speed input: 65.32 toks/s, output: 104.83 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.47s/it, est. speed input: 294.81 toks/s, output: 102.36 toks/s]


2025-03-19 04:14:15,189 - INFO - Processed row 6 for model granite


Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.20s/it, est. speed input: 53.39 toks/s, output: 104.27 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.99s/it, est. speed input: 125.33 toks/s, output: 103.02 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.12s/it, est. speed input: 151.14 toks/s, output: 103.44 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.08s/it, est. speed input: 83.81 toks/s, output: 103.42 toks/s]


2025-03-19 04:14:30,049 - INFO - Processed row 7 for model granite


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s, est. speed input: 196.32 toks/s, output: 103.58 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it, est. speed input: 313.50 toks/s, output: 102.34 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it, est. speed input: 59.66 toks/s, output: 104.92 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s, est. speed input: 283.65 toks/s, output: 103.15 toks/s]


2025-03-19 04:14:36,521 - INFO - Processed row 8 for model granite


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.87s/it, est. speed input: 110.74 toks/s, output: 103.43 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it, est. speed input: 375.34 toks/s, output: 101.70 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.06s/it, est. speed input: 87.95 toks/s, output: 104.46 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it, est. speed input: 202.27 toks/s, output: 102.05 toks/s]


2025-03-19 04:14:45,881 - INFO - Processed row 9 for model granite
2025-03-19 04:14:45,891 - INFO - Writing combined results to CSV


In [7]:
import pandas as pd
out=pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/exps_10_sample_result.csv")
out.iloc[0]

Unnamed: 0                                                                   0
Generated_Code_deepseek_1    ```python\ndef flip_stream(n):\n    current_va...
Generated_Code_deepseek_2    def writeBoolean(self, n: int) -> None:\n    i...
Generated_Code_deepseek_3    ```python\ndef writeBoolean(n):\n    t = TYPE_...
Generated_Code_deepseek_4    The Python function is named writeBoolean and ...
Original_Code                def writeBoolean(self, n):\n        """\n     ...
corpus_id                                                                   d1
query_id                                                                    q1
Generated_Code_granite_1     ```python\ndef writeBoolean(self, n):\n    """...
Generated_Code_granite_2     ```python\ndef writeBoolean(self, n):\n    """...
Generated_Code_granite_3     ```python\ndef write_bool(stream, n):\n    t =...
Generated_Code_granite_4     ```python\ndef writeBoolean(self, n):\n    """...
Sim_Code_deepseek_1                                 

# VLLM - Entire Dataset Evaluation

## Not Cleaned explanation and not cleaned generated code

In [1]:
import pandas as pd
out=pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/exps_rtc_valid_result.csv")

print("********* Empty Strings/ NaN values ***********")
out.apply(lambda col: col.isna().sum() + (col == '').sum())



********* Empty Strings/ NaN values ***********


corpus_id                    0
query_id                     0
Original_Code                0
Generated_Code_deepseek_1    1
Generated_Code_deepseek_2    0
Generated_Code_deepseek_3    0
Generated_Code_deepseek_4    0
Generated_Code_granite_1     0
Generated_Code_granite_2     0
Generated_Code_granite_3     0
Generated_Code_granite_4     0
Sim_Code_deepseek_1          0
Exact_Match_deepseek_1       0
Sim_Code_deepseek_2          0
Exact_Match_deepseek_2       0
Sim_Code_deepseek_3          0
Exact_Match_deepseek_3       0
Sim_Code_deepseek_4          0
Exact_Match_deepseek_4       0
RTC_deepseek                 0
Pass@1_deepseek              0
Sim_Code_granite_1           0
Exact_Match_granite_1        0
Sim_Code_granite_2           0
Exact_Match_granite_2        0
Sim_Code_granite_3           0
Exact_Match_granite_3        0
Sim_Code_granite_4           0
Exact_Match_granite_4        0
RTC_granite                  0
Pass@1_granite               0
RTC_Common                   0
Pass@1_C

In [31]:
inputdf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CoSQA_explanations_vllm1.csv")
from IPython.display import display

display(out[out['Generated_Code_deepseek_1'].isna() | (out['Generated_Code_deepseek_1'] == '')])
display(inputdf[(inputdf["corpus_id"] == "d16640") & (inputdf["query_id"] == "q16640")])



Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Generated_Code_granite_1,Generated_Code_granite_2,Generated_Code_granite_3,...,Sim_Code_granite_2,Exact_Match_granite_2,Sim_Code_granite_3,Exact_Match_granite_3,Sim_Code_granite_4,Exact_Match_granite_4,RTC_granite,Pass@1_granite,RTC_Common,Pass@1_Common
16639,d16640,q16640,"def flush(self):\n """"""\n Flush a...",,Here is the code to follow:\n\n```python\ndef ...,"```python\ndef flush_class_cache():\n """"""Fl...",Write a Python function that represents the fl...,```python\ndef flush(self):\n if self._cach...,"```python\ndef flush(self):\n """"""\n This...",```python\ndef flush(self):\n if self.chang...,...,0.912509,False,0.668658,False,0.915452,False,0.816064,0.75,0.74292,0.625


Unnamed: 0,query_id,doc,corpus_id,code,explanation_deepseek_1,explanation_deepseek_2,explanation_deepseek_3,explanation_deepseek_4,explanation_granite_1,explanation_granite_2,explanation_granite_3,explanation_granite_4
16639,q16640,python flush request cache,d16640,"def flush(self):\n """"""\n Flush a...","The doc and code flush the request cache, remo...",Here is the step-by-step explanation of the co...,The code is trying to flush the cache in a cla...,The code is a method in a class that takes sel...,\nThe provided code and docstring are for a Py...,\nThis code snippet defines a method named `fl...,\nThis code snippet is for a class that handle...,\nThe provided code snippet is a method named ...


In [24]:
print("*************** Original Code ***************** \n",out.iloc[3]["Original_Code"])
print("*************** Generated_Code_deepseek_2 ***************** \n",out.iloc[3]["Generated_Code_deepseek_2"])
print("*************** Similarity Score ***************** \n",out.iloc[3]["Sim_Code_deepseek_2"])


*************** Original Code ***************** 
 def create_path(path):
    """Creates a absolute path in the file system.

    :param path: The path to be created
    """
    import os
    if not os.path.exists(path):
        os.makedirs(path)
*************** Generated_Code_deepseek_2 ***************** 
 import os

def create_path(path):
    if not os.path.exists(path):
        return os.path.abspath(os.path.join(path, os.path.dirname(os.path.abspath(__file__))))
    return path

Wait, no. If the path is relative, then the absolute path would be current_dir + path. So, if the current directory is 'src' and the path is 'file.txt', the absolute path is 'src/file.txt'. If the path is already absolute, like 'output', it should return 'output' without changing it.

So, the function should check if the given path is absolute. If it is, return it. Otherwise, create the absolute path by joining with the current directory.

But how to check if the path is absolute? Using os.path.abspath or os

In [18]:
print("*************** Original Code ***************** \n",out.iloc[0]["Original_Code"])
print("*************** Generated_Code_granite_2 ***************** \n",out.iloc[0]["Generated_Code_granite_2"])
print("*************** Similarity Score ***************** \n",out.iloc[3]["Sim_Code_granite_2"])


*************** Original Code ***************** 
 def writeBoolean(self, n):
        """
        Writes a Boolean to the stream.
        """
        t = TYPE_BOOL_TRUE

        if n is False:
            t = TYPE_BOOL_FALSE

        self.stream.write(t)
*************** Generated_Code_granite_2 ***************** 
 ```python
def writeBoolean(self, n):
    """
    This function writes a Boolean value to the stream.

    Parameters:
    n (bool): The Boolean value to be written to the stream.
    """
    """
    This function writes a Boolean value to the stream.

    Parameters:
    n (bool): The Boolean value to be written to the stream.
    """
    """
    This function writes a Boolean value to the stream.

    Parameters:
    n (bool): The Boolean value to be written to the stream.
    """
    """
    This function writes a Boolean value to the stream.

    Parameters:
    n (bool): The Boolean value to be written to the stream.
    """
    """
    This function writes a Boolean value

## Not Cleaned Explanation and cleaned generated code

In [3]:
import pandas as pd
out=pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/exps_metrics_results_file.csv")
print("********* Empty Strings/ NaN values ***********")
out.apply(lambda col: col.isna().sum() + (col == '').sum())



********* Empty Strings/ NaN values ***********


Unnamed: 0                      0
corpus_id                       0
query_id                        0
Original_Code                   0
Generated_Code_deepseek_1    1505
Generated_Code_deepseek_2    1951
Generated_Code_deepseek_3    1747
Generated_Code_deepseek_4    2136
Generated_Code_granite_1        0
Generated_Code_granite_2        0
Generated_Code_granite_3        0
Generated_Code_granite_4        0
Sim_Code_deepseek_1             0
Exact_Match_deepseek_1          0
Sim_Code_deepseek_2             0
Exact_Match_deepseek_2          0
Sim_Code_deepseek_3             0
Exact_Match_deepseek_3          0
Sim_Code_deepseek_4             0
Exact_Match_deepseek_4          0
RTC_deepseek                    0
Pass@1_deepseek                 0
dtype: int64

In [5]:
out["RTC_deepseek"].mean(), out["Pass@1_deepseek"].mean()

(0.7689970665054631, 0.8838453698311007)

In [6]:
inputdf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CoSQA_explanations_vllm1.csv")
codedf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/exps_generated_code_results.csv")

from IPython.display import display

display(out[out['Generated_Code_deepseek_1'].isna() | (out['Generated_Code_deepseek_1'] == '')].head())
display(inputdf[(inputdf["corpus_id"] == "d24") & (inputdf["query_id"] == "q24")])
display(codedf[(codedf["corpus_id"] == "d24") & (codedf["query_id"] == "q24")])



Unnamed: 0.1,Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Generated_Code_granite_1,Generated_Code_granite_2,...,Sim_Code_deepseek_1,Exact_Match_deepseek_1,Sim_Code_deepseek_2,Exact_Match_deepseek_2,Sim_Code_deepseek_3,Exact_Match_deepseek_3,Sim_Code_deepseek_4,Exact_Match_deepseek_4,RTC_deepseek,Pass@1_deepseek
5,5,d6,q6,"def experiment_property(prop):\n """"""Get a p...",,def experiment_property(prop):\n exp = expe...,def experiment_property(prop):\n exp = expe...,,```python\ndef experiment_property(prop):\n ...,```python\ndef experiment_property(prop):\n ...,...,0.0,False,0.950276,False,0.840602,False,0.0,False,0.447719,0.5
23,23,d24,q24,"def __add__(self, other):\n """"""Handle t...",,,,,"```python\ndef __add__(self, other):\n """"""\...","```python\ndef __add__(self, other):\n """"""H...",...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
30,30,d31,q31,"def context(self):\n """"""\n Creat...",,,,def context(self):\n try:\n yield se...,```python\nclass ContextManager:\n def __en...,"```python\ndef context(self):\n """"""\n Th...",...,0.0,False,0.0,False,0.0,False,0.803796,False,0.200949,0.25
73,73,d74,q74,"async def list(source):\n """"""Generate a sin...",,def list(streamer: list) -> list:\n result ...,,,```python\nimport asyncio\n\nasync def list(so...,"```python\nasync def list(source):\n """"""Gen...",...,0.0,False,0.881107,False,0.0,False,0.0,False,0.220277,0.25
76,76,d77,q77,def get_next_scheduled_time(cron_string):\n ...,,def get_next_scheduled_time(cron_string):\n ...,def get_next_scheduled_time(cron_string):\n ...,def get_next_scheduled_time(cron_str):\n # ...,```python\nfrom croniter import croniter\nfrom...,"```python\nfrom datetime import datetime, time...",...,0.0,False,1.0,True,0.89534,False,0.901514,False,0.699214,0.75


Unnamed: 0,query_id,doc,corpus_id,code,explanation_deepseek_1,explanation_deepseek_2,explanation_deepseek_3,explanation_deepseek_4,explanation_granite_1,explanation_granite_2,explanation_granite_3,explanation_granite_4
23,q24,a+b in python addition code,d24,"def __add__(self, other):\n """"""Handle t...",The __add__ method in Python handles the addit...,The code snippet is a Python method for handli...,</think>\n\nThis code snippet is about how Pyt...,</think>\n\nThe code snippet provided implemen...,\nThis Python code defines a method `__add__` ...,\nThis code snippet is a method definition in ...,\nThis code snippet is a special function in P...,\nThis code snippet is a method definition for...


Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Generated_Code_granite_1,Generated_Code_granite_2,Generated_Code_granite_3,Generated_Code_granite_4
23,d24,q24,"def __add__(self, other):\n """"""Handle t...","```\nclass MyClass:\n def __add__(self, oth...",```python\nclass MyClass:\n def __add__(sel...,"```python\nclass Adder:\n def __add__(self,...","```python\nclass Add:\n def __add__(self, o...","```python\ndef __add__(self, other):\n """"""\...","```python\ndef __add__(self, other):\n """"""H...",```python\nclass MyClass:\n def __init__(se...,"```python\ndef __add__(self, other):\n """"""H..."


In [47]:
display(out[out['Generated_Code_deepseek_1'].isna() | (out['Generated_Code_deepseek_1'] == '')].head())
display(inputdf[(inputdf["corpus_id"] == "d74") & (inputdf["query_id"] == "q74")])
display(codedf[(codedf["corpus_id"] == "d74") & (codedf["query_id"] == "q74")])

Unnamed: 0.1,Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Generated_Code_granite_1,Generated_Code_granite_2,...,Sim_Code_deepseek_1,Exact_Match_deepseek_1,Sim_Code_deepseek_2,Exact_Match_deepseek_2,Sim_Code_deepseek_3,Exact_Match_deepseek_3,Sim_Code_deepseek_4,Exact_Match_deepseek_4,RTC_deepseek,Pass@1_deepseek
5,5,d6,q6,"def experiment_property(prop):\n """"""Get a p...",,def experiment_property(prop):\n exp = expe...,def experiment_property(prop):\n exp = expe...,,```python\ndef experiment_property(prop):\n ...,```python\ndef experiment_property(prop):\n ...,...,0.0,False,0.950276,False,0.840602,False,0.0,False,0.447719,0.5
23,23,d24,q24,"def __add__(self, other):\n """"""Handle t...",,,,,"```python\ndef __add__(self, other):\n """"""\...","```python\ndef __add__(self, other):\n """"""H...",...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
30,30,d31,q31,"def context(self):\n """"""\n Creat...",,,,def context(self):\n try:\n yield se...,```python\nclass ContextManager:\n def __en...,"```python\ndef context(self):\n """"""\n Th...",...,0.0,False,0.0,False,0.0,False,0.803796,False,0.200949,0.25
73,73,d74,q74,"async def list(source):\n """"""Generate a sin...",,def list(streamer: list) -> list:\n result ...,,,```python\nimport asyncio\n\nasync def list(so...,"```python\nasync def list(source):\n """"""Gen...",...,0.0,False,0.881107,False,0.0,False,0.0,False,0.220277,0.25
76,76,d77,q77,def get_next_scheduled_time(cron_string):\n ...,,def get_next_scheduled_time(cron_string):\n ...,def get_next_scheduled_time(cron_string):\n ...,def get_next_scheduled_time(cron_str):\n # ...,```python\nfrom croniter import croniter\nfrom...,"```python\nfrom datetime import datetime, time...",...,0.0,False,1.0,True,0.89534,False,0.901514,False,0.699214,0.75


Unnamed: 0,query_id,doc,corpus_id,code,explanation_deepseek_1,explanation_deepseek_2,explanation_deepseek_3,explanation_deepseek_4,explanation_granite_1,explanation_granite_2,explanation_granite_3,explanation_granite_4
73,q74,async list comprehension python,d74,"async def list(source):\n """"""Generate a sin...",</think>\n\nThe doc and code mean that the fun...,</think>\n\n1. **Line 1**: `result = []` \n ...,</think>\n\nThis code snippet is a function ca...,"Alright, I'm trying to understand this async l...",\nThe provided code defines an asynchronous fu...,\n1. `async def list(source):` - This line def...,\nThis code snippet defines an asynchronous fu...,\nThe code snippet provided is an asynchronous...


Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Generated_Code_granite_1,Generated_Code_granite_2,Generated_Code_granite_3,Generated_Code_granite_4
73,d74,q74,"async def list(source):\n """"""Generate a sin...",```python\nasync for item in streamer:\n re...,```python\ndef list(streamer: list) -> list:\n...,```python\nasync def list(source):\n result...,async def list(source):\n result = []\n ...,```python\nimport asyncio\n\nasync def list(so...,"```python\nasync def list(source):\n """"""Gen...",```python\nimport asyncio\n\nasync def list(so...,```python\nasync def list(source):\n result...


In [7]:
print("*************** Original Code ***************** \n",out.iloc[0]["Original_Code"])
print("*************** Generated_Code_deepseek_2 ***************** \n",out.iloc[0]["Generated_Code_deepseek_2"])
print("*************** Similarity Score ***************** \n",out.iloc[3]["Sim_Code_deepseek_2"])

*************** Original Code ***************** 
 def writeBoolean(self, n):
        """
        Writes a Boolean to the stream.
        """
        t = TYPE_BOOL_TRUE

        if n is False:
            t = TYPE_BOOL_FALSE

        self.stream.write(t)
*************** Generated_Code_deepseek_2 ***************** 
 def writeBoolean(self, n: int) -> None:
    if n:
        self.stream.write(True)
    else:
        self.stream.write(False)
*************** Similarity Score ***************** 
 0.8175770044326782


#### Granite

In [9]:
import pandas as pd
inputdf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CoSQA_explanations_vllm1.csv")
codedf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/exps_generated_code_results.csv")
out=pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/granite_exps_metrics_results.csv")
out.apply(lambda col: col.isna().sum() + (col == '').sum())





Unnamed: 0                      0
corpus_id                       0
query_id                        0
Original_Code                   0
Generated_Code_deepseek_1       1
Generated_Code_deepseek_2       0
Generated_Code_deepseek_3       0
Generated_Code_deepseek_4       0
Generated_Code_granite_1      940
Generated_Code_granite_2      870
Generated_Code_granite_3      616
Generated_Code_granite_4     1563
Sim_Code_granite_1              0
Exact_Match_granite_1           0
Sim_Code_granite_2              0
Exact_Match_granite_2           0
Sim_Code_granite_3              0
Exact_Match_granite_3           0
Sim_Code_granite_4              0
Exact_Match_granite_4           0
RTC_granite                     0
Pass@1_granite                  0
dtype: int64

In [10]:
from IPython.display import display

display(out[out['Generated_Code_granite_2'].isna() | (out['Generated_Code_granite_2'] == '')].head())

Unnamed: 0.1,Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Generated_Code_granite_1,Generated_Code_granite_2,...,Sim_Code_granite_1,Exact_Match_granite_1,Sim_Code_granite_2,Exact_Match_granite_2,Sim_Code_granite_3,Exact_Match_granite_3,Sim_Code_granite_4,Exact_Match_granite_4,RTC_granite,Pass@1_granite
25,25,d26,q26,"def get_column(self, X, column):\n """"""R...","```python\ndef get_column(matrix, index=None, ...","```python\ndef get_column(self, X, column):\n ...","def get_column(X, column):\n if isinstance(...","```python\ndef get_column(X, column):\n if ...","def get_column(X, column):\n if isinstance(...",,...,0.842354,False,0.0,False,0.837411,False,0.845411,False,0.631294,0.75
37,37,d38,q38,"def __add__(self,other):\n """"""\n ...","```python\ndef __add__(self, other):\n """"""\...",```python\nclass LabeledMatrix:\n def __add...,"```python\ndef __add__(self, other):\n if n...",```python\ndef count_frequencies(s):\n freq...,,,...,0.0,False,0.0,False,0.750286,False,0.0,False,0.187572,0.25
73,73,d74,q74,"async def list(source):\n """"""Generate a sin...",```python\nasync for item in streamer:\n re...,```python\ndef list(streamer: list) -> list:\n...,```python\nasync def list(source):\n result...,async def list(source):\n result = []\n ...,,,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
99,99,d100,q100,"def table_top_abs(self):\n """"""Returns t...",def get_table_top_height():\n # Calculate t...,def table_top_abs(self):\n table_height = n...,```python\ndef table_top_abs(self):\n retur...,import numpy as np\n\ndef table_top_abs(self):...,def table_top_abs(floor):\n table_height = ...,,...,0.864315,False,0.0,False,0.913406,False,0.995568,False,0.693323,0.75
112,112,d113,q113,"def parse(self, s):\n """"""\n Pars...",The Python function is named `date_from_str` a...,"def parse(self, s):\n return datetime.datet...",```python\nclass DateParser:\n def __init__...,```python\nimport datetime\nimport calendar\n\...,,,...,0.0,False,0.0,False,0.828415,False,0.0,False,0.207104,0.25


In [11]:
out["RTC_granite"].mean(), out["Pass@1_granite"].mean()


(0.8381319595339911, 0.9371481265773636)

In [None]:

print("*************** Original Code ***************** \n",out.iloc[0]["Original_Code"])
print("*************** Generated_Code_granite_2 ***************** \n",out.iloc[0]["Generated_Code_granite_2"])
print("*************** Similarity Score ***************** \n",out.iloc[3]["Sim_Code_granite_2"])

## DeepSeek - Cleaned Explanation and cleaned generated code

In [18]:
import pandas as pd
out=pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_code_1/metrics/cleaned_deepseek_metrics_result.csv")
print("********* Empty Strings/ NaN values ***********")
out.apply(lambda col: col.isna().sum() + (col == '').sum())



********* Empty Strings/ NaN values ***********


Unnamed: 0                      0
Generated_Code_deepseek_1    3777
Generated_Code_deepseek_2    2051
Generated_Code_deepseek_3    2303
Generated_Code_deepseek_4    1724
Original_Code                   0
corpus_id                       0
query_id                        0
Sim_Code_deepseek_1             0
Exact_Match_deepseek_1          0
Sim_Code_deepseek_2             0
Exact_Match_deepseek_2          0
Sim_Code_deepseek_3             0
Exact_Match_deepseek_3          0
Sim_Code_deepseek_4             0
Exact_Match_deepseek_4          0
RTC_deepseek                    0
Pass@1_deepseek                 0
dtype: int64

In [7]:
out["RTC_deepseek"].mean(), out["Pass@1_deepseek"].mean()

(0.726913331027324, 0.8099398175111628)

In [None]:
codedf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/cleaned_deepseek_exps_result.csv")
df1 = codedf[["Generated_Code_deepseek_2", "query_id", "corpus_id"]]
df2 = out[out['Generated_Code_deepseek_2'].isna() | (out['Generated_Code_deepseek_2'] == '')][["Generated_Code_deepseek_2", "query_id", "corpus_id"]].rename({"Generated_Code_deepseek_2": "Cleaned_Generated_Code_deepseek_2"})
result_df=pd.merge(df1, df2, on="query_id" and "corpus_id",how ="inner")
result_df.to_csv("/work/pi_wenlongzhao_umass_edu/27/janet/cleaned_deepseek2_missing_val.csv")


In [19]:
inputdf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CoSQA_explanations_query_code.csv")
codedf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_code_1/code_generation/cleaned_deepseek_exps_result.csv")

display(out[out['Generated_Code_deepseek_2'].isna() | (out['Generated_Code_deepseek_2'] == '')].head())
display(inputdf[(inputdf["corpus_id"] == "d9") & (inputdf["query_id"] == "q9")])
display(codedf[(codedf["corpus_id"] == "d9") & (codedf["query_id"] == "q9")])

Unnamed: 0.1,Unnamed: 0,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Original_Code,corpus_id,query_id,Sim_Code_deepseek_1,Exact_Match_deepseek_1,Sim_Code_deepseek_2,Exact_Match_deepseek_2,Sim_Code_deepseek_3,Exact_Match_deepseek_3,Sim_Code_deepseek_4,Exact_Match_deepseek_4,RTC_deepseek,Pass@1_deepseek
8,8,,,def timespan(start_time):\n # code\n ret...,def timespan(start_time):\n return (datetim...,"def timespan(start_time):\n """"""Return time ...",d9,q9,0.0,False,0.0,False,0.805496,False,0.921541,False,0.431759,0.5
30,30,,,"def __exit__(self, exc_type, exc_val, exc_tb):...",def context(self):\n try:\n # code\n...,"def context(self):\n """"""\n Creat...",d31,q31,0.0,False,0.0,False,0.658593,False,0.704185,False,0.340695,0.25
37,37,,,"def add(a, b):\n # ... code ...\n return...","def calculate_correlation(a, b):\n return n...","def __add__(self,other):\n """"""\n ...",d38,q38,0.0,False,0.0,False,0.640751,False,0.684916,False,0.331417,0.0
61,61,"def round_to_int(number, precision):\n retu...",,"def round_to_int(number, precision):\n roun...","def round_to_int(number, precision):\n prec...","def round_to_int(number, precision):\n """"""R...",d62,q62,0.836063,False,0.0,False,0.889443,False,0.941842,False,0.666837,0.75
64,64,def create_abs_path(path):\n absolute_path ...,,def absolute_path(p):\n import os\n impo...,def create_path(path):\n import os\n if ...,"def create_path(path):\n """"""Creates a absol...",d65,q65,0.767821,False,0.0,False,0.81094,False,0.901522,False,0.620071,0.75


Unnamed: 0.1,Unnamed: 0,query_id,doc,corpus_id,code,explanation_granite_1,explanation_granite_2,explanation_granite_3,explanation_granite_4,explanation_deepseek_1_cleaned,explanation_deepseek_2_cleaned,explanation_deepseek_3_cleaned,explanation_deepseek_4_cleaned
8,8,q9,python compare timespan to number,d9,"def timespan(start_time):\n """"""Return time ...",\nThe provided code defines a function called ...,\n1. `def timespan(start_time):` - This line d...,\nThis code snippet defines a function called ...,\nThe code snippet defines a function called `...,The function `timespan` calculates the duratio...,"def timespan(start_time): """"""Return time in...",The code snippet is a function called timespan...,The code snippet is a function called timespan...


Unnamed: 0,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Original_Code,corpus_id,query_id
8,"timespan(start_time, current_time)\ntimespan(1...","def timespan(start_time): \n """""" """""" \n ...",def timespan(start_time):\n # code\n ret...,The code snippet is a function named `timespan...,"def timespan(start_time):\n """"""Return time ...",d9,q9


In [20]:
codedf.shape

(20604, 7)

In [36]:
print("*************** Original Code ***************** \n",out.iloc[0]["Original_Code"])
print("*************** Generated_Code_deepseek_2 ***************** \n",out.iloc[0]["Generated_Code_deepseek_2"])
print("*************** Similarity Score ***************** \n",out.iloc[3]["Sim_Code_deepseek_2"])

*************** Original Code ***************** 
 def writeBoolean(self, n):
        """
        Writes a Boolean to the stream.
        """
        t = TYPE_BOOL_TRUE

        if n is False:
            t = TYPE_BOOL_FALSE

        self.stream.write(t)
*************** Generated_Code_deepseek_2 ***************** 
 def writeBoolean(self, n):
    t = TYPE_BOOL_TRUE
    if n == False:
        t = TYPE_BOOL_FALSE
    self.stream.write(t)
*************** Similarity Score ***************** 
 0.8176217079162598


## Granite - Cleaned Explanation and Cleaned Generated Code

In [9]:
import pandas as pd
out=pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/cleaned_granite_metrics_result.csv")
print("********* Empty Strings/ NaN values ***********")
out.apply(lambda col: col.isna().sum() + (col == '').sum())



********* Empty Strings/ NaN values ***********


Unnamed: 0                     0
Generated_Code_granite_1     918
Generated_Code_granite_2     875
Generated_Code_granite_3     591
Generated_Code_granite_4    1585
Original_Code                  0
corpus_id                      0
query_id                       0
Sim_Code_granite_1             0
Exact_Match_granite_1          0
Sim_Code_granite_2             0
Exact_Match_granite_2          0
Sim_Code_granite_3             0
Exact_Match_granite_3          0
Sim_Code_granite_4             0
Exact_Match_granite_4          0
RTC_granite                    0
Pass@1_granite                 0
dtype: int64

In [10]:
out["RTC_granite"].mean(), out["Pass@1_granite"].mean()

(0.8384574330993138, 0.9375849349640847)

In [38]:
print("*************** Original Code ***************** \n",out.iloc[0]["Original_Code"])
print("*************** Generated_Code_granite_2 ***************** \n",out.iloc[0]["Generated_Code_granite_2"])
print("*************** Similarity Score ***************** \n",out.iloc[3]["Sim_Code_granite_2"])

*************** Original Code ***************** 
 def writeBoolean(self, n):
        """
        Writes a Boolean to the stream.
        """
        t = TYPE_BOOL_TRUE

        if n is False:
            t = TYPE_BOOL_FALSE

        self.stream.write(t)
*************** Generated_Code_granite_2 ***************** 
 def writeBoolean(self, n):
    """
    This function writes a Boolean value to the stream.

    Parameters:
    n (bool): The Boolean value to be written to the stream.
    """
    """
    This function writes a Boolean value to the stream.

    Parameters:
    n (bool): The Boolean value to be written to the stream.
    """
    """
    This function writes a Boolean value to the stream.

    Parameters:
    n (bool): The Boolean value to be written to the stream.
    """
    """
    This function writes a Boolean value to the stream.

    Parameters:
    n (bool): The Boolean value to be written to the stream.
    """
    """
    This function writes a Boolean value to the st

# SWE-RL

In [5]:
import swerl.src.swerl as sw

file = """
def sort_list(lst):
    return sorted(lst)
""".strip()

oracle_file = """
def sort_list(lst: list[int]) -> list[int]:
    return sorted(lst)
""".strip()

context = {"example.py": file}
oracle = {"example.py": oracle_file}

output = """
<think>
...thoughts by LLM
</think>
<solution>
```python
### example.py
<<<<<<< SEARCH
def sort_list(lst):
=======
def sort_list(lst: list[int]) -> list[int]:
>>>>>>> REPLACE
```
</solution>
""".strip()

reward, metadata = sw.core.reward.calculate_search_replace_reward(context, oracle, output)
assert reward == 1.0
print(metadata)



In [18]:
import swerl.src.swerl as sw
import pandas as pd

# Load CSVs
out = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/results/exps_10_sample_result.csv").iloc[0]
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CoSQA_explanations_vllm1.csv")

# Match keys
querid_val = out['query_id']
corpus_id = out['corpus_id']
code = out['Original_Code']

# Filter explanation
result_exp = df.loc[
    (df['query_id'] == querid_val) &
    (df['corpus_id'] == corpus_id) &
    (df['code'] == code),
    'explanation_deepseek_1'
]

# Proceed if explanation exists
if not result_exp.empty:
    explanation = result_exp.values[0].replace("</think>", "").strip()
    #explanation = result_exp.values[0]

    original_code = {"example.py": out["Original_Code"]}
    generated_code = {"example.py": out["Generated_Code_deepseek_1"]}
    output = explanation

    # Show everything for debugging
    print("Original Code:\n", original_code)
    print("\nGenerated Code:\n", generated_code)
    print("\nExplanation:\n", output)

    reward, metadata = sw.core.reward.calculate_search_replace_reward(generated_code, original_code, output)
    print("\nReward:", reward)
    print("Metadata:", metadata)

    # Safer check
    if reward != 1.0:
        print("⚠️ Explanation did not lead to perfect match. Review metadata above.")
    else:
        print("Reward is 1.0! Explanation was effective.")
else:
    print("No matching explanation found.")


Original Code:
 {'example.py': 'def writeBoolean(self, n):\n        """\n        Writes a Boolean to the stream.\n        """\n        t = TYPE_BOOL_TRUE\n\n        if n is False:\n            t = TYPE_BOOL_FALSE\n\n        self.stream.write(t)'}

Generated Code:
 {'example.py': '```python\ndef flip_stream(n):\n    current_value = 1 if (n % 2 == 1) else 0\n    return current_value\n```'}

Explanation:
 The doc string and code both write the boolean value 1 to the stream. The code uses a condition based on n, flipping between TRUE and FALSE values. The stream is a mechanism for writing to a binary stream.


The docstring and code both write the boolean value 1 to the stream. The code uses a condition based on n, flipping between TRUE and FALSE values. The stream is a mechanism for writing to a binary stream.

Reward: -1.0
Metadata: {'error': 'count of <think> is not 1'}
⚠️ Explanation did not lead to perfect match. Review metadata above.


In [18]:
print("Original Code:")
print(original_code)

print("\nGenerated Code:")
print(generated_code)

print("\nExplanation Output:")
print(output)

print("\nReward and Metadata:")
reward, metadata = sw.core.reward.calculate_search_replace_reward(generated_code, original_code, output)
print("Reward:", reward)
print("Metadata:", metadata)


Original Code:
{'example.py': 'def writeBoolean(self, n):\n        """\n        Writes a Boolean to the stream.\n        """\n        t = TYPE_BOOL_TRUE\n\n        if n is False:\n            t = TYPE_BOOL_FALSE\n\n        self.stream.write(t)'}

Generated Code:
{'example.py': '```python\ndef flip_stream(n):\n    current_value = 1 if (n % 2 == 1) else 0\n    return current_value\n```'}

Explanation Output:
The doc string and code both write the boolean value 1 to the stream. The code uses a condition based on n, flipping between TRUE and FALSE values. The stream is a mechanism for writing to a binary stream.


The docstring and code both write the boolean value 1 to the stream. The code uses a condition based on n, flipping between TRUE and FALSE values. The stream is a mechanism for writing to a binary stream.

Reward and Metadata:
Reward: -1.0
Metadata: {'error': 'count of <think> is not 1'}


# Cleaning of Code - Round 2

In [9]:
import ast
import re
import textwrap

def clean_to_function_or_class(text):
    if not isinstance(text, str):
        return ""

    text = textwrap.dedent(text).strip()

    def extract_method_signature(text):
        """
        Extracts method signatures in the format ClassName.method_name(self, ...)
        """
        if not isinstance(text, str):
            return ""

        # Match things like: LabeledMatrix.add(self, other)
        pattern = re.compile(r'\b(\w+\.\w+\s*\([^)]*\))')
        matches = pattern.findall(text)

        return matches[-1].strip() if matches else ""
    
    def remove_empty_docstring(code):
        lines = code.splitlines()
        cleaned = []
        skip_next = False
        for i, line in enumerate(lines):
            if i < len(lines) - 1 and re.match(r'^\s*"""\s*"""\s*$', line) and re.match(r'^\s*"""\s*"""\s*$', lines[i+1]):
                skip_next = True
                continue
            elif skip_next:
                skip_next = False
                continue
            cleaned.append(line)
        return "\n".join(cleaned)

    try:
        tree = ast.parse(text)
        for node in reversed(tree.body):
            if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
                segment = ast.get_source_segment(text, node)
                return remove_empty_docstring(segment)
    except SyntaxError:
        pass

    # More flexible fallback regex: allow any indent and line count
    func_pattern = re.compile(
        r"^(def\s+\w+\(.*?\):\s*\n(?:[ \t]+.+\n?)+)", re.MULTILINE)
    class_pattern = re.compile(
        r"^(class\s+\w+.*?:\s*\n(?:[ \t]+.+\n?)+)", re.MULTILINE)


    matches = func_pattern.findall(text) or class_pattern.findall(text)
    if matches:
        return remove_empty_docstring(matches[-1].strip())

    return extract_method_signature(text)


In [10]:
explanation = """def timespan(start_time):  
    """""" """"""  
    timespan = datetime.datetime.now() - start_time  
    timespan_ms = timespan.total_seconds() * 1000  
    return timespan_ms  
  

Wait, I think I might have missed something in the description. Let me read it again. The function is called timespan, and it's supposed to return the time in milliseconds from start_time. The start_time is passed as a parameter. The function uses datetime.datetime.now() to get the current time, subtracts start_time, converts to total seconds, multiplies by 1000 to get milliseconds, and returns that.

So, the Python function should take start_time as an argument, compute the difference, convert to milliseconds, and return.

Looking at the code provided, the function is written as def timespan(start_time):, which matches the description. Then the docstring is written as """"""Return time in milliseconds from start_time"""""" followed by the code.

But in the code I wrote, the function does return timespan_ms, which is correct. So the code seems to match.

Wait, but in the description, the first line is a function definition. Then the docstring is multi-line. But in the code I wrote, the docstring is empty, and the code is correctly written. So perhaps the initial code provided by the user is wrong. The user wants the function name and structure to exactly match the description.

So, according to the description, the function should be named timespan, take a parameter start_time, and return the difference in milliseconds.

Looking at the code the user provided, it's written as:

def timespan(start_time):  
    """""" """"""  
    timespan = datetime.datetime.now() - start_time  
    timespan_ms = timespan.total_seconds() * 1000  
    return timespan_ms  

Which is not matching. The first line is a function definition. Then the docstring is empty, and the code is correct. So that doesn't match the description.

But the user's code to write is the function. So perhaps I should write the correct function code, regardless of the provided code.

Wait, the user said: ""Do not provide explanations, comments, markdown, parameter descriptions, or return values. Ensure that the function name and structure exactly match the description.""

So I need to write the Python function as per the description. So the function should be named timespan, take start_time as a parameter, compute the difference, return in milliseconds.

In Python, that would be:

def timespan(start_time):  
    timespan = datetime.datetime.now() - start_time  
    timespan_ms = timespan.total_seconds() * 1000  
    return timespan_ms  

So that's the code.

Looking back at the code I provided, I had:

def timespan(start_time):  
    """""" """"""  
    timespan = datetime.datetime.now() - start_time  
    timespan_ms = timespan.total_seconds() * 1000  
    return timespan_ms  

Which is not correct, because the docstring is empty. So perhaps I made a mistake earlier.

But according to the description, the function is named timespan, takes start_time as a parameter, and returns the time in milliseconds.

So the correct code is as I wrote above.

So the Python function is as written.
</think>

def timespan(start_time):  
    timespan = datetime.datetime.now() - start_time  
    timespan_ms = timespan.total_seconds() * 1000  
    return timespan_ms"""
clean_to_function_or_class(explanation)


'def timespan(start_time):  \n    timespan = datetime.datetime.now() - start_time  \n    timespan_ms = timespan.total_seconds() * 1000  \n    return timespan_ms'

In [11]:
exp2 = """
"</think>

To create a Python context manager that adheres to the described functionality, we can define the `context` function as follows:

```python
class ActionContextManager:
    def context(self):
        parent = self._ACTION_CONTEXT.set(self)
        try:
            yield self
        finally:
            self._ACTION_CONTEXT.reset(parent)
```"
"""
clean_to_function_or_class(exp2)


'class ActionContextManager:\n    def context(self):\n        parent = self._ACTION_CONTEXT.set(self)\n        try:\n            yield self\n        finally:\n            self._ACTION_CONTEXT.reset(parent)'

In [12]:
exp3="""
"LabeledMatrix.add(self, other)

Let's start by opening a docstring that explains the purpose of the code snippet, which is to add two matrices with the same shape, expecting one dimension in Python.

Then, the method checks for the assertion that the number of columns in self.matrix and other.matrix are equal. If not, it raises an error.

After that, it concatenates the self.matrix and other.matrix along axis=0, creating a new matrix.

Finally, it returns a new LabeledMatrix object with the concatenated matrix and the same labels as self.

Each statement in the code snippet is crucial for the functionality. The docstring provides context and purpose, the method checks for consistency before proceeding, the concatenation combines the matrices correctly, and returning the new object ensures the class maintains its structure.

Now, let's write the Python function corresponding to the description.
</think>

LabeledMatrix.add(self, other)"
"""
clean_to_function_or_class(exp3)


'LabeledMatrix.add(self, other)'

In [21]:
import pandas as pd
inputdf = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CoSQA_explanations_query_code.csv")
csv = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/cleaned_deepseek_missing_val.csv")
csv2 = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_code_1/code_generation/cleaned_deepseek_exps_result.csv")
cleaned = [clean_to_function_or_class(desc) for desc in csv2["Generated_Code_deepseek_2"]]
cleaned_series = pd.Series(cleaned)

# Find missing or empty values
missing_in_cleaned = cleaned_series.isnull() | (cleaned_series == "")
missing_in_generated = csv["Generated_Code_deepseek_2"].isnull() | (csv["Generated_Code_deepseek_2"].astype(str).str.strip() == "")

# Print counts
print("Missing or empty values in cleaned:", missing_in_cleaned.sum())
print("Missing or empty values in csv['Generated_Code_deepseek_2']:", missing_in_generated.sum())

# Optional: Print indices
print("Indices with missing/empty in cleaned:", cleaned_series[missing_in_cleaned].index.tolist())
print("Indices with missing/empty in Generated_Code_deepseek_2:", csv[missing_in_generated].index.tolist())





Missing or empty values in cleaned: 872
Missing or empty values in csv['Generated_Code_deepseek_2']: 2051
Indices with missing/empty in cleaned: [61, 78, 101, 111, 118, 191, 222, 244, 256, 317, 326, 362, 365, 519, 535, 556, 635, 647, 689, 699, 702, 810, 838, 848, 884, 907, 935, 939, 964, 966, 1032, 1054, 1090, 1111, 1139, 1141, 1178, 1202, 1252, 1281, 1320, 1327, 1363, 1373, 1464, 1472, 1512, 1530, 1577, 1662, 1731, 1736, 1756, 1757, 1807, 1827, 1846, 1901, 1958, 1975, 2049, 2056, 2083, 2097, 2102, 2130, 2165, 2182, 2324, 2336, 2377, 2395, 2435, 2467, 2502, 2538, 2611, 2628, 2633, 2665, 2772, 2788, 2792, 2797, 2827, 2836, 2850, 2911, 2928, 2965, 3015, 3044, 3060, 3144, 3168, 3197, 3198, 3202, 3209, 3210, 3221, 3240, 3271, 3276, 3291, 3295, 3340, 3361, 3378, 3471, 3516, 3548, 3568, 3570, 3659, 3663, 3673, 3677, 3718, 3758, 3767, 3778, 3795, 3814, 3832, 3841, 3877, 3894, 3930, 3995, 4020, 4098, 4138, 4153, 4170, 4189, 4231, 4292, 4328, 4379, 4414, 4431, 4443, 4487, 4489, 4552, 4558, 4562

In [22]:
cleaned_series[missing_in_cleaned].index.tolist()


[61,
 78,
 101,
 111,
 118,
 191,
 222,
 244,
 256,
 317,
 326,
 362,
 365,
 519,
 535,
 556,
 635,
 647,
 689,
 699,
 702,
 810,
 838,
 848,
 884,
 907,
 935,
 939,
 964,
 966,
 1032,
 1054,
 1090,
 1111,
 1139,
 1141,
 1178,
 1202,
 1252,
 1281,
 1320,
 1327,
 1363,
 1373,
 1464,
 1472,
 1512,
 1530,
 1577,
 1662,
 1731,
 1736,
 1756,
 1757,
 1807,
 1827,
 1846,
 1901,
 1958,
 1975,
 2049,
 2056,
 2083,
 2097,
 2102,
 2130,
 2165,
 2182,
 2324,
 2336,
 2377,
 2395,
 2435,
 2467,
 2502,
 2538,
 2611,
 2628,
 2633,
 2665,
 2772,
 2788,
 2792,
 2797,
 2827,
 2836,
 2850,
 2911,
 2928,
 2965,
 3015,
 3044,
 3060,
 3144,
 3168,
 3197,
 3198,
 3202,
 3209,
 3210,
 3221,
 3240,
 3271,
 3276,
 3291,
 3295,
 3340,
 3361,
 3378,
 3471,
 3516,
 3548,
 3568,
 3570,
 3659,
 3663,
 3673,
 3677,
 3718,
 3758,
 3767,
 3778,
 3795,
 3814,
 3832,
 3841,
 3877,
 3894,
 3930,
 3995,
 4020,
 4098,
 4138,
 4153,
 4170,
 4189,
 4231,
 4292,
 4328,
 4379,
 4414,
 4431,
 4443,
 4487,
 4489,
 4552,
 4558,
 45

In [24]:
csv2.iloc[61]["Generated_Code_deepseek_2"]

"number = 1.2345\nprec = 0.678\nprec = int(prec)\nnumber = int(number)\nrounded = (int(number) + prec/2) // prec * prec\nreturn rounded\n\nWait, but I think that (int(number) + prec/2) // prec * prec could be causing issues when the number is very small or when precision is very small. Also, when the number is already an integer, this approach should return the integer without any decimal points.\n\nAnother approach is to multiply the number by 10^precision, round it to the nearest integer, then divide by 10^precision.\n\nBut in Python, for large numbers, multiplying by 10^precision might cause floating point inaccuracies, but I think for the purpose of this function, it's acceptable.\n\nAlternatively, we can use the round() function in a clever way. For example, multiplying the number by 10^precision, rounding to the nearest integer, then dividing by 10^precision.\n\nSo, another formula is: rounded = round(number * (10 ** precision)) / (10 ** precision)\n\nBut we have to handle the ca

# Tree-Sitter

In [48]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
from collections import defaultdict
import math
import re

class CodeSimilarityAnalyzer:
    def __init__(self):
        # Initialize Tree-sitter parser
        self.language = Language(tspython.language())
        self.parser = Parser(self.language)
        
        # Keywords to preserve during normalization
        self.preserved_keywords = {
            'True', 'False', 'None', 'and', 'or', 'not', 'if', 'else', 'elif',
            'for', 'while', 'break', 'continue', 'def', 'class', 'return', 'import'
        }
        
        # Pre-compile queries
        self.identifier_query = self.language.query("""
            (identifier) @id
            (#not-eq? @id "True")
            (#not-eq? @id "False")
            (#not-eq? @id "None")
        """)
        
        self.function_query = self.language.query("""
            (function_definition
                name: (identifier) @name)
        """)

    def normalize_code(self, code):
        """Normalize variable/function names while preserving structure"""
        tree = self.parser.parse(bytes(code, 'utf8'))
        code_lines = code.splitlines(keepends=True)
        replacements = {}
        var_counter = 1
        func_counter = 1

        # Process function names
        for match in self.function_query.matches(tree.root_node):
            node = match[0][1]
            original = node.text.decode('utf8')
            if original not in replacements:
                replacements[original] = f"func{func_counter}"
                func_counter += 1

        # Process variable names
        for match in self.identifier_query.matches(tree.root_node):
            node = match[0][1]
            original = node.text.decode('utf8')
            
            # Skip preserved keywords and capitalized names
            if (original in self.preserved_keywords or 
                re.match(r'^[A-Z]', original) or
                original in replacements):
                continue
                
            replacements[original] = f"var{var_counter}"
            var_counter += 1

        # Apply replacements
        if replacements:
            # Process functions first
            for match in self.function_query.matches(tree.root_node):
                node = match[0][1]
                original = node.text.decode('utf8')
                if original in replacements:
                    start_line, start_col = node.start_point
                    end_line, end_col = node.end_point
                    if start_line == end_line:
                        line = code_lines[start_line]
                        code_lines[start_line] = (line[:start_col] + 
                                               replacements[original] + 
                                               line[end_col:])
            
            # Then process variables
            for match in self.identifier_query.matches(tree.root_node):
                node = match[0][1]
                original = node.text.decode('utf8')
                if original in replacements:
                    start_line, start_col = node.start_point
                    end_line, end_col = node.end_point
                    if start_line == end_line:
                        line = code_lines[start_line]
                        code_lines[start_line] = (line[:start_col] + 
                                               replacements[original] + 
                                               line[end_col:])

        return ''.join(code_lines)

    def extract_structural_features(self, code):
        """Extract AST patterns from normalized code"""
        tree = self.parser.parse(bytes(code, 'utf8'))
        features = defaultdict(float)
        
        def _walk(node, parent_type=None, depth=0):
            # Weight features by depth (shallow nodes matter more)
            weight = 1.0 / (1 + depth)
            
            # Node type with parent context
            if parent_type:
                feature_key = f"{parent_type}→{node.type}"
                features[feature_key] += weight
            
            # Node type alone
            features[node.type] += weight
            
            # Child relationships
            for child in node.children:
                _walk(child, node.type, depth + 1)
                
        _walk(tree.root_node)
        return features

    def cosine_similarity(self, vec1, vec2):
        """Calculate cosine similarity between feature vectors"""
        all_features = set(vec1.keys()).union(set(vec2.keys()))
        dot_product = 0.0
        mag1 = 0.0
        mag2 = 0.0
        
        for feature in all_features:
            v1 = vec1.get(feature, 0.0)
            v2 = vec2.get(feature, 0.0)
            dot_product += v1 * v2
            mag1 += v1 ** 2
            mag2 += v2 ** 2
            
        mag1 = math.sqrt(mag1)
        mag2 = math.sqrt(mag2)
        
        if mag1 == 0 or mag2 == 0:
            return 0.0
            
        return min(max(dot_product / (mag1 * mag2), 0.0), 1.0)

    def compare_code(self, code1, code2):
        """Complete comparison pipeline"""
        # Step 1: Normalize both code snippets
        norm1 = self.normalize_code(code1)
        norm2 = self.normalize_code(code2)
        
        # Step 2: Parse normalized code
        try:
            # Step 3: Extract structural features
            features1 = self.extract_structural_features(norm1)
            features2 = self.extract_structural_features(norm2)
            
            # Step 4: Calculate similarity
            similarity = self.cosine_similarity(features1, features2)
            
            return {
                'similarity': similarity,
                'normalized_code1': norm1,
                'normalized_code2': norm2
            }
        except Exception as e:
            return {
                'error': f"Comparison failed: {str(e)}",
                'similarity': 0.0
            }

# Example Usage
if __name__ == "__main__":
    analyzer = CodeSimilarityAnalyzer()
    
    # Example 1: Similar structure, different names
    code1 = """
    def calculate_total(items):
        sum = 0
        for item in items:
            sum += item.price
        return sum
    """
    
    code2 = """
    def compute_sum(products):
        total = 0
        for product in products:
            total += product.cost
        return total
    """
    
    # Example 2: Different structure
    code3 = """
    def process_data(input_list):
        return [x*2 for x in input_list]
    """
    
    # Compare code
    result = analyzer.compare_code(code1, code2)
    print(f"Similarity between code1 and code2: {result['similarity']:.2f}")
    print("Normalized code1:")
    print(result['normalized_code1'])
    print("Normalized code2:")
    print(result['normalized_code2'])
    
    result = analyzer.compare_code(code1, code3)
    print(f"\nSimilarity between code1 and code3: {result['similarity']:.2f}")

    # For your specific comparison:
    PY_LANGUAGE = Language(tspython.language())
    parser = Parser()
    parser.set_language(PY_LANGUAGE)
    
    def get_function_names(code):
        """Extract function names from code"""
        tree = parser.parse(bytes(code, 'utf8'))
        query = PY_LANGUAGE.query("""
        (function_definition
            name: (identifier) @name)
        """)
        return [match[0][1].text.decode('utf8') for match in query.matches(tree.root_node)]
    
    def is_valid_python(code):
        """Check if code is valid Python syntax"""
        try:
            parser.parse(bytes(code, 'utf8'))
            return True
        except:
            return False
    
    # Example usage with your data:
    generated_code = "def example(): pass"  # Replace with actual code from csv2
    reference_code = "def sample(): pass"   # Replace with actual code from csv2
    
    print("\nValidity check:")
    print("Reference code valid:", is_valid_python(reference_code))
    print("Generated code valid:", is_valid_python(generated_code))
    
    print("\nFunction name comparison:")
    gen_func_names = get_function_names(generated_code)
    ref_func_names = get_function_names(reference_code)
    print("Generated function names:", gen_func_names)
    print("Reference function names:", ref_func_names)
    print("Match:", gen_func_names == ref_func_names)

Similarity: 0.00


KeyError: 'normalized_code1'

In [56]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
from collections import defaultdict
import math
import re

class CodeSimilarityAnalyzer:
    def __init__(self):
        # Initialize Tree-sitter parser
        self.language = Language(tspython.language())
        self.parser = Parser(self.language)
        
        # Keywords to preserve during normalization
        self.preserved_keywords = {
            'True', 'False', 'None', 'and', 'or', 'not', 'if', 'else', 'elif',
            'for', 'while', 'break', 'continue', 'def', 'class', 'return', 'import'
        }
        
        # Corrected queries with proper capture names
        self.identifier_query = self.language.query("""
            (identifier) @variable
            (#not-eq? @variable "True")
            (#not-eq? @variable "False")
            (#not-eq? @variable "None")
        """)
        
        self.function_query = self.language.query("""
            (function_definition
                name: (identifier) @function_name)
        """)

    
    def normalize_code(self, code):
        tree = self.parser.parse(bytes(code, 'utf8'))
        code_bytes = bytearray(code, 'utf8')
        replacements = {}
        var_counter = 1
        func_counter = 1

        # Capture function names
        function_captures = [
            (node, capture) for node, capture in self.function_query.captures(tree.root_node)
            if capture == "function_name"
        ]
        # Capture variable identifiers
        identifier_captures = [
            (node, capture) for node, capture in self.identifier_query.captures(tree.root_node)
            if capture == "variable"
        ]

        # Combine and sort all captures by start_byte DESCENDING (important!)
        all_captures = function_captures + identifier_captures
        all_captures.sort(key=lambda x: x[0].start_byte, reverse=True)

        for node, capture in all_captures:
            original = node.text.decode('utf8')
            
            # Skip preserved keywords and already replaced
            if original in self.preserved_keywords or re.match(r'^[A-Z]', original):
                continue

            if original not in replacements:
                if capture == "function_name":
                    replacements[original] = f"func{func_counter}"
                    func_counter += 1
                else:
                    replacements[original] = f"var{var_counter}"
                    var_counter += 1

            replacement = replacements[original]
            code_bytes[node.start_byte:node.end_byte] = replacement.encode("utf8")

        return code_bytes.decode("utf8")


    def extract_structural_features(self, code):
        """Extract AST patterns from normalized code"""
        tree = self.parser.parse(bytes(code, 'utf8'))
        features = defaultdict(float)
        
        def _walk(node, parent_type=None, depth=0):
            weight = 1.0 / (1 + depth)
            
            if parent_type:
                feature_key = f"{parent_type}→{node.type}"
                features[feature_key] += weight
            
            features[node.type] += weight
            
            for child in node.children:
                _walk(child, node.type, depth + 1)
                
        _walk(tree.root_node)
        return features

    def cosine_similarity(self, vec1, vec2):
        """Calculate cosine similarity between feature vectors"""
        all_features = set(vec1.keys()).union(set(vec2.keys()))
        dot_product = 0.0
        mag1 = 0.0
        mag2 = 0.0
        
        for feature in all_features:
            v1 = vec1.get(feature, 0.0)
            v2 = vec2.get(feature, 0.0)
            dot_product += v1 * v2
            mag1 += v1 ** 2
            mag2 += v2 ** 2
            
        mag1 = math.sqrt(mag1)
        mag2 = math.sqrt(mag2)
        
        if mag1 == 0 or mag2 == 0:
            return 0.0
            
        return min(max(dot_product / (mag1 * mag2), 0.0), 1.0)

    def compare_code(self, code1, code2):
        """Complete comparison pipeline"""
        try:
            norm1 = self.normalize_code(code1)
            norm2 = self.normalize_code(code2)
            
            features1 = self.extract_structural_features(norm1)
            features2 = self.extract_structural_features(norm2)
            
            similarity = self.cosine_similarity(features1, features2)
            
            return {
                'similarity': similarity,
                'normalized_code1': norm1,
                'normalized_code2': norm2
            }
        except Exception as e:
            print(f"Error during comparison: {str(e)}")  # Debug output
            return {
                'error': f"Comparison failed: {str(e)}",
                'similarity': 0.0
            }

# Example Usage
if __name__ == "__main__":
    analyzer = CodeSimilarityAnalyzer()
    
    # Test case 1: Similar structure, different names
    code1 = """
def calculate(items):
    total = 0
    for item in items:
        total += item.price
    return total
"""
    
    code2 = """
def compute(products):
    sum = 0
    for product in products:
        sum += product.cost
    return sum
"""
    result = analyzer.compare_code(code1, code2)
    print(f"Similarity (should be high): {result['similarity']:.2f}")
    print("Normalized code 1:")
    print(result['normalized_code1'])
    print("Normalized code 2:")
    print(result['normalized_code2'])
    
    # Test case 2: Different structure
    code3 = """
def process(inputs):
    return [x*2 for x in inputs]
"""
    result = analyzer.compare_code(code1, code3)
    print(f"\nSimilarity (should be low): {result['similarity']:.2f}")

Error during comparison: too many values to unpack (expected 2)
Similarity (should be high): 0.00
Normalized code 1:


KeyError: 'normalized_code1'

In [59]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
from collections import defaultdict
import math
import re

class CodeSimilarityAnalyzer:
    def __init__(self):
        try:
            # Initialize Tree-sitter parser
            self.language = Language(tspython.language())
            self.parser = Parser(self.language)
            
            # Keywords to preserve during normalization
            self.preserved_keywords = {
                'True', 'False', 'None', 'and', 'or', 'not', 'if', 'else', 'elif',
                'for', 'while', 'break', 'continue', 'def', 'class', 'return', 'import'
            }
            
            # Queries for capturing identifiers
            self.function_query = self.language.query("""
                (function_definition
                    name: (identifier) @func_name)
            """)
            
            self.variable_query = self.language.query("""
                (identifier) @variable
                (#not-eq? @variable "True")
                (#not-eq? @variable "False")
                (#not-eq? @variable "None")
            """)
            
        except Exception as e:
            raise RuntimeError(f"Failed to initialize analyzer: {str(e)}")

    def normalize_code(self, code):
        """Normalize variable/function names while preserving structure"""
        try:
            tree = self.parser.parse(bytes(code, 'utf8'))
            if not tree.root_node:
                return code
                
            # We'll build the normalized code by chunks
            chunks = []
            last_pos = 0
            replacements = {}
            var_counter = 1
            func_counter = 1

            # First pass: collect all identifiers that need replacement
            identifiers = []
            
            # Function names
            for node, tag in self.function_query.captures(tree.root_node):
                if tag == "func_name":
                    original = node.text.decode('utf8')
                    if original not in replacements:
                        replacements[original] = f"func{func_counter}"
                        func_counter += 1
                    identifiers.append((node.start_byte, node.end_byte, replacements[original]))
            
            # Variable names
            for node, tag in self.variable_query.captures(tree.root_node):
                if tag == "variable":
                    original = node.text.decode('utf8')
                    if (original not in replacements and 
                        original not in self.preserved_keywords and
                        not re.match(r'^[A-Z]', original)):
                        replacements[original] = f"var{var_counter}"
                        var_counter += 1
                    if original in replacements:
                        identifiers.append((node.start_byte, node.end_byte, replacements[original]))
            
            # Sort identifiers by position
            identifiers.sort()
            
            # Rebuild the code with replacements
            code_bytes = bytes(code, 'utf8')
            last_pos = 0
            for start, end, replacement in identifiers:
                chunks.append(code_bytes[last_pos:start])
                chunks.append(replacement.encode('utf8'))
                last_pos = end
            chunks.append(code_bytes[last_pos:])
            
            return b''.join(chunks).decode('utf8')
            
        except Exception as e:
            print(f"Normalization error: {str(e)}")
            return code

    def extract_structural_features(self, code):
        """Extract AST patterns from normalized code"""
        try:
            tree = self.parser.parse(bytes(code, 'utf8'))
            if not tree.root_node:
                return defaultdict(float)
                
            features = defaultdict(float)
            
            def _walk(node, parent_type=None, depth=0):
                weight = 1.0 / (1 + depth)
                
                if parent_type:
                    feature_key = f"{parent_type}→{node.type}"
                    features[feature_key] += weight
                
                features[node.type] += weight
                
                for child in node.children:
                    _walk(child, node.type, depth + 1)
                    
            _walk(tree.root_node)
            return features
            
        except Exception as e:
            print(f"Feature extraction error: {str(e)}")
            return defaultdict(float)

    def cosine_similarity(self, vec1, vec2):
        """Calculate cosine similarity between feature vectors"""
        try:
            all_features = set(vec1.keys()).union(set(vec2.keys()))
            dot_product = 0.0
            mag1 = 0.0
            mag2 = 0.0
            
            for feature in all_features:
                v1 = vec1.get(feature, 0.0)
                v2 = vec2.get(feature, 0.0)
                dot_product += v1 * v2
                mag1 += v1 ** 2
                mag2 += v2 ** 2
                
            mag1 = math.sqrt(mag1)
            mag2 = math.sqrt(mag2)
            
            if mag1 == 0 or mag2 == 0:
                return 0.0
                
            return min(max(dot_product / (mag1 * mag2), 0.0), 1.0)
            
        except Exception as e:
            print(f"Similarity calculation error: {str(e)}")
            return 0.0

    def compare_code(self, code1, code2):
        """Complete comparison pipeline"""
        try:
            norm1 = self.normalize_code(code1)
            norm2 = self.normalize_code(code2)
            
            features1 = self.extract_structural_features(norm1)
            features2 = self.extract_structural_features(norm2)
            
            similarity = self.cosine_similarity(features1, features2)
            
            return {
                'similarity': similarity,
                'normalized_code1': norm1,
                'normalized_code2': norm2,
                'error': None
            }
            
        except Exception as e:
            error_msg = f"Comparison failed: {str(e)}"
            print(error_msg)
            return {
                'similarity': 0.0,
                'normalized_code1': code1,
                'normalized_code2': code2,
                'error': error_msg
            }


if __name__ == "__main__":
    try:
        analyzer = CodeSimilarityAnalyzer()
        
        # Test case 1: Similar structure, different names
        code1 = """
def calculate(items):
    total = 0
    for item in items:
        total += item.price
    return total
"""
        code2 = """
def compute(products):
    sum = 0
    for product in products:
        sum += product.cost
    return sum
"""
        result = analyzer.compare_code(code1, code2)
        print("\nTest 1 - Similar structure:")
        print(f"Similarity: {result['similarity']:.2f}")
        print("Normalized code 1:")
        print(result['normalized_code1'])
        print("Normalized code 2:")
        print(result['normalized_code2'])
        
        # Test case 2: Different structure
        code3 = """
def process(inputs):
    return [x*2 for x in inputs]
"""
        result = analyzer.compare_code(code1, code3)
        print("\nTest 2 - Different structure:")
        print(f"Similarity: {result['similarity']:.2f}")
        
    except Exception as e:
        print(f"Initialization failed: {str(e)}")

Initialization failed: Failed to initialize analyzer: 'tree_sitter.Parser' object has no attribute 'set_language'


In [8]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser
from collections import defaultdict
import math
import re

class PythonCodeComparator:
    def __init__(self):
        # Initialize Python parser
        self.language = Language(tspython.language())
        self.parser = Parser(self.language)
        self.parser.set_language()
        
        # Python-specific configuration
        self.PYTHON_KEYWORDS = {
            'False', 'None', 'True', 'and', 'as', 'assert', 'async', 'await',
            'break', 'class', 'continue', 'def', 'del', 'elif', 'else', 'except',
            'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is',
            'lambda', 'nonlocal', 'not', 'or', 'pass', 'raise', 'return',
            'try', 'while', 'with', 'yield'
        }
        
        # Python-specific AST queries
        self._setup_queries()
        
    def _setup_queries(self):
        """Initialize Python-specific Tree-sitter queries"""
        self.function_query = self.language.query("""
            (function_definition
                name: (identifier) @function_name) @function
            (parameters (identifier) @parameter)
        """)
        
        self.variable_query = self.language.query("""
            (identifier) @variable
            (#not-any-eq? @variable "True" "False" "None")
        """)
        
        self.class_query = self.language.query("""
            (class_definition
                name: (identifier) @class_name) @class
        """)
    
    def _should_normalize(self, identifier):
        """Check if an identifier should be normalized"""
        return (identifier not in self.PYTHON_KEYWORDS and
                not re.match(r'^__\w+__$', identifier) and  # Skip dunder methods
                not identifier.startswith('_'))  # Skip private members
    
    def normalize_python_code(self, code):
        """Normalize Python code while preserving structure"""
        try:
            tree = self.parser.parse(bytes(code, 'utf8'))
            if not tree.root_node:
                return code
                
            # Track replacements
            replacements = {}
            name_counters = {
                'function': 1,
                'class': 1,
                'parameter': 1,
                'variable': 1
            }
            
            # Process classes first
            for capture in self.class_query.captures(tree.root_node):
                node = capture[0]
                name = capture[1]
                if name == 'class_name':
                    original = node.text.decode('utf8')
                    if self._should_normalize(original):
                        replacements[(node.start_byte, node.end_byte)] = f"Class{name_counters['class']}"
                        name_counters['class'] += 1
            
            # Process functions and parameters
            for capture in self.function_query.captures(tree.root_node):
                node = capture[0]
                name = capture[1]
                original = node.text.decode('utf8')
                if name == 'function_name' and self._should_normalize(original):
                    replacements[(node.start_byte, node.end_byte)] = f"func{name_counters['function']}"
                    name_counters['function'] += 1
                elif name == 'parameter' and self._should_normalize(original):
                    replacements[(node.start_byte, node.end_byte)] = f"param{name_counters['parameter']}"
                    name_counters['parameter'] += 1
            
            # Process other variables
            for capture in self.variable_query.captures(tree.root_node):
                node = capture[0]
                name = capture[1]
                if name == 'variable':
                    original = node.text.decode('utf8')
                    if self._should_normalize(original) and (node.start_byte, node.end_byte) not in replacements:
                        replacements[(node.start_byte, node.end_byte)] = f"var{name_counters['variable']}"
                        name_counters['variable'] += 1
            
            # Apply replacements in reverse order
            sorted_replacements = sorted(replacements.items(), key=lambda x: x[0][0], reverse=True)
            code_bytes = bytearray(code, 'utf8')
            
            for (start, end), new_name in sorted_replacements:
                code_bytes[start:end] = new_name.encode('utf8')
            
            return code_bytes.decode('utf8')
            
        except Exception as e:
            print(f"Normalization error: {str(e)}")
            return code
    
    # [Rest of the methods remain unchanged]
    def get_structural_features(self, code):
        """Extract Python-specific structural features"""
        try:
            tree = self.parser.parse(bytes(code, 'utf8'))
            if not tree.root_node:
                return defaultdict(float)
                
            features = defaultdict(float)
            
            def _walk(node, context=None):
                # Python-specific feature weighting
                weight = 1.0 / (1 + node.start_point[0])  # Weight by line depth
                
                # Track node type with context
                if context:
                    features[f"{context}>{node.type}"] += weight
                
                # Python-specific features
                if node.type == "function_definition":
                    features["function"] += weight * 2  # Extra weight for functions
                elif node.type == "class_definition":
                    features["class"] += weight * 1.5
                elif node.type == "list_comprehension":
                    features["comprehension"] += weight
                
                features[node.type] += weight
                
                # Walk children with current node as context
                for child in node.children:
                    _walk(child, node.type)
                    
            _walk(tree.root_node)
            return features
            
        except Exception as e:
            print(f"Feature extraction error: {str(e)}")
            return defaultdict(float)
    
    def compare(self, code1, code2):
        """Compare two Python code snippets"""
        try:
            # Normalize both snippets
            norm1 = self.normalize_python_code(code1)
            norm2 = self.normalize_python_code(code2)
            
            # Extract features
            features1 = self.get_structural_features(norm1)
            features2 = self.get_structural_features(norm2)
            
            # Calculate similarity
            similarity = self._cosine_similarity(features1, features2)
            
            return {
                'similarity': round(similarity, 2),
                'normalized1': norm1,
                'normalized2': norm2
            }
        except Exception as e:
            return {
                'similarity': 0.0,
                'error': str(e)
            }
    
    def _cosine_similarity(self, vec1, vec2):
        """Calculate cosine similarity between feature vectors"""
        all_features = set(vec1.keys()).union(set(vec2.keys()))
        dot = sum(vec1.get(f, 0) * vec2.get(f, 0) for f in all_features)
        mag1 = math.sqrt(sum(v**2 for v in vec1.values()))
        mag2 = math.sqrt(sum(v**2 for v in vec2.values()))
        return dot / (mag1 * mag2) if (mag1 * mag2) > 0 else 0.0


# Test Cases
if __name__ == "__main__":
    comparator = PythonCodeComparator()
    
    # Test 1: Similar functions
    code1 = """
def calculate_total(items):
    result = 0
    for item in items:
        result += item.price
    return result
"""
    code2 = """
def compute_sum(products):
    total = 0
    for product in products:
        total += product.cost
    return total
"""
    result = comparator.compare(code1, code2)
    print("=== Test 1: Similar Functions ===")
    print(f"Similarity: {result['similarity']}")
    print("Normalized 1:")
    print(result['normalized1'])
    print("Normalized 2:")
    print(result['normalized2'])

Normalization error: 'str' object has no attribute 'text'
Normalization error: 'str' object has no attribute 'text'
=== Test 1: Similar Functions ===
Similarity: 1.0
Normalized 1:

def calculate_total(items):
    result = 0
    for item in items:
        result += item.price
    return result

Normalized 2:

def compute_sum(products):
    total = 0
    for product in products:
        total += product.cost
    return total



In [2]:
from tree_sitter import Parser, Language
import os
import tree_sitter_python as tspython

# Initialize Tree-sitter (run this once)
def initialize_parser(language_name):
    # Point this to your tree-sitter languages repository
    language = Language(tspython.language())
    return language

# For Python (modify for other languages)
PYTHON_LANGUAGE = initialize_parser('python')

class CodeComparator:
    def __init__(self, language=PYTHON_LANGUAGE):
        self.parser = Parser(language)
        
        self.identifier_types = {
            'python': {
                'variable': ['identifier', 'variable_name'],
                'function': ['function_definition>identifier', 'call>identifier'],
                'class': ['class_definition>identifier'],
                'parameter': ['parameters>identifier', 'lambda_parameters>identifier'],
                'attribute': ['attribute>identifier'],
                'constant': ['identifier&uppercase']
            }
            # Add other language configurations here
        }
        # Initialize counters here to avoid attribute errors
        self.counters = {
            'variable': 1,
            'function': 1,
            'class': 1,
            'parameter': 1,
            'attribute': 1,
            'constant': 1
        }
        self.scopes = [{}]  # Stack of scopes for variable tracking
    
    def parse_code(self, code_str):
        """Parse code string into AST"""
        return self.parser.parse(bytes(code_str, 'utf8'))
    
    def normalize_ast(self, node, language='python'):
        """Normalize AST by replacing all identifiers with generic names"""
        normalized = {'type': node.type}
        
        # Check if this node is an identifier that needs normalization
        identifier_type = self.get_identifier_type(node, language)
        if identifier_type:
            original_name = node.text.decode('utf8')
            
            # Check if we've seen this name in current scope
            if original_name in self.scopes[-1]:
                normalized['text'] = self.scopes[-1][original_name]
            else:
                # Create new normalized name
                norm_name = f"{identifier_type}_{self.counters[identifier_type]}"
                self.counters[identifier_type] += 1
                self.scopes[-1][original_name] = norm_name
                normalized['text'] = norm_name
        
        # Handle scoped constructs (functions, classes, etc.)
        if node.type == 'function_definition':
            self.scopes.append({})  # New scope for function
            normalized['children'] = [self.normalize_ast(child, language) 
                                    for child in node.children]
            self.scopes.pop()
        elif node.type == 'class_definition':
            self.scopes.append({})  # New scope for class
            normalized['children'] = [self.normalize_ast(child, language) 
                                    for child in node.children]
            self.scopes.pop()
        else:
            # Process children normally
            normalized['children'] = [self.normalize_ast(child, language) 
                                    for child in node.children]
        
        return normalized
    
    def get_identifier_type(self, node, language):
        """Determine what kind of identifier this node represents"""
        rules = self.identifier_types.get(language, {})
        
        # Check variable rules
        if node.type in rules.get('variable', []):
            return 'var'
        
        # Check if this is a function name
        if (node.parent and node.parent.type in ['function_definition', 'call'] and 
            node == node.parent.child_by_field_name('name')):
            return 'func'
        
        # Check if this is a class name
        if (node.parent and node.parent.type == 'class_definition' and 
            node == node.parent.child_by_field_name('name')):
            return 'class'
        
        # Check if this is a parameter
        if (node.parent and node.parent.type in ['parameters', 'lambda_parameters']):
            return 'param'
        
        # Check if this is an attribute
        if (node.parent and node.parent.type == 'attribute' and 
            node == node.parent.child_by_field_name('attribute')):
            return 'attr'
        
        # Check for constants (uppercase)
        if node.text.decode('utf8').isupper():
            return 'CONST'
        
        return None
    
    def compare_code(self, code1, code2, language='python'):
        """Compare two code strings after normalization"""
        # Reset state for fresh comparison
        self.counters = {
            'variable': 1,
            'function': 1,
            'class': 1,
            'parameter': 1,
            'attribute': 1,
            'constant': 1
        }
        self.scopes = [{}]
        
        # Parse and normalize first code
        tree1 = self.parse_code(code1)
        norm_ast1 = self.normalize_ast(tree1.root_node, language)
        
        # Reset state again for second code
        self.counters = {
            'variable': 1,
            'function': 1,
            'class': 1,
            'parameter': 1,
            'attribute': 1,
            'constant': 1
        }
        self.scopes = [{}]
        
        # Parse and normalize second code
        tree2 = self.parse_code(code2)
        norm_ast2 = self.normalize_ast(tree2.root_node, language)
        
        # Compare normalized ASTs
        return self.compare_asts(norm_ast1, norm_ast2)
    
    def compare_asts(self, ast1, ast2):
        """Recursively compare two normalized ASTs"""
        if ast1['type'] != ast2['type']:
            return False
        
        # Compare normalized identifiers
        if 'text' in ast1 or 'text' in ast2:
            if 'text' not in ast1 or 'text' not in ast2:
                return False
            # Only compare the prefix (var1 vs var2 should match)
            if ast1['text'].split('_')[0] != ast2['text'].split('_')[0]:
                return False
        
        # Compare children
        if len(ast1.get('children', [])) != len(ast2.get('children', [])):
            return False
        
        for child1, child2 in zip(ast1.get('children', []), ast2.get('children', [])):
            if not self.compare_asts(child1, child2):
                return False
        
        return True

# Example usage
if __name__ == "__main__":
    comparator = CodeComparator()
    
    code1 = """
    def calculate_sum(a, b):
        result = a + b
        return result
    
    class DataProcessor:
        def __init__(self, data):
            self.data = data
    """
    
    code2 = """
    def compute_total(x, y):
        total = x + y
        return total
    
    class InfoHandler:
        def __init__(self, info):
            self.info = info
    """
    
    # These should be structurally identical
    code3 = """
    def func1(param1, param2):
        var1 = param1 + param2
        return var1
    
    class Class1:
        def __init__(self, param1):
            self.attr1 = param1
    """
    
    print("Comparing code1 and code2:", comparator.compare_code(code1, code2))  # True
    print("Comparing code1 and code3:", comparator.compare_code(code1, code3))  # True
    
    # Different structure
    code4 = """
    def process_data(input):
        return input * 2
    """
    print("Comparing code1 and code4:", comparator.compare_code(code1, code4))  # False

KeyError: 'var'

In [10]:
from tree_sitter import Parser, Language
import os
import tree_sitter_python as tspython
import pandas as pd

class ImprovedCodeComparator:
    def __init__(self, language='python'):
        self.language = Language(tspython.language())
        self.parser = Parser(self.language)
        
        # Track identifier usage across scopes
        self.scope_stack = [defaultdict(int)]
        self.current_scope = self.scope_stack[-1]
        self.normalized_code = []
        
        # Configuration
        self.special_tokens = {'True', 'False', 'None'}

    def normalize_code(self, code_str):
        """Normalize code with proper scoping"""
        
        tree = self.parser.parse(bytes(code_str, 'utf8'))
        self._normalize_node(tree.root_node)
        return ' '.join(self.normalized_code)

    def _normalize_node(self, node):
        """Recursive normalization with scope handling"""
        if node.type == 'identifier':
            text = node.text.decode('utf8')
            
            if text in self.special_tokens:
                self.normalized_code.append(text)
                return
                
            # Get normalized name based on scope
            norm_name = self._get_normalized_name(node)
            self.normalized_code.append(norm_name)
            return
            
        # Handle new scopes
        if node.type in ('function_definition', 'class_definition', 'block'):
            self.scope_stack.append(defaultdict(int))
            self.current_scope = self.scope_stack[-1]
            
        for child in node.children:
            self._normalize_node(child)
            
        # Exit scope
        if node.type in ('function_definition', 'class_definition', 'block'):
            self.scope_stack.pop()
            self.current_scope = self.scope_stack[-1]

    def _get_normalized_name(self, node):
        """Get normalized identifier name with proper scoping"""
        text = node.text.decode('utf8')
        parent = node.parent
        
        # Determine identifier type
        if parent.type == 'function_definition' and node == parent.child_by_field_name('name'):
            prefix = 'func'
        elif parent.type == 'class_definition' and node == parent.child_by_field_name('name'):
            prefix = 'class'
        elif parent.type in ('parameters', 'lambda_parameters'):
            prefix = 'param'
        elif text.isupper():
            prefix = 'CONST'
        else:
            prefix = 'var'
            
        # Track usage in current scope
        self.current_scope[(prefix, text)] += 1
        count = self.current_scope[(prefix, text)]
        
        return f"{prefix}_{count}"

    def calculate_similarity(self, norm1, norm2):
        """Calculate similarity between normalized code strings"""
        tokens1 = norm1.split()
        tokens2 = norm2.split()
        
        # Create frequency maps
        freq1 = defaultdict(int)
        freq2 = defaultdict(int)
        
        for token in tokens1:
            freq1[token] += 1
        for token in tokens2:
            freq2[token] += 1
            
        # Calculate intersection
        common = 0
        for token in freq1:
            common += min(freq1[token], freq2[token])
            
        # Return similarity score (0.0 to 1.0)
        max_len = max(len(tokens1), len(tokens2))
        return common / max_len if max_len > 0 else 0.0

# Example usage
if __name__ == "__main__":
    try:
        comparator = CodeComparator()
        df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/code generation/deepseek_cleaned_code_results_dir/split_part_0_results_results.csv")
        df = df.iloc[0]
        code1 = df["Generated_Code_deepseek_2_code1"]
        code2 = df["Generated_Code_deepseek_2_code2"]
        code3 = df["Generated_Code_deepseek_2_code3"]
        print("Code : \n",code1,"\n")
        print("Code : \n",code2,"\n")
        print("Code : \n",code3,"\n")
        # Test cases
        # code1 = """
        # def calculate_sum(a, b):
        #     result = a + b
        #     return result
        # """
        
        # code2 = """
        # def compute_total(x, y):
        #     total = x + y
        #     return total
        # """
        
        # code3 = """
        # def process_data(input):
        #     return input * 2
        # """
        
        # Compare similar functions
        result = comparator.compare_code(code1, code2)
        print(f"Similarity between code1 and code2: {result['similarity']:.2f}")
        print("Normalized 1:", result['normalized_code1'])
        print("Normalized 2:", result['normalized_code2'])
        
        # Compare different functions
        result = comparator.compare_code(code1, code3)
        print(f"\nSimilarity between code1 and code3: {result['similarity']:.2f}")
        print("Normalized 1:", result['normalized_code1'])
        print("Normalized 2:", result['normalized_code2'])
        
    except Exception as e:
        print(f"Initialization failed: {str(e)}")

Code : 
 def writeBoolean(self, n):
    t = TYPE_BOOL 

Code : 
 def writeBoolean(self, n):
    t = TYPE_BOOL_TRUE
    if n == 0:
        t = TYPE_BOOL_FALSE
    self.stream.write(t)
    return t 

Code : 
 def writeBoolean(self, n):
    t = TYPE_BOOL_TRUE
    if n == False:
        t = TYPE_BOOL_FALSE
    self.stream.write(t)
    return t 

Similarity between code1 and code2: 1.00
Normalized 1: func_1 param_1 param_2 var_1 const_1 
Normalized 2: func_1 param_1 param_2 var_1 const_1 param_2 var_1 const_2 param_1 attr_1 attr_2 var_1 var_1 

Similarity between code1 and code3: 1.00
Normalized 1: func_1 param_1 param_2 var_1 const_1 
Normalized 2: func_1 param_1 param_2 var_1 const_1 param_2 var_1 const_2 param_1 attr_1 attr_2 var_1 var_1 


In [11]:
from tree_sitter import Parser, Language
from collections import defaultdict
import math
import tree_sitter_python as tspython
import pandas as pd
       

class AccurateCodeComparator:
    def __init__(self, language='python'):
        # Initialize parser
        self.language = Language(tspython.language())
        self.parser = Parser(self.language)
        
        # Special tokens to preserve
        self.preserved_tokens = {'True', 'False', 'None', '0', '1'}
        
        # For similarity calculation
        self.weights = {
            'function_def': 0.3,
            'control_flow': 0.25,
            'operations': 0.2,
            'literals': 0.15,
            'returns': 0.1
        }

    def normalize_code(self, code_str):
        """Normalize code while preserving structure"""
        tree = self.parser.parse(bytes(code_str, 'utf8'))
        features = self._extract_features(tree.root_node)
        return features

    def _extract_features(self, node):
        """Extract structural features from AST"""
        features = defaultdict(int)
        
        if node.type == 'function_definition':
            features['function_def'] += 1
            # Don't normalize function name for comparison
            fn_name = node.child_by_field_name('name')
            if fn_name:
                features[f'fn_name:{fn_name.text.decode()}'] += 1
            
        elif node.type in ('if_statement', 'for_statement', 'while_statement'):
            features['control_flow'] += 1
            
        elif node.type in ('binary_operator', 'unary_operator'):
            features['operations'] += 1
            op = node.child_by_field_name('operator')
            if op:
                features[f'op:{op.text.decode()}'] += 1
                
        elif node.type == 'return_statement':
            features['returns'] += 1
            
        elif node.type == 'identifier':
            text = node.text.decode('utf8')
            if text in self.preserved_tokens:
                features[f'literal:{text}'] += 1
            else:
                # Normalize other identifiers
                features['identifier'] += 1
                
        # Recursively process children
        for child in node.children:
            child_features = self._extract_features(child)
            for k, v in child_features.items():
                features[k] += v
                
        return features

    def calculate_similarity(self, features1, features2):
        """Calculate weighted similarity between feature sets"""
        all_keys = set(features1.keys()) | set(features2.keys())
        similarity = 0.0
        total_weight = 0.0
        
        for key in all_keys:
            # Determine feature weight
            if key.startswith('fn_name:'):
                weight = self.weights['function_def'] * 0.5
            elif key.startswith('op:'):
                weight = self.weights['operations'] * 0.3
            elif key.startswith('literal:'):
                weight = self.weights['literals']
            elif key == 'function_def':
                weight = self.weights['function_def']
            elif key == 'control_flow':
                weight = self.weights['control_flow']
            elif key == 'operations':
                weight = self.weights['operations'] * 0.7
            elif key == 'returns':
                weight = self.weights['returns']
            else:
                weight = 0.05  # Default low weight
                
            # Calculate feature similarity
            val1 = features1.get(key, 0)
            val2 = features2.get(key, 0)
            similarity += min(val1, val2) * weight
            total_weight += max(val1, val2) * weight
            
        return similarity / total_weight if total_weight > 0 else 0.0

# Example usage
if __name__ == "__main__":
    comparator = AccurateCodeComparator()
    
    code1 = """def writeBoolean(self, n):
    t = TYPE_BOOL"""
    
    code2 = """def writeBoolean(self, n):
    t = TYPE_BOOL_TRUE
    if n == 0:
        t = TYPE_BOOL_FALSE
    self.stream.write(t)
    return t"""
    
    code3 = """def writeBoolean(self, n):
    t = TYPE_BOOL_TRUE
    if n == False:
        t = TYPE_BOOL_FALSE
    self.stream.write(t)
    return t"""
    
    # Extract features
    features1 = comparator.normalize_code(code1)
    features2 = comparator.normalize_code(code2)
    features3 = comparator.normalize_code(code3)
    
    # Calculate similarities
    sim1_2 = comparator.calculate_similarity(features1, features2)
    sim1_3 = comparator.calculate_similarity(features1, features3)
    sim2_3 = comparator.calculate_similarity(features2, features3)
    
    print(f"Similarity between code1 and code2: {sim1_2:.2f}")
    print(f"Similarity between code1 and code3: {sim1_3:.2f}")
    print(f"Similarity between code2 and code3: {sim2_3:.2f}")

Similarity between code1 and code2: 0.48
Similarity between code1 and code3: 0.48
Similarity between code2 and code3: 1.00


In [12]:
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/code generation/deepseek_cleaned_code_results_dir/split_part_0_results_results.csv")
df = df.iloc[1]
code1 = df["Original_Code"]
code2 = df["Generated_Code_deepseek_2_code1"]
code3 = df["Generated_Code_deepseek_2_code2"]

features1 = comparator.normalize_code(code1)
features2 = comparator.normalize_code(code2)
features3 = comparator.normalize_code(code3)

# Calculate similarities
sim1_2 = comparator.calculate_similarity(features1, features2)
sim1_3 = comparator.calculate_similarity(features1, features3)

print(f"Similarity between code1 and code2: {sim1_2:.2f}")
print(f"Similarity between code1 and code3: {sim1_3:.2f}")

Similarity between code1 and code2: 0.45
Similarity between code1 and code3: 0.42


In [13]:
print(features1)

defaultdict(<class 'int'>, {'function_def': 1, 'fn_name:paste': 1, 'identifier': 15, 'returns': 1})


In [8]:
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/code generation/deepseek_cleaned_code_results_dir/split_part_0_results_results.csv")
df.head()

Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1_code1,Generated_Code_deepseek_1_code2,Generated_Code_deepseek_1_code3,Generated_Code_deepseek_2_code1,Generated_Code_deepseek_2_code2,Generated_Code_deepseek_2_code3,Generated_Code_deepseek_3_code1,Generated_Code_deepseek_3_code2,Generated_Code_deepseek_3_code3,Generated_Code_deepseek_4_code1,Generated_Code_deepseek_4_code2,Generated_Code_deepseek_4_code3
0,d1,q1,"def writeBoolean(self, n):\n """"""\n ...",def flip_stream(condition):\n # code,def f(n):\n # ... code ...,,"def writeBoolean(self, n):\n t = TYPE_BOOL","def writeBoolean(self, n):\n t = TYPE_BOOL_...","def writeBoolean(self, n):\n t = TYPE_BOOL_...",def writeBoolean(n):\n t = TYPE_BOOL_TRUE,def writeBoolean(n):\n # Initialize t to TY...,def writeBoolean(n):\n t = TYPE_BOOL_TRUE\n...,def writeBoolean(n):\n t = TYPE_BOOL_TRUE\n...,,
1,d2,q2,"def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n selection = ""prima...",,"def paste(xsel=False):\n\n """"""Pastes data f...",def paste(xsel=None):\n # Line 2: set selec...,def paste(xsel=None):\n # ... code here ...,def paste(xsel=None):\n if xsel is None:\n ...,def paste(xsel=None):\n if xsel is not None...,def paste(xsel=None):\n if xsel is None:\n ...,def paste(xsel=None):,def paste(xsel=None):\n # ... code ...\n ...,def paste(xsel=None):\n # ... code here ......,def paste(xsel=None):
2,d3,q3,"def _format_json(data, theme):\n """"""Pretty ...","def _format_json(data, theme):\n return jso...","def format_json(data, theme):\n # ... code ...","def _format_json(data, theme):\n # Code\n ...","def _format_json(data, theme):\n output = j...","pygments.highlight(output, JsonLexer()","def _format_json(data, theme):\n """"""Pretty ...","def _format_json(data, theme):\n try:\n ...","def _format_json(data, theme):\n # ... code...","def _format_json(data, theme):\n # ... code...","def _format_json(data, theme):\n\n if sys.o...","def format_json(data, theme):\n # ... code ...","def _format_json(data, theme):\n # ... code..."
3,d4,q4,"def create_path(path):\n """"""Creates a absol...",,"def f(a, b, c, d, e, f, g, h, i, j, k, l, m, n...",def absolute_path(relative_path):\n # code ...,def create_path(path):\n if not os.path.exi...,def create_path(path):\n if os.path.isabs(p...,def create_path(path):\n # Check if the pat...,def file_path(file_path):\n # ... code ...\...,"def __init__(self, path):\n # ... code here...",def __file__(path):\n # ... code here ...,def create_path(path):\n import os\n if ...,def create_path(path):\n import os\n if ...,def create_path(path):\n import os\n if ...
4,d5,q5,"def _vector_or_scalar(x, type='row'):\n """"""...",def check_array(x):\n # code here\n retu...,np.array(x),def array_to_column_vector(x):\n # Check if...,"def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n # c...",np.column_stack(),"def _vector_or_scalar(x, type=None):\n # co...","def _vector_or_scalar(x, type=None):\n # co...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n # C...","def _vector_or_scalar(x, type='row'):\n # c..."


In [18]:
from tree_sitter import Parser, Language
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import tree_sitter_python as tspython

class CodeSimilarityCalculator:
    def __init__(self, language='python'):
        # Initialize parser
        self.language = Language(tspython.language())
        self.parser = Parser(self.language)
        
        # For normalization
        self.identifier_counter = 1
        self.identifier_map = {}
        self.scope_stack = [{}]
        
        # Special tokens to preserve
        self.preserved_tokens = {'True', 'False', 'None', '0', '1', 'if', 'else', 'for', 'while', 'return'}
        
        # For vectorization
        self.vectorizer = TfidfVectorizer(tokenizer=self._tokenize_code, lowercase=False)

    def normalize_code(self, code_str):
        """Normalize code by generalizing identifiers"""
        self.identifier_counter = 1
        self.identifier_map = {}
        self.scope_stack = [{}]
        
        tree = self.parser.parse(bytes(code_str, 'utf8'))
        normalized = self._normalize_node(tree.root_node)
        return self._structure_to_text(normalized)

    def _normalize_node(self, node):
        """Recursively normalize AST nodes"""
        normalized = {'type': node.type}
        
        # Handle identifiers
        if node.type == 'identifier':
            text = node.text.decode('utf8')
            if text in self.preserved_tokens:
                return {'type': 'literal', 'value': text}
            return {'type': 'identifier', 'name': self._get_normalized_name(node)}
        
        # Handle new scopes
        if node.type in ('function_definition', 'class_definition', 'block'):
            self.scope_stack.append({})
            
        # Process children
        normalized['children'] = []
        for child in node.children:
            norm_child = self._normalize_node(child)
            if norm_child:
                normalized['children'].append(norm_child)
        
        # Exit scope
        if node.type in ('function_definition', 'class_definition', 'block'):
            self.scope_stack.pop()
            
        return normalized

    def _get_normalized_name(self, node):
        """Generate consistent normalized names"""
        text = node.text.decode('utf8')
        parent = node.parent
        
        # Check current scope first
        for scope in reversed(self.scope_stack):
            if text in scope:
                return scope[text]
        
        # Determine type prefix
        if parent.type == 'function_definition' and node == parent.child_by_field_name('name'):
            prefix = 'FUNC'
        elif parent.type == 'class_definition' and node == parent.child_by_field_name('name'):
            prefix = 'CLASS'
        elif parent.type in ('parameters', 'lambda_parameters'):
            prefix = 'PARAM'
        elif text.isupper():
            prefix = 'CONST'
        else:
            prefix = 'VAR'
            
        # Create and store normalized name
        norm_name = f"{prefix}_{self.identifier_counter}"
        self.identifier_counter += 1
        self.scope_stack[-1][text] = norm_name
        return norm_name

    def _structure_to_text(self, node):
        """Convert normalized structure to comparable text"""
        if node['type'] == 'identifier':
            return node['name']
        elif node['type'] == 'literal':
            return node['value']
        
        parts = [node['type']]
        for child in node.get('children', []):
            parts.append(self._structure_to_text(child))
        return ' '.join(parts)

    def _tokenize_code(self, code_text):
        """Tokenize normalized code for vectorization"""
        # Split by operators, brackets, etc. while preserving tokens
        tokens = re.findall(r'[A-Za-z_][A-Za-z0-9_]*|[0-9]+|\S', code_text)
        return [t for t in tokens if t.strip()]

    def cosine_similarity(self, code1, code2):
        """Calculate cosine similarity between two code snippets"""
        norm1 = self.normalize_code(code1)
        norm2 = self.normalize_code(code2)
        print("Norm 1", norm1)
        print("Norm 2", norm2)
        
        # Fit vectorizer and transform
        vectors = self.vectorizer.fit_transform([norm1, norm2])
        return cosine_similarity(vectors[0], vectors[1])[0][0]

# Example usage
if __name__ == "__main__":
    comparator = CodeSimilarityCalculator()
    
    # Similar functions
    code1 = """
    def calculate(x, y):
        result = x + y
        return result
    """
    
    code2 = """
    def compute(a, b):
        total = a + b
        return total
    """
    
    # Different function
    code3 = """
    def process_data(input):
        if input > 0:
            return input * 2
        return 0
    """
    df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/code generation/deepseek_cleaned_code_results_dir/split_part_0_results_results.csv")
    df = df.iloc[1]
    print("Similarity (code1 vs code2):", 
          comparator.cosine_similarity(code1, code2))  # Should be high (~0.8-1.0)
    
    print("Similarity (code1 vs code3):", 
          comparator.cosine_similarity(code1, code3))  # Should be low (~0.1-0.3)

Norm 1 module function_definition def FUNC_1 parameters ( PARAM_2 , PARAM_3 ) : block expression_statement assignment VAR_4 = binary_operator PARAM_2 + PARAM_3 return_statement return VAR_4
Norm 2 module function_definition def FUNC_1 parameters ( PARAM_2 , PARAM_3 ) : block expression_statement assignment VAR_4 = binary_operator PARAM_2 + PARAM_3 return_statement return VAR_4
Similarity (code1 vs code2): 0.9999999999999998
Norm 1 module function_definition def FUNC_1 parameters ( PARAM_2 , PARAM_3 ) : block expression_statement assignment VAR_4 = binary_operator PARAM_2 + PARAM_3 return_statement return VAR_4
Norm 2 module function_definition def FUNC_1 parameters ( PARAM_2 ) : block if_statement if comparison_operator PARAM_2 > integer : block return_statement return binary_operator PARAM_2 * integer return_statement return integer
Similarity (code1 vs code3): 0.4375526574551408




In [47]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/code_generation/cleaned_deepseek_exps_results_dir/split_part_0_results.csv")
df.head()

Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1,Generated_Code_deepseek_2,Generated_Code_deepseek_3,Generated_Code_deepseek_4,Generated_Code_granite_1,Generated_Code_granite_2,Generated_Code_granite_3,Generated_Code_granite_4
0,d1,q1,"def writeBoolean(self, n):\n """"""\n ...",```python\ndef flip_stream(n):\n current_va...,"def writeBoolean(self, n: int) -> None:\n i...",```python\ndef writeBoolean(n):\n t = TYPE_...,The Python function is named writeBoolean and ...,"```python\ndef writeBoolean(self, n):\n """"""...","```python\ndef writeBoolean(self, n):\n """"""...","```python\ndef write_bool(stream, n):\n t =...","```python\ndef writeBoolean(self, n):\n """"""..."
1,d2,q2,"def paste(xsel=False):\n """"""Returns system ...","```python\ndef paste(xsel=False):\n """"""Past...",```python\ndef paste(xsel=None):\n selectio...,```python\ndef paste(xsel=None):\n if xsel ...,```python\ndef paste(xsel=None):\n if xsel ...,```python\nimport subprocess\n\nclass XclipNot...,"```python\ndef paste(xsel=False):\n """"""Retu...",```python\nimport subprocess\n\ndef get_clipbo...,```python\nimport subprocess\n\ndef paste(xsel...
2,d3,q3,"def _format_json(data, theme):\n """"""Pretty ...","def format_json(data, theme):\n return json...","```python\ndef _format_json(data, theme):\n ...",```python\nimport json\n\ndef _format_json(dat...,```python\nimport json\n\ndef _format_json(dat...,```python\nimport json\nimport sys\nfrom pygme...,"```python\ndef _format_json(data, theme):\n ...",```python\nimport json\nimport pygments\nfrom ...,```python\nimport json\nimport sys\nimport pyg...
3,d4,q4,"def create_path(path):\n """"""Creates a absol...",def relative_to_absolute(s):\n return s.lst...,import os\n\ndef create_path(path):\n if no...,def create_path_system(path):\n import os\n...,```python\ndef create_path(path):\n import ...,```python\nimport os\n\ndef create_path(path):...,```python\nimport os\n\ndef create_path(path):...,```python\nimport os\n\ndef create_directory(p...,```python\nimport os\n\ndef create_path(path):...
4,d5,q5,"def _vector_or_scalar(x, type='row'):\n """"""...",```python\nimport numpy as np\n\ndef check_arr...,"def _vector_or_scalar(x, type='row'):\n if ...",```python\nimport numpy as np\n\ndef _vector_o...,"def _vector_or_scalar(x, type='row'):\n if ...",```python\nimport numpy as np\n\ndef _vector_o...,```python\nimport numpy as np\n\ndef _vector_o...,```python\nimport numpy as np\n\ndef convert_a...,```python\nimport numpy as np\n\ndef _vector_o...


In [55]:
import pandas as pd
model ="granite"
df = pd.read_csv(f"/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/code_generation/cleaned_{model}_exps_results_dir/split_part_0_results.csv")
for i in range(4):
    for j in range(3):
        df = df.rename(columns={f'Generated_Code_{model}_{i+1}': f'Generated_Code_{model}_{i+1}_code_{j+1}'})
df.to_csv(f"/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/code_generation/cleaned_{model}_exps_results_dir/split_part_0_results.csv",index = False)


# CodeBLEU

In [11]:
import subprocess
import tempfile
import os

def run_codebleu_on_strings(reference_codes, generated_code, lang='python', params='0.25,0.25,0.25,0.25', script_path='codebleu_eval.py'):
    # Step 1: Write reference code(s) to temp files
    ref_paths = []
    try:
        for ref_code in reference_codes:
            tmp_ref = tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt')
            tmp_ref.write(ref_code.strip() + "\n")
            tmp_ref.flush()
            ref_paths.append(tmp_ref.name)
            tmp_ref.close()

        # Step 2: Write generated code to a temp file
        tmp_hyp = tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt')
        tmp_hyp.write(generated_code.strip() + "\n")
        tmp_hyp.flush()
        hyp_path = tmp_hyp.name
        tmp_hyp.close()

        # Step 3: Build and run subprocess
        cmd = ["python", script_path, "--refs", *ref_paths, "--hyp", hyp_path, "--lang", lang, "--params", params]
        result = subprocess.run(cmd, capture_output=True, text=True)

        print(result.stdout)  # Optional debug
        score_line = [line for line in result.stdout.splitlines() if 'CodeBLEU score:' in line]
        score = float(score_line[0].split()[-1]) if score_line else None

    finally:
        # Step 4: Clean up all temp files
        for path in ref_paths:
            os.remove(path)
        if 'hyp_path' in locals():
            os.remove(hyp_path)

    return score
ref_codes = [
    "def add(a, b): return a + b",
    "def sum(x, y): return x + y"
]

generated = "def add(a, b): return a + b"

score = run_codebleu_on_strings(ref_codes, generated)
print("Final CodeBLEU Score:", score)



Final CodeBLEU Score: None


In [9]:
import pandas as pd
pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/granite_cleaned_code_results_dir/split_part_0_results_results.csv")

Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_granite_1_code1,Generated_Code_granite_1_code2,Generated_Code_granite_1_code3,Generated_Code_granite_2_code1,Generated_Code_granite_2_code2,Generated_Code_granite_2_code3,Generated_Code_granite_3_code1,Generated_Code_granite_3_code2,Generated_Code_granite_3_code3,Generated_Code_granite_4_code1,Generated_Code_granite_4_code2,Generated_Code_granite_4_code3
0,d1,q1,"def writeBoolean(self, n):\n """"""\n ...",def writeBoolean(n):\n if not n:\n r...,"def writeBoolean(self, n):\n if not n:\n ...","def writeBoolean(self, n):\n if not n:\n ...","def writeBoolean(self, n):\n """"""\n This ...","def writeBoolean(self, n):\n """"""\n This ...","def writeBoolean(self, n):\n """"""\n This ...","def write_bool(stream, n):\n t = 'True' if ...","def write_bool(stream, n):\n t = 'True' if ...","def write_bool(stream, n):\n t = 'True' if ...","def writeBoolean(self, n):\n """"""\n Write...","def writeBoolean(self, n):\n """"""Writes a Bo...","def writeBoolean(self, n):\n """"""\n Write..."
1,d2,q2,"def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n """"""Returns system ...",def get_clipboard():\n try:\n if xse...,def get_clipboard():\n try:\n if xse...,def get_clipboard():\n try:\n if xse...,def paste(xsel=False):\n try:\n if x...,def paste(xsel=False):\n try:\n if x...,def paste(xsel=False):\n try:\n if x...
2,d3,q3,"def _format_json(data, theme):\n """"""Pretty ...","def _format_json(data, theme):\n try:\n ...","def _format_json(data, theme):\n try:\n ...","def _format_json(data, theme):\n try:\n ...","def _format_json(data, theme):\n """"""Pretty ...","def _format_json(data, theme):\n """"""Pretty ...","def _format_json(data, theme):\n """"""Pretty ...","def _format_json(data, theme):\n json_strin...","def _format_json(data, theme):\n json_strin...","def _format_json(data, theme):\n import jso...","def _format_json(data, theme):\n """"""\n P...","def _format_json(data, theme):\n """"""\n P...","def _format_json(data, theme):\n """"""\n P..."
3,d4,q4,"def create_path(path):\n """"""Creates a absol...","def create_path(path):\n os.makedirs(path, ...","def create_path(path):\n os.makedirs(path, ...","def create_path(path):\n os.makedirs(path, ...","def create_path(path):\n """"""\n This func...","def create_path(path):\n """"""\n This func...","def create_path(path):\n """"""\n This func...",def create_directory(path):\n if not os.pat...,def create_directory(path):\n if not os.pat...,def create_directory(path):\n if not os.pat...,"def create_path(path):\n """"""\n Creates a...","def create_path(path):\n """"""\n Creates a...","def create_path(path):\n """"""\n This func..."
4,d5,q5,"def _vector_or_scalar(x, type='row'):\n """"""...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ...","def convert_and_reshape(input_data, vector_typ...","def convert_and_reshape(input_data, type='row'...","def convert_and_reshape(input_data, type='row'...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5146,d5147,q5147,"def is_symlink(self):\n """"""\n Wh...",def is_symlink(path):\n try:\n st = ...,def is_symlink(path):\n try:\n stat ...,def is_symlink(path):\n try:\n retur...,"def is_symlink(self):\n """"""\n This funct...","def is_symlink(self):\n """"""\n This funct...","def is_symlink(self):\n """"""\n This funct...",def is_symlink(path):\n if not os.path.exis...,def is_symlink(path):\n try:\n info ...,def is_symlink(path):\n if not os.path.exis...,"def is_symlink(self):\n """"""...""""""\n try:...","def is_symlink(self):\n """"""\n ...\n ""...","def is_symlink(self):\n """"""...""""""\n try:..."
5147,d5148,q5148,"def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedir(path, mode=0o777, exist_ok=False):...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)...","def makedirs(path, mode=0o777, exist_ok=False)..."
5148,d5149,q5149,"def is_json_file(filename, show_warnings = Fal...","def is_json_file(file_path, file_type=""json"", ...","def load_config(file_path, file_type, show_war...","def is_json_file(file_path, file_type=""json"", ...","def is_json_file(filename, show_warnings = Fal...","def is_json_file(filename, show_warnings = Fal...","def is_json_file(filename, show_warnings = Fal...","def is_valid_json_file(file_path, show_warning...","def is_valid_json_file(file_path, show_warning...",def load_config(file_path):\n # Implementat...,"def is_json_file(filename, show_warnings):\n ...","def is_json_file(filename, show_warnings=False...","def is_json_file(filename, show_warnings=False..."
5149,d5150,q5150,"async def _thread_coro(self, *args):\n ...",class MyClass:\n async def _thread_coro(sel...,"def MapAsync(loop, executor, func, *args, **kw...","loop.run_in_executor(executor, func, *args, **...","_loop.run_in_executor(self._executor, self._fu...","_loop.run_in_executor(self._executor, self._fu...","_loop.run_in_executor(self._executor, self._fu...",class MyClass:\n def __init__(self):\n ...,class MyClass:\n def __init__(self):\n ...,class MyClass:\n def __init__(self):\n ...,"_loop.run_in_executor(self._executor, self._fu...","_loop.run_in_executor(self._executor, self._fu...","_loop.run_in_executor(self._executor, self._fu..."


In [5]:
import pandas as pd
t=pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/codebleusample.csv")

In [6]:
t.head()

Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1_code1,Generated_Code_deepseek_1_code2,Generated_Code_deepseek_1_code3,Generated_Code_deepseek_2_code1,Generated_Code_deepseek_2_code2,Generated_Code_deepseek_2_code3,Generated_Code_deepseek_3_code1,...,CodeBLEU_Score_deepseek_2_code2,CodeBLEU_Score_deepseek_2_code3,CodeBLEU_Score_deepseek_3_code1,CodeBLEU_Score_deepseek_3_code2,CodeBLEU_Score_deepseek_3_code3,CodeBLEU_Score_deepseek_4_code1,CodeBLEU_Score_deepseek_4_code2,CodeBLEU_Score_deepseek_4_code3,RTC_deepseek_CodeBLEU_Score,Pass@1_deepseek_CodeBLEU_Score
0,d1,q1,"def writeBoolean(self, n):\n """"""\n ...",def flip_stream(condition):\n # code,def f(n):\n # ... code ...,,"def writeBoolean(self, n):\n t = TYPE_BOOL","def writeBoolean(self, n):\n t = TYPE_BOOL_...","def writeBoolean(self, n):\n t = TYPE_BOOL_...",def writeBoolean(n):\n t = TYPE_BOOL_TRUE,...,0.507716,0.553623,0.050869,0.088725,0.113273,0.3736,0.0,0.0,0.147562,0.0
1,d2,q2,"def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n selection = ""prima...",,"def paste(xsel=False):\n\n """"""Pastes data f...",def paste(xsel=None):\n # Line 2: set selec...,def paste(xsel=None):\n # ... code here ...,def paste(xsel=None):\n if xsel is None:\n ...,def paste(xsel=None):\n if xsel is not None...,...,0.059629,0.363184,0.092915,0.192409,0.057818,0.066185,0.070713,0.057818,0.142211,0.0


# Explanations 4 code 1 Eval

In [7]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/metrics/deepseek_codebert_metrics_results.csv")
df["RTC_deepseek_CodeBERT_Score"].mean(), df["Pass@1_deepseek_CodeBERT_Score"].mean()

(0.7988853255083227, 0.4788875946418171)

In [8]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/metrics/granite_codebert_metrics_results.csv")
df["RTC_granite_CodeBERT_Score"].mean(), df["Pass@1_granite_CodeBERT_Score"].mean()

(0.8181990188456257, 0.5276645311589983)

In [10]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/metrics/deepseek_codebleu_metrics_results.csv")
df["RTC_deepseek_CodeBLEU_Score"].mean(), df["Pass@1_deepseek_CodeBLEU_Score"].mean()

(0.19067342299863155, 0.002899922345175694)

In [11]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/metrics/granite_codebleu_metrics_results.csv")
df["RTC_granite_CodeBLEU_Score"].mean(), df["Pass@1_granite_CodeBLEU_Score"].mean()

(0.2808076462208245, 0.0005217433508056688)

In [12]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/metrics/deepseek_struct_metrics_results.csv")
df["RTC_deepseek_Struct_Score"].mean(), df["Pass@1_deepseek_Struct_Score"].mean()

(0.5533682343234323, 0.08770141720054359)

In [13]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_1/metrics/granite_struct_metrics_results.csv")
df["RTC_granite_Struct_Score"].mean(), df["Pass@1_granite_Struct_Score"].mean()

(0.5454818190642593, 0.08141622985827994)

# Explanations 4 Code 3 Eval

In [1]:
import pandas as pd
import os
final=[]
folder = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/deepseek_codebert_metrics_results_dir/"
for file in os.listdir(folder):
    df = pd.read_csv(folder+file)
    final.append(df)
combined_df = pd.concat(final, ignore_index=True)

combined_df["RTC_deepseek_CodeBERT_Score"].mean(), combined_df["Pass@1_deepseek_CodeBERT_Score"].mean()

(0.6938504823207198, 0.4252612761276128)

In [5]:
test = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/cleanedsample.csv")
test["Original_Code"][0], test["Original_Code_comments"][0]

('def writeBoolean(self, n):\n        t = TYPE_BOOL_TRUE\n        if n is False:\n            t = TYPE_BOOL_FALSE\n        self.stream.write(t)',
 '        """\n        Writes a Boolean to the stream.\n        """')

In [5]:
import pandas as pd
import os
final=[]
folder = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/granite_codebert_metrics_results_dir/"
for file in os.listdir(folder):
    df = pd.read_csv(folder+file)
    final.append(df)
combined_df = pd.concat(final, ignore_index=True)

combined_df["RTC_granite_CodeBERT_Score"].mean(), combined_df["Pass@1_granite_CodeBERT_Score"].mean()

(0.8446780312333867, 0.7549100498285123)

In [19]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/deepseek_codebleu_metrics_results.csv")
df["RTC_deepseek_CodeBLEU_Score"].mean(), df["Pass@1_deepseek_CodeBLEU_Score"].mean()

(0.1762703079252538, 0.01751682521193296)

In [20]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/granite_codebleu_metrics_results.csv")
df["RTC_granite_CodeBLEU_Score"].mean(), df["Pass@1_granite_CodeBLEU_Score"].mean()

(0.3679706074539154, 0.12515369183977226)

In [21]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/deepseek_struct_metrics_results.csv")
df["RTC_deepseek_Struct_Score"].mean(), df["Pass@1_deepseek_Struct_Score"].mean()

(0.5034404193360513, 0.2888595612502427)

In [22]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/granite_struct_metrics_results.csv")
df["RTC_granite_Struct_Score"].mean(), df["Pass@1_granite_Struct_Score"].mean()

(0.7545834595224229, 0.5597583721607454)

# Explanations 4 Code 3 - Removed Comments from Code

In [24]:
import pandas as pd
df = pd.read_csv("/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/codebertsample.csv")
df

Unnamed: 0,corpus_id,query_id,Original_Code,Generated_Code_deepseek_1_code1,Generated_Code_deepseek_1_code2,Generated_Code_deepseek_1_code3,Generated_Code_deepseek_2_code1,Generated_Code_deepseek_2_code2,Generated_Code_deepseek_2_code3,Generated_Code_deepseek_3_code1,...,CodeBERT_Score_deepseek_3_code_3,Exact_Match_deepseek_3_code_3,CodeBERT_Score_deepseek_4_code_1,Exact_Match_deepseek_4_code_1,CodeBERT_Score_deepseek_4_code_2,Exact_Match_deepseek_4_code_2,CodeBERT_Score_deepseek_4_code_3,Exact_Match_deepseek_4_code_3,RTC_deepseek_CodeBERT_Score,Pass@1_deepseek_CodeBERT_Score
0,d1,q1,"def writeBoolean(self, n):\n """"""\n ...",def flip_stream(condition):\n # code,def f(n):\n # ... code ...,,"def writeBoolean(self, n):\n t = TYPE_BOOL","def writeBoolean(self, n):\n t = TYPE_BOOL_...","def writeBoolean(self, n):\n t = TYPE_BOOL_...",def writeBoolean(n):\n t = TYPE_BOOL_TRUE,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
1,d2,q2,"def paste(xsel=False):\n """"""Returns system ...","def paste(xsel=False):\n selection = ""prima...",,"def paste(xsel=False):\n\n """"""Pastes data f...",def paste(xsel=None):\n # Line 2: set selec...,def paste(xsel=None):\n # ... code here ...,def paste(xsel=None):\n if xsel is None:\n ...,def paste(xsel=None):\n if xsel is not None...,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
2,d3,q3,"def _format_json(data, theme):\n """"""Pretty ...","def _format_json(data, theme):\n return jso...","def format_json(data, theme):\n # ... code ...","def _format_json(data, theme):\n # Code\n ...","def _format_json(data, theme):\n output = j...","pygments.highlight(output, JsonLexer()","def _format_json(data, theme):\n """"""Pretty ...","def _format_json(data, theme):\n try:\n ...",...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
3,d4,q4,"def create_path(path):\n """"""Creates a absol...",,"def f(a, b, c, d, e, f, g, h, i, j, k, l, m, n...",def absolute_path(relative_path):\n # code ...,def create_path(path):\n if not os.path.exi...,def create_path(path):\n if os.path.isabs(p...,def create_path(path):\n # Check if the pat...,def file_path(file_path):\n # ... code ...\...,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
4,d5,q5,"def _vector_or_scalar(x, type='row'):\n """"""...",def check_array(x):\n # code here\n retu...,np.array(x),def array_to_column_vector(x):\n # Check if...,"def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n if ...","def _vector_or_scalar(x, type='row'):\n # c...",np.column_stack(),...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
5,d6,q6,"def experiment_property(prop):\n """"""Get a p...","def get_property(obj, prop):\n # code here\...","def get_property(obj, property_name):\n # ....","def get_property(object, property_name):\n ...",def experiment_property(prop):\n exp = expe...,"def experiment_property(prop, exp=None):\n ...","def experiment_property(prop, session):\n e...",def experiment_property(prop):\n # code\n ...,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
6,d7,q7,"def data_from_file(file):\n """"""Return (firs...",def extract_and_interleave_wav_file(wav_file):...,,,struct.unpack(),struct.unpack(),fp.getnframes(),,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
7,d8,q8,"def source_range(start, end, nr_var_dict):\n ...","def process_range(start, end, source_range_tup...","def f(k, e):\n # ... code here ...\n # ....",,,"def source_range(start, end, nr_var_dict):\n ...","def source_range(start, end, nr_var_dict):\n ...","def source_range(start, end, nr_var_dict):\n ...",...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
8,d9,q9,"def timespan(start_time):\n """"""Return time ...","def timespan(start_time, current_time):\n r...",def timespan(start_time):\n # ... code ...,"def timespan(start_time, current_time):\n #...","def timespan(start_time):\n """"""Return time ...",timespan.total_seconds(),"def timespan(start_time):\n """"""""""""\n tim...",def timespan(start_time):\n current_time = ...,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0
9,d10,q10,"def _convert_to_array(array_like, dtype):\n ...",def convert_to_char_array(array_like_or_buffer...,def f(arr):\n # code here\n # no return ...,,"def _convert_to_array(array_like, dtype):\n ...","def _convert_to_array(array_like, dtype):\n ...","def _convert_to_array(array_like, dtype):\n ...",def convert_to_char(arr):\n if isinstance(a...,...,0.0,False,0.0,False,0.0,False,0.0,False,0.0,0.0


In [2]:
import pandas as pd
import os
final=[]
folder = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/deepseek_codebleu_metrics_results_dir2/"
for file in os.listdir(folder):
    df = pd.read_csv(folder+file)
    final.append(df)
combined_df = pd.concat(final, ignore_index=True)

combined_df["RTC_deepseek_CodeBLEU_Score"].mean(), combined_df["Pass@1_deepseek_CodeBLEU_Score"].mean()

(0.23075010677548258, 0.08677926616191033)

In [3]:
import pandas as pd
import os
final=[]
folder = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/granite_codebleu_metrics_results_dir2/"
for file in os.listdir(folder):
    df = pd.read_csv(folder+file)
    final.append(df)
combined_df = pd.concat(final, ignore_index=True)

combined_df["RTC_granite_CodeBLEU_Score"].mean(), combined_df["Pass@1_granite_CodeBLEU_Score"].mean()

(0.43083045557768157, 0.23423849737914967)

In [4]:
import pandas as pd
import os
final=[]
folder = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/deepseek_struct_metrics_results_dir2/"
for file in os.listdir(folder):
    df = pd.read_csv(folder+file)
    final.append(df)
combined_df = pd.concat(final, ignore_index=True)

combined_df["RTC_deepseek_Struct_Score"].mean(), combined_df["Pass@1_deepseek_Struct_Score"].mean()

(0.5008180887206367, 0.2964793438167346)

In [5]:
import pandas as pd
import os
final=[]
folder = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/granite_struct_metrics_results_dir2/"
for file in os.listdir(folder):
    df = pd.read_csv(folder+file)
    final.append(df)
combined_df = pd.concat(final, ignore_index=True)

combined_df["RTC_granite_Struct_Score"].mean(), combined_df["Pass@1_granite_Struct_Score"].mean()

(0.720783357600466, 0.5717587458745874)

In [10]:
import pandas as pd
import os
final=[]
folder = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/granite_codebert_metrics_results_dir2/"
for file in os.listdir(folder):
    df = pd.read_csv(folder+file)
    final.append(df)
combined_df = pd.concat(final, ignore_index=True)

combined_df["RTC_granite_CodeBERT_Score"].mean(), combined_df["Pass@1_granite_CodeBERT_Score"].mean()

(0.79225653401665, 0.7413393300114326)

In [9]:
import pandas as pd
import os

final=[]
folder = "/work/pi_wenlongzhao_umass_edu/27/janet/validation_tool/RTC/results/explanations_4_codes_3/metrics/deepseek_codebert_metrics_results_dir2/"
for file in os.listdir(folder):
    df = pd.read_csv(folder+file)
    final.append(df)
combined_df = pd.concat(final, ignore_index=True)

combined_df["RTC_deepseek_CodeBERT_Score"].mean(), combined_df["Pass@1_deepseek_CodeBERT_Score"].mean()

(0.700403080274779, 0.48196143143726144)