In [4]:
import torch
import numpy
import json
from tqdm import tqdm
import os
import gc
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


In [1]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("CoIR-Retrieval/cosqa-queries-corpus")
print(dataset.keys())
# Convert the 'train' split to a pandas DataFrame
# df = dataset["train"].to_pandas()

# # Save the DataFrame to a CSV file (without the index)
# df.to_csv("cosqa_queries_corpus.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm


dict_keys(['corpus', 'queries'])


In [5]:
new_dataset=pd.DataFrame(dataset['corpus'])

In [2]:
dataset['corpus']

Dataset({
    features: ['_id', 'partition', 'text', 'title', 'language', 'meta_information'],
    num_rows: 20604
})

In [None]:
new_dataset = new_dataset.rename(columns={'_id':'corpus_id', 'text':'code'})


In [12]:
new_dataset.head()

Unnamed: 0,corpus_id,partition,code,title,language,meta_information
0,d1,train,"def writeBoolean(self, n):\n """"""\n ...",,PYTHON,{'dummy_field': ''}
1,d2,train,"def paste(xsel=False):\n """"""Returns system ...",,PYTHON,{'dummy_field': ''}
2,d3,train,"def _format_json(data, theme):\n """"""Pretty ...",,PYTHON,{'dummy_field': ''}
3,d4,train,"def create_path(path):\n """"""Creates a absol...",,PYTHON,{'dummy_field': ''}
4,d5,train,"def _vector_or_scalar(x, type='row'):\n """"""...",,PYTHON,{'dummy_field': ''}


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [None]:
num_gpus = torch.cuda.device_count()
print(num_gpus)

4


In [None]:
new_dataset_queries=pd.DataFrame(dataset['queries'])

          _id partition                                             text  \
0          q1     train                python code to write bool value 1   
1          q2     train             "python how to manipulate clipboard"   
2          q3     train                    python colored output to html   
3          q4     train  python "create directory" using "relative path"   
4          q5     train                        python column of an array   
...       ...       ...                                              ...   
20599  q20600      test              python how to select first 100 rows   
20600  q20601      test          removing columnsns in data frame python   
20601  q20602      test                     python array to torch tensor   
20602  q20603      test             how to turn a list into a csv python   
20603  q20604      test                    how do i unzip file in python   

      title language     meta_information  
0                     {'dummy_field': ''}  

In [13]:
new_dataset_queries = new_dataset_queries.rename(columns={'_id':'query_id', 'text':'doc'})


In [14]:
new_dataset_queries.head()

Unnamed: 0,query_id,partition,doc,title,language,meta_information
0,q1,train,python code to write bool value 1,,,{'dummy_field': ''}
1,q2,train,"""python how to manipulate clipboard""",,,{'dummy_field': ''}
2,q3,train,python colored output to html,,,{'dummy_field': ''}
3,q4,train,"python ""create directory"" using ""relative path""",,,{'dummy_field': ''}
4,q5,train,python column of an array,,,{'dummy_field': ''}


In [None]:
df_doc_code = pd.concat([new_dataset_queries[['query_id','doc']], new_dataset[['corpus_id','code']]], axis = 1)

In [None]:
df_doc_code.head()

Unnamed: 0,query_id,doc,corpus_id,code
0,q1,python code to write bool value 1,d1,"def writeBoolean(self, n):\n """"""\n ..."
1,q2,"""python how to manipulate clipboard""",d2,"def paste(xsel=False):\n """"""Returns system ..."
2,q3,python colored output to html,d3,"def _format_json(data, theme):\n """"""Pretty ..."
3,q4,"python ""create directory"" using ""relative path""",d4,"def create_path(path):\n """"""Creates a absol..."
4,q5,python column of an array,d5,"def _vector_or_scalar(x, type='row'):\n """"""..."


In [None]:
df_doc_code.to_csv("cosqa_queries_code_corpus.csv", index=False)


In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
import torch

In [21]:
# Set environment variable to help with memory fragmentation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# def clean_output(text):
#     if "<think>" in text:
#         cleaned = text.rsplit("<think>", 1)[-1]
#     else:
#         cleaned = text
#     return cleaned.strip()

def clean_output(text):
    cleaned = text.split("Answer:")[-1]
    return cleaned

# def clean_output(text):
#     # Split from the right using 'Answer:' and return the portion after the last occurrence.
#     if "Answer:" in text:
#         return text.rsplit("Answer:", 1)[-1].strip()
#     return text.strip()


class ExplanationGeneratorLama:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.padding_side = "left"  # Set left-padding for decoder-only models. only for Llama3
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,    # Load in half precision if supported.
            low_cpu_mem_usage=True
        )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate_explanations_batch(self, entries, max_new_tokens=500):
        # Create prompts by combining each entry with each prompt template.
        
        prompts = []
        
        for entry in entries:
            prompt_templates = [
                f"Doc string: {entry['doc']}\n"
                f"Code snippet: {entry['code']}\n"
                "Instruction: Provide a concise explanation of what the above doc and code mean. "
                "Generate strictly less than 100 words in total. Please give the output just as text only. Do not return anything else.\n"
                "Answer: \n"
                , 

                f"Doc string: {entry['doc']}\n"
                f"Code snippet: {entry['code']}\n"
                "Instruction: Provide a detailed line-by-line explanation of this code snippet, describing the purpose and functionality of each statement, function, and control structure. "
                "Please give the output just as text only. Do not return anything else.\n"
                "Answer: \n"
                ,

                f"Doc string: {entry['doc']}\n"
                f"Code snippet: {entry['code']}\n"
                "Instruction: Summarize what this code snippet does in simple, non-technical language, focusing on its overall purpose and key operations for someone with little programming experience. "
                "Please give the output just as text only. Do not return anything else.\n"
                "Answer: \n"
                ,

                f"Doc string: {entry['doc']}\n"
                f"Code snippet: {entry['code']}\n"
                "Instruction: Generate an explanation of the code snippet in such a way that it can regenerate the code based on this explanation. "
                "Please give the output just as text only. Do not return anything else.\n"
                "Answer: \n"
            ]
            
            for template in prompt_templates:
                prompt = (
                    f"Doc string: {entry['doc']}\n"
                    f"Code snippet: {entry['code']}\n"
                    f"{template}\n"
                    "Answer: \n"
                )
                prompts.append(prompt)
                
        # Tokenize all prompts at once.
        inputs = self.tokenizer(
            prompts, 
            return_tensors="pt", 
            truncation=True, 
            max_length=2000,
            padding=True
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Generate outputs for all prompts.
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                do_sample=True,
                num_return_sequences=1,  # One output per prompt variation.
            )
            
        # Decode the outputs.
        explanations = [self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        
        # Regroup explanations by entry.
        n_prompts = len(prompt_templates)
        grouped_explanations = []
        for i in range(0, len(explanations), n_prompts):
            grouped_explanations.append(explanations[i:i+n_prompts])
            
        return grouped_explanations


In [5]:
# # Example code snippet
# doc= "Python : Add Two Numbers"
# code_snippet = "def add(a, b): return a + b"

# # Create a prompt instructing the model to explain the code
# prompt = f"Explain what the following Python code does:\n\n{code_snippet}\n\nExplanation:"

# # Generate 5 different explanations
# # explanations = generator(prompt, max_length=150, num_return_sequences=5, do_sample=True, temperature=0.7)
# # explanations=generate_explanation(doc,code_snippet,1)

# # Print out the explanations

# print(explanations)


In [None]:
# df = pd.read_csv('/work/pi_wenlongzhao_umass_edu/27/vaishnavisha/datasets/CoSQA_combined.csv')

In [None]:
# df.shape

(19604, 13)

In [None]:


if __name__ == "__main__":
    json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CodeXGLUE/Text-Code/NL-code-search-WebQuery/CoSQA/cosqa-dev.json"
    output_json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_explanations_llama3.json"
    
    with open(json_path, "r") as f:
        data = json.load(f)
    print('Data loaded')
    
    filtered_entries = [
        {"doc": entry.get("doc", ""), "code": entry.get("code", ""), "label": entry.get("label")}
        for entry in data if entry.get("label") == 1
    ]
    
    for entry in filtered_entries:
        entry["explanations"] = {}
    
    models_dict = {
        # 'llama3-instruct': "/datasets/ai/llama3/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6"
        # 'llama2': "/datasets/ai/llama2/hub/llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590",
        # "deepseek": "/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa",
        # "gemma": "/datasets/ai/gemma/hub/models--google--gemma-7b-it/snapshots/8adab6a35fdbcdae0ae41ab1f711b1bc8d05727e"
        # "llama3": "/datasets/ai/llama3/hub/models--meta-llama--Llama-3.2-3B/snapshots/13afe5124825b4f3751f836b40dafda64c1ed062"
    }
    
    batch_size = 20
    
    for model_key, model_path in tqdm(models_dict.items(), desc="Processing models"):
        print(f"\nProcessing model {model_key}...")
        generator = ExplanationGeneratorLama(model_path)
        for i in tqdm(range(0, len(filtered_entries), batch_size), desc="Processing batches"):
            batch_entries = filtered_entries[i:i+batch_size]
            batch_explanations = generator.generate_explanations_batch(batch_entries)
            for j, entry in enumerate(batch_entries):
                raw_text = batch_explanations[j].strip()
                parts = raw_text.split("Answer: ")
                if len(parts) > 1:
                    explanation_text = parts[1].strip()
                else:
                    explanation_text = raw_text
                entry["explanations"][model_key] = explanation_text
            torch.cuda.empty_cache()
            gc.collect()
        del generator
        torch.cuda.empty_cache()
        gc.collect()
    
    with open(output_json_path, "w") as f:
        json.dump(filtered_entries, f, indent=4)
    
    print(f"\nExplanations from all models have been saved to {output_json_path}")

Data loaded


Processing models:   0%|          | 0/1 [00:00<?, ?it/s]


Processing model llama3...



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s][A
Loading checkpoint shards:  50%|█████     | 1/2 [00:09<00:09,  9.35s/it][A
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.18s/it][A

Processing batches:   0%|          | 0/79 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

Processing batches:   1%|▏         | 1/79 [00:05<07:02,  5.42s/it][ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

Processing batches:   3%|▎         | 2/79 [00:10<06:50,  5.33s/it][ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

Processing batches:   4%|▍         | 3/79 [00:13<05:33,  4.39s/it][ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

Processing batches:   5%|▌         | 4/79 [00:19<06:14,  4.99s/it][ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

Processing batches:   6%|▋         | 5/79 [00:22<05:08,  4.17s/it][ASe


Explanations from all models have been saved to /work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_explanations_llama3.json





In [22]:
import pandas as pd
import torch, gc
from tqdm import tqdm
# Import or define your ExplanationGeneratorLama class here

if __name__ == "__main__":
    csv_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa_queries_code_corpus.csv"  # change this to your CSV input path
    output_csv_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/CoSQA_granite_explanations_tmp1.csv"
    
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_path).head(5)
    print("Data loaded from CSV")
    
    # If your CSV has a "label" column and you only want to process rows where label == 1, uncomment:
    # if "label" in df.columns:
    #     df = df[df["label"] == 1].reset_index(drop=True)
    
    # Define your model(s) in a dictionary.
    models_dict = {
        "deepseek": "/datasets/ai/deepseek/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B/snapshots/530ca3e1ad39d440e182c2e4317aa40f012512fa",
        # 'granite': "/datasets/ai/ibm-granite/hub/models--ibm-granite--granite-3.0-2b-instruct/snapshots/69e41fe735f54cec1792de2ac4f124b6cc84638f"

    }
    
    # Create empty columns for each model's explanation in the DataFrame
    for model_key in models_dict.keys():
        df[f'explanation_{model_key}'] = ""
    
    batch_size = 4  # If memory issues persist, try reducing this value further.
    
    for model_key, model_path in tqdm(models_dict.items(), desc="Processing models"):
        print(f"\nProcessing model {model_key}...")
        generator = ExplanationGeneratorLama(model_path)
        # Set the model to evaluation mode to disable dropout and other training-specific layers.
        if hasattr(generator, 'model'):
            generator.model.eval()
        
        # Process the DataFrame in batches
        for i in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
            # Create a batch of entries (each is a dict with "doc" and "code")
            batch_entries = df.iloc[i:i+batch_size][["corpus_id", "query_id", "doc", "code"]].to_dict("records")
            
            # Wrap inference in a no_grad context to prevent gradient computations.
            with torch.no_grad():
                batch_explanations = generator.generate_explanations_batch(batch_entries)
            
            for j, explanation_variants in enumerate(batch_explanations):
                print(type(batch_explanations))
                for idx, raw_text in enumerate(explanation_variants):
                    # print(raw_text)
                    # Clean the output using the clean_output function.
                    cleaned_text = raw_text.split("Answer:\n")[-1]
                    # cleaned_text = clean_output(raw_text.strip())
                    # Optionally remove any "Answer: " prefix.
                    # parts = cleaned_text.split("Answer:\n\n")
                    # if len(parts) > 1:
                    #     explanation_text = parts[1].strip()
                    # else:
                    #     explanation_text = cleaned_text
                    # print(cleaned_text)
                    # Save each explanation variant into its designated column.
                    df.loc[i+j, f'explanation_{model_key}_{idx+1}'] = raw_text
            
            torch.cuda.empty_cache()
            gc.collect()
        
        del generator
        torch.cuda.empty_cache()
        gc.collect()
    
    # Save the DataFrame with the new explanation columns to a CSV file
    # df.to_csv(output_csv_path, index=False)
    # print(f"\nExplanations from all models have been saved to {output_csv_path}")


Data loaded from CSV


Processing models:   0%|          | 0/1 [00:00<?, ?it/s]


Processing model deepseek...


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Processing batches:   0%|          | 0/2 [06:13<?, ?it/s]
Processing models:   0%|          | 0/1 [09:43<?, ?it/s]


KeyboardInterrupt: 

In [None]:
df.head()

In [13]:
df['cleaned_exp_deepseek'] = df['explanation_deepseek'].apply(clean_output)

In [14]:
df['cleaned_exp_deepseek'].unique()

array(['\n\nThe doc and code are related to writing a boolean value (1) to a stream.',
       '\n\nThe code pastes system clipboard contents into memory, allowing selection from either primary or clipboard.',
       '\n\nThe code snippet is a function `_format_json` that takes `data` and `theme` as arguments. It converts `data` into JSON format using `json.dumps` with `indent=2` and `sort_keys=True`. If `pygments` is enabled and `sys.stdout.isatty()` returns `True`, it uses the `Terminal256Formatter` from `pygments` to highlight the JSON output with a terminal style. Otherwise, it returns the raw JSON output.',
       '\n\nThe doc explains creating an absolute path from a relative path, and the code snippet shows that the function checks if the path exists and creates it if not.',
       "\n\nThe doc and code are about converting an array or list into a scalar, row vector, or column vector. The code checks the type of `x` and converts it accordingly. If `x` is a list or tuple, it's con

In [11]:

#Merging all the explanations
json_path_model1 = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_explanations_deepseek.json"  # e.g., explanations from llama3
json_path_model2 = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_explanations_llama2.json"  # e.g., explanations from deepseek
json_path_model3 = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_explanations_gemma.json"  # e.g., explanations from gemma
json_path_model4 = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_explanations_llama3-instruct.json"
json_path_model5 = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_explanations_llama3.json"

output_json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_merged_explanations.json"

with open(json_path_model1, "r") as f:
    data1 = json.load(f)
with open(json_path_model2, "r") as f:
    data2 = json.load(f)
with open(json_path_model3, "r") as f:
    data3 = json.load(f)
with open(json_path_model4, "r") as f:
    data4 = json.load(f)
with open(json_path_model5, "r") as f:
    data5 = json.load(f)


merged_entries = []

for entry1, entry2, entry3,entry4,entry5 in zip(data1, data2, data3, data4 ,data5):
    merged_entry = {
        "doc": entry1["doc"],
        "code": entry1["code"],
        "explanations": {}
    }
    merged_entry["explanations"].update(entry1.get("explanations", {}))
    merged_entry["explanations"].update(entry2.get("explanations", {}))
    merged_entry["explanations"].update(entry3.get("explanations", {}))
    merged_entry["explanations"].update(entry4.get("explanations", {}))
    merged_entry["explanations"].update(entry4.get("explanations", {}))
    merged_entry["explanations"].update(entry5.get("explanations", {}))


    
    merged_entries.append(merged_entry)

with open(output_json_path, "w") as f:
    json.dump(merged_entries, f, indent=4)

print("Merged JSON saved to", output_json_path)


Merged JSON saved to /work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_merged_explanations.json


In [12]:
import os
import json
import torch
import gc
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sentence_transformers import SentenceTransformer


# Initialize model name (using CodeBERT as an example encoder)
# model_name = "/datasets/ai/llama3/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6"
# model_name = "microsoft/codebert-base"


# Load the tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

## Llama2
# def get_embeddings(text, max_length=512):
#     """
#     Given a text string, returns a fixed-size embedding by mean-pooling
#     the encoder's output tokens.
#     """
#     inputs = tokenizer(
#         text,
#         return_tensors="pt",
#         truncation=True,
#         padding=True,
#         max_length=max_length
#     )
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     with torch.no_grad():
#         outputs = model(**inputs)  # outputs.last_hidden_state: (batch, seq_length, hidden_size)
#     embedding = outputs.last_hidden_state.mean(dim=1)  # shape: (batch, hidden_size)
#     return embedding.squeeze().cpu().numpy().tolist()
def get_cls_embedding(text, max_length=512):
    """
    Given a text string, returns the CLS token embedding as a list.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the first token's embedding (commonly the [CLS] token).
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.squeeze().cpu().numpy().tolist()

# Paths to the merged JSON file.
merged_json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_merged_explanations.json"
output_json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_embeddings_explanations_.json"

with open(merged_json_path, "r") as f:
    data = json.load(f)

# For each entry, compute embeddings for the concatenation of doc, code, and explanation.
# The resulting embedding is stored for each model key.
for entry in tqdm(data, desc="Computing combined embeddings"):
    entry["embeddings"] = {}
    doc = entry.get("doc", "").strip()
    code = entry.get("code", "").strip()
    explanations = entry.get("explanations", {})
    for model_key, explanation in explanations.items():
        explanation = explanation.strip()
        # Combine doc, code, and explanation into one text.
        combined_text = f"Doc: {doc}\nCode: {code}\nExplanation: {explanation}"
        if not combined_text.strip():
            entry["embeddings"][model_key] = []
        else:
            embedding = get_cls_embedding(combined_text)
            entry["embeddings"][model_key] = embedding

with open(output_json_path, "w") as f:
    json.dump(data, f, indent=4)

torch.cuda.empty_cache()
gc.collect()

print("Combined embeddings have been saved to", output_json_path)


NameError: name 'model_name' is not defined

In [13]:


encoder = SentenceTransformer("all-MiniLM-L6-v2")

def get_minilm_embedding(combined_text: str) -> list:
    
    embedding = encoder.encode(combined_text, show_progress_bar=False)
    return embedding.tolist()

merged_json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_merged_explanations.json"
output_json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_embeddings_explanations_.json"

with open(merged_json_path, "r") as f:
    data = json.load(f)

for entry in tqdm(data, desc="Computing combined embeddings"):
    entry["embeddings"] = {}
    doc = entry.get("doc", "").strip()
    code = entry.get("code", "").strip()
    explanations = entry.get("explanations", {})
    for model_key, explanation in explanations.items():
        explanation = explanation.strip()
        combined_text = f"Doc: {doc}\nCode: {code}\nExplanation: {explanation}"
        if not combined_text.strip():
            entry["embeddings"][model_key] = []
        else:
            embedding = get_minilm_embedding(combined_text)
            entry["embeddings"][model_key] = embedding

with open(output_json_path, "w") as f:
    json.dump(data, f, indent=4)

torch.cuda.empty_cache()
gc.collect()

print("Merged explanations with embeddings have been saved to", output_json_path)


Computing combined embeddings: 100%|██████████| 313/313 [00:05<00:00, 55.15it/s]


Merged explanations with embeddings have been saved to /work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_embeddings_explanations_.json


In [23]:
# model_name = "sentence-transformers/LaBSE"
# encoder = SentenceTransformer(model_name)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# encoder.to(device)

# def get_sbert_embedding(text):

#     embedding = encoder.encode(text, convert_to_tensor=True, device=device)
#     return embedding.cpu().numpy().tolist()


In [30]:

### getting the similarity score for each embeddings

def cosine_similarity(a, b):
    """
    Compute cosine similarity between two vectors.
    """
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))


input_json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/cosqa-dev_embeddings_explanations_.json"  # update this path
# output_json_path = "/work/pi_wenlongzhao_umass_edu/27/anamikaghosh/similarity_scores.json"                   # update this path

with open(input_json_path, "r") as f:
    data = json.load(f)
print("Merged explanations loaded.")

###### questions ####
query = "How to round a float to int"  
query_embedding = get_minilm_embedding(query)


results = pd.DataFrame(columns=['doc','code','best_explanation','max_similarity','best_model'])
for entry in tqdm(data, desc="Processing entries"):
    max_sim = -1.0
    best_model = None
    embeddings = entry.get("embeddings", {})
    for model_key, emb in embeddings.items():
        if emb:  # Ensure the embedding is not empty
            emb_np = np.array(emb)
            sim = cosine_similarity(query_embedding, emb_np)
            if sim > max_sim:
                max_sim = sim
                best_model = model_key
    new_row = {
        "doc": entry.get("doc", ""),
        "code": entry.get("code", ""),
        "best_explanation": entry.get("explanations", {}).get(best_model, ""),
        "max_similarity": max_sim,
        "best_model": best_model
    }
    
    results.loc[len(results)] = new_row



# df = pd.DataFrame(results)
# print("DataFrame created:")
# print(df)


Merged explanations loaded.


Processing entries: 100%|██████████| 313/313 [00:00<00:00, 1422.47it/s]


In [31]:
results

Unnamed: 0,doc,code,best_explanation,max_similarity,best_model
0,python split strings into list of lines,"def split_multiline(value):\n """"""Split a mu...",</think>\n\nThe function `split_multiline` tak...,0.021005,deepseek
1,loading a series of images in python and resiz...,def load_preprocess_images(image_paths: List[s...,The code loads a list of images specified with...,0.019301,llama2
2,python save graph into file,"def to_dotfile(G: nx.DiGraph, filename: str):\...",The code defines a function called `to_dotfile...,-0.031910,gemma
3,add color to print python,"def write_color(string, name, style='normal', ...",The code defines a function called `write_colo...,-0.027941,gemma
4,python limit number to two decimals,"def truncate(value: Decimal, n_digits: int) ->...",This function truncates a decimal value to a m...,0.424926,llama2
...,...,...,...,...,...
308,get window title in python,"def title(self):\n """""" The title of thi...",The doc string explains that the title of a wi...,-0.042896,llama3
309,if matches a set of strings python,"def any_contains_any(strings, candidates):\n ...",The docstring defines a function that checks i...,-0.023504,llama3-instruct
310,python detect type of namedtuple,"def isnamedtuple(obj):\n """"""Heuristic check...",The code defines a function `isnamedtuple` tha...,0.030626,gemma
311,how to define a empty data frame in python,"def add_blank_row(self, label):\n """"""\n...",This code snippet defines a function called `a...,0.034638,llama3


In [32]:
results.describe()

Unnamed: 0,max_similarity
count,313.0
mean,0.073342
std,0.111136
min,-0.097918
25%,0.000571
50%,0.046716
75%,0.127472
max,0.663343


In [33]:
results.sort_values(by=['max_similarity'], ascending=False)

Unnamed: 0,doc,code,best_explanation,max_similarity,best_model
164,python 3 rounding or floats,"def py3round(number):\n """"""Unified rounding...",The code defines a function `py3round` that un...,0.663343,gemma
86,python round to three significant digits,"def round_sig(x, sig):\n """"""Round the numbe...",The doc string provides a description of the f...,0.580077,llama3-instruct
267,f strings number rounding formatting python,"def _saferound(value, decimal_places):\n """"...",_saferound is a function to round a float valu...,0.525706,llama2
254,how to get a random float between 0 and 1 in p...,"def money(min=0, max=10):\n """"""Return a str...",</think>\n\nTo generate a random float between...,0.467388,deepseek
117,generates a random decimal number between 1 an...,"def money(min=0, max=10):\n """"""Return a str...",Doc string: generates a random decimal number ...,0.439706,llama2
...,...,...,...,...,...
148,check if a string start with a prefix in python,"def starts_with_prefix_in_list(text, prefixes)...",The code checks if a given text starts with an...,-0.071353,deepseek
227,checking what linux distibution is being used ...,"def is_archlinux():\n """"""return True if the...",The above code is used to determine whether th...,-0.084472,llama3
135,python function to remove all non english letters,"def clean(self, text):\n """"""Remove all ...",The docstring explains the purpose of the func...,-0.085364,llama3-instruct
301,escaping characters for non printable characte...,"def _escape(s):\n """""" Helper method that es...",The code defines a helper function _escape tha...,-0.089189,gemma
