In [1]:
from datasets import load_dataset
import random
from transformers import pipeline
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm import tqdm
import evaluate

# Loading Dataset

The dataset can be found at https://huggingface.co/datasets/code-search-net/code_search_net

In [2]:
dataset = load_dataset('code-search-net/code_search_net')

Here are the columns in the dataset. We will be using `func_documentation_string` for documentation and `func_code_string` for code.

In [12]:
train_df = pd.DataFrame(dataset['train'])

In [13]:
train_df.columns

Index(['repository_name', 'func_path_in_repository', 'func_name',
       'whole_func_string', 'language', 'func_code_string', 'func_code_tokens',
       'func_documentation_string', 'func_documentation_tokens', 'split_name',
       'func_code_url'],
      dtype='object')

In [15]:
self_training_sample = train_df.groupby('language', group_keys=False).sample(n=20)
self_training_sample.to_excel('self_training_sample.xlsx')

Getting a random sample from the test split to test our baseline model. We will increase the size once we finalize the experimental plan

In [5]:
dataset_sample = random.choices(dataset['test'], k=50)

# Testing the Baseline model

Here we define the system prompt for the Llama 2 model.

In [6]:
prompt = \
'''You are a helpful agent designed to simplify code documentation for beginner programmers.
You will be provided with a block of code and the existing doucmentation that accompanies it.
Simplify the given documentation, using the provided code as context, so that it is understandable
to beginner programmers. Output absolutely nothing else besides the simplified documentation.
Make sure to keep any documentation formatting codes present in the simplified documentation.
If you feel that the existing documentation is simple enough and meaning would be lost by simplifying
it further, feel free to keep the documentation as is. Here is the original documentation and code:'''

Creating the pipeline for the Llama 2 model using the HuggingFace transformers library. Modified from the example here: https://huggingface.co/docs/transformers/en/model_doc/llama2

In [7]:
pipe = pipeline(
    task="text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    torch_dtype=torch.float16,
    device=1
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Testing the pipeline. Modified from examples given here: https://huggingface.co/docs/transformers/main/en/chat_templating

In [8]:
message = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": f"Documentation:\n{dataset['test'][0]['func_documentation_string']}\n\nCode:\n{dataset['test'][0]['func_code_string']}"}
]
print(f"Original Documentation:\n{dataset['test'][0]['func_documentation_string']}\n")
print(f"Code:\n{dataset['test'][0]['func_code_string']}\n")
print(pipe(message, pad_token_id=pipe.tokenizer.eos_token_id)[0]['generated_text'][-1]['content'])

Original Documentation:
Extracts video ID from URL.

Code:
def get_vid_from_url(url):
        """Extracts video ID from URL.
        """
        return match1(url, r'youtu\.be/([^?/]+)') or \
          match1(url, r'youtube\.com/embed/([^/?]+)') or \
          match1(url, r'youtube\.com/v/([^/?]+)') or \
          match1(url, r'youtube\.com/watch/([^/?]+)') or \
          parse_query_param(url, 'v') or \
          parse_query_param(parse_query_param(url, 'u'), 'v')

  Simplified documentation:

This function extracts the video ID from a URL. It uses regular expressions to search for the video ID in various parts of the URL, including the domain name, query parameters, and the URL path. If the video ID is not found in any of these places, it falls back on parsing the query parameters or the URL path of the parent URL.

Here are the specific patterns used by the function:

* `youtu.be/([^?/]+)`: Matches the URL prefix "youtu.be" followed by any characters that are not a question mark or 

Loading the evaluation model used for computing semantic similarity. Taken from example here: https://huggingface.co/tasks/sentence-similarity

In [None]:
eval_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Running inference on the dataset sample

In [None]:
semantic_similarities = []
metrics = evaluate.combine(['rouge', 'meteor'])

for instance in tqdm(dataset_sample):
    message = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": f"Documentation:\n{instance['func_documentation_string']}\n\nCode:\n{instance['func_code_string']}"}
    ]

    result = pipe(message, pad_token_id=pipe.tokenizer.eos_token_id)[0]['generated_text'][-1]['content']

    embedding_original = eval_model.encode(instance['func_documentation_string'], convert_to_tensor=True)
    embedding_predicted = eval_model.encode(result, convert_to_tensor=True)

    semantic_similarities.append(util.pytorch_cos_sim(embedding_original, embedding_predicted).item())
    metrics.add(predictions=result, references=instance['func_documentation_string'])

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  2%|▏         | 1/50 [00:06<04:55,  6.02s/it]

tensor([-4.0617e-02, -2.5245e-02, -3.3898e-02,  3.4955e-02, -7.8197e-02,
        -4.5097e-02,  2.1003e-02, -3.5388e-03, -4.2873e-02, -1.6457e-02,
        -2.1768e-02, -1.0718e-01, -1.4750e-02,  2.3852e-02,  7.9610e-02,
        -2.7136e-02, -1.7002e-02,  1.4127e-02,  3.1038e-02, -4.9230e-02,
         6.4198e-02,  1.0624e-01, -8.7202e-02,  3.1809e-02, -2.4426e-02,
         3.1333e-02, -1.0073e-02,  9.0686e-03,  3.7630e-02,  1.5045e-02,
         6.4684e-02, -4.9418e-02, -1.6244e-02,  1.0994e-01,  1.9740e-02,
         6.8167e-02, -1.7478e-02,  3.1046e-02,  2.0097e-02, -6.5180e-02,
         1.8099e-02, -3.5956e-02, -5.0890e-02, -2.6844e-02, -1.0052e-02,
        -1.1673e-02,  2.1438e-02,  1.7858e-02, -8.2263e-02, -6.2854e-02,
        -5.4827e-02, -9.7541e-02, -4.2843e-02, -5.2755e-03, -3.6599e-02,
         3.7040e-03,  9.3160e-03,  1.9749e-02, -4.8249e-02, -1.1434e-03,
        -1.1182e-01, -1.8309e-03, -4.6854e-02, -1.3499e-02, -5.2136e-02,
         2.8388e-02,  2.2618e-02,  4.9271e-02,  1.9

  4%|▍         | 2/50 [00:10<04:00,  5.02s/it]

tensor([-6.0239e-02,  1.3712e-01, -6.9470e-03,  1.5242e-02, -5.1226e-03,
         1.6771e-02,  9.9646e-03,  5.7874e-02,  6.5670e-03,  1.5901e-02,
         7.4522e-02, -1.2004e-01,  6.4690e-02, -3.5782e-03,  6.9950e-02,
         8.6913e-02, -5.2090e-02,  3.0828e-02,  1.6852e-03,  4.6658e-02,
         1.1616e-01,  7.5936e-02, -6.7552e-02, -3.6469e-02,  5.4639e-02,
         3.1610e-02,  6.8702e-02,  4.3317e-02,  7.3390e-02,  1.1406e-03,
         2.5449e-02,  2.6494e-02,  6.3689e-02, -9.0704e-03,  1.1250e-02,
         8.3471e-02, -8.4939e-02, -5.8509e-02, -1.1904e-02,  1.9717e-02,
        -4.3615e-02, -7.6456e-03,  3.9391e-02, -1.4664e-02, -1.6879e-02,
        -9.7906e-03,  9.1812e-02, -7.4593e-02, -2.7284e-02, -1.5149e-03,
         1.9102e-02, -7.1336e-03, -4.3299e-02, -2.6872e-03, -6.2507e-02,
         3.3082e-02, -6.8758e-02, -7.7160e-02, -6.0820e-02, -7.4388e-02,
        -2.5024e-02,  1.1139e-02,  6.0829e-02,  9.2320e-03,  2.6255e-04,
        -2.0452e-03, -1.1974e-01, -1.2278e-02,  7.0

  4%|▍         | 2/50 [00:14<05:53,  7.36s/it]


KeyboardInterrupt: 

Summary statistics for semantic similarity results

In [None]:
pd.DataFrame(semantic_similarities).describe()

Unnamed: 0,0
count,50.0
mean,0.665009
std,0.182714
min,0.088799
25%,0.558814
50%,0.683454
75%,0.791257
max,0.926352


ROUGE and METEOR results

In [17]:
metrics.compute()

{'rouge1': 0.280522090651527,
 'rouge2': 0.19042346879502914,
 'rougeL': 0.24326669815314184,
 'rougeLsum': 0.27162905999224796,
 'meteor': 0.3745646064192065}