In [1]:
from datasets import load_dataset
import random
from transformers import pipeline
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm import tqdm
import evaluate

# Loading Dataset

The dataset can be found at https://huggingface.co/datasets/Fsoft-AIC/the-vault-function

In [2]:
# Load The Vault function-level dataset from the Hugging Face Hub, requesting only the test split.
dataset = load_dataset("Fsoft-AIC/the-vault-function", split_set=["test"])

python-00000-of-00001.parquet:   0%|          | 0.00/30.3M [00:00<?, ?B/s]

c-00000-of-00001.parquet:   0%|          | 0.00/25.9M [00:00<?, ?B/s]

c_sharp-00000-of-00001.parquet:   0%|          | 0.00/18.5M [00:00<?, ?B/s]

cpp-00000-of-00001.parquet:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

go-00000-of-00001.parquet:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

java-00000-of-00001.parquet:   0%|          | 0.00/19.3M [00:00<?, ?B/s]

javascript-00000-of-00001.parquet:   0%|          | 0.00/26.4M [00:00<?, ?B/s]

php-00000-of-00001.parquet:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

ruby-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

rust-00000-of-00001.parquet:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Here are the columns in the dataset. We will be using `original_docstring` for documentation and `original_string` for code.

In [3]:
# Materialise the test split as a DataFrame so it can be grouped and sampled with pandas below.
test_df = pd.DataFrame(dataset['test'])

In [4]:
# Display the column names available in the test split.
test_df.columns

Index(['hexsha', 'repo', 'path', 'license', 'language', 'identifier',
       'return_type', 'original_string', 'original_docstring', 'docstring',
       'docstring_tokens', 'code', 'code_tokens', 'short_docstring',
       'short_docstring_tokens', 'comment', 'parameters', 'docstring_params'],
      dtype='object')

Draw a random sample of 20 functions per language from the test split to test our baseline model. We will increase the sample size once we finalize the experimental plan.

In [5]:
# Number of functions drawn from each language group.
SAMPLE_SIZE_PER_LANGUAGE = 20
# Fixed seed so that re-running the notebook evaluates the same examples;
# without it every kernel restart draws a different sample and the reported
# metrics cannot be reproduced.
SAMPLE_SEED = 42

dataset_sample = test_df.groupby('language', group_keys=False).sample(
    n=SAMPLE_SIZE_PER_LANGUAGE,
    random_state=SAMPLE_SEED,
)

# Testing the Baseline model

Here we define the system prompt for the Llama 2 model.

In [6]:
def prompt(language):
    """Build the system prompt asking the model to simplify documentation.

    Args:
        language: Name of the programming language of the code that accompanies
            the documentation; it is interpolated into the prompt text.

    Returns:
        The system prompt as a single multi-line string.
    """
    # The literal deliberately starts at column 0: any indentation inside the
    # triple-quoted string would become part of the prompt sent to the model.
    return (
f'''You are a helpful agent designed to simplify code documentation for beginner programmers.
You will be provided with a block of {language} code and the existing documentation that accompanies it.
Simplify the given documentation, using the provided code as context, so that it is understandable
to beginner programmers. Output absolutely nothing else besides the simplified documentation.
Make sure to keep any documentation formatting codes present in the simplified documentation.
If you feel that the existing documentation is simple enough and meaning would be lost by simplifying
it further, feel free to keep the documentation as is. Here is the original documentation and code:'''
    )

Creating the pipeline for the Llama 2 model using the HuggingFace transformers library. Modified from the example here: https://huggingface.co/docs/transformers/en/model_doc/llama2

In [8]:
# Text-generation pipeline for the Llama 2 7B chat checkpoint, loaded in
# half precision and placed on CUDA device index 3.
pipeline_kwargs = {
    "task": "text-generation",
    "model": "meta-llama/Llama-2-7b-chat-hf",
    "torch_dtype": torch.float16,
    "device": 3,
}
pipe = pipeline(**pipeline_kwargs)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:3


Testing the pipeline. Modified from examples given here: https://huggingface.co/docs/transformers/main/en/chat_templating

In [9]:
# Smoke-test the pipeline on the first example of the test split.
first_example = dataset['test'][0]
original_doc = first_example['original_docstring']
source_code = first_example['original_string']

message = [
    {"role": "system", "content": prompt(first_example['language'])},
    {"role": "user", "content": f"Documentation:\n{original_doc}\n\nCode:\n{source_code}"},
]

print(f"Original Documentation:\n{original_doc}\n")
print(f"Code:\n{source_code}\n")

# The last message of the returned conversation is taken as the model's reply.
chat_output = pipe(message, pad_token_id=pipe.tokenizer.eos_token_id)
print(chat_output[0]['generated_text'][-1]['content'])

Original Documentation:
Build the library mappings tables.

Code:
def build_mapping_tables(app):
    """Build the library mappings tables."""
    env = Environment(loader=FileSystemLoader(f"{DIR_PATH}"))
    template_file = env.get_template("table_template.j2")

    LIST_OF_MAP_DICTS = []
    for attr in dir(lib_mapper):
        if (attr.endswith("MAPPER_REVERSE") or attr.endswith("_MAPPER")) and not (
            attr.startswith("_") or attr.startswith("NETMIKO") or attr.startswith("MAIN")
        ):
            LIST_OF_MAP_DICTS.append(attr)

    for dict_name in LIST_OF_MAP_DICTS:
        lib_name = dict_name.split("_")[0]
        filename = f"{lib_name}_reverse" if "REVERSE" in dict_name else lib_name
        headers = ["NORMALIZED", lib_name] if "REVERSE" in dict_name else [lib_name, "NORMALIZED"]
        rendered_template = template_file.render(lib_names=headers, mappings=getattr(lib_mapper, dict_name))
        with open(f"{DIR_PATH}/netutils/lib_mapping/{filename}_table.rst", "w

Loading the evaluation model used for computing semantic similarity. Taken from the example here: https://huggingface.co/tasks/sentence-similarity

In [10]:
# Sentence-embedding model used below to compare the original and generated documentation.
eval_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Running inference on the dataset sample

In [16]:
# Per-example cosine similarity between original and generated documentation.
semantic_similarities = []
# ROUGE and METEOR are accumulated one example at a time and computed afterwards.
metrics = evaluate.combine(['rouge', 'meteor'])

# Loop-invariant: the EOS token doubles as the padding token for generation.
pad_token_id = pipe.tokenizer.eos_token_id

# itertuples() is a generator with no length, so tqdm needs an explicit total
# to show a percentage and an ETA for this long-running loop.
for instance in tqdm(
    dataset_sample.itertuples(),
    total=len(dataset_sample),
    desc="Simplifying documentation",
):
    message = [
        {"role": "system", "content": prompt(instance.language)},
        {"role": "user", "content": f"Documentation:\n{instance.original_docstring}\n\nCode:\n{instance.original_string}"}
    ]

    # The last message of the returned conversation is taken as the model's reply.
    result = pipe(message, pad_token_id=pad_token_id)[0]['generated_text'][-1]['content']

    embedding_original = eval_model.encode(instance.original_docstring, convert_to_tensor=True)
    embedding_predicted = eval_model.encode(result, convert_to_tensor=True)

    # The original docstring serves as the reference for every metric.
    semantic_similarities.append(util.pytorch_cos_sim(embedding_original, embedding_predicted).item())
    metrics.add(predictions=result, references=instance.original_docstring)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
9it [01:30, 11.89s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
200it [24:58,  7.49s/it]


Summary statistics for semantic similarity results

In [19]:
# Collect the per-example similarity scores into a single-column DataFrame.
sims = pd.DataFrame(semantic_similarities)
# Persist the raw scores next to the notebook.
sims.to_excel('semantic_similarities_baseline.xlsx')
# Last expression of the cell: display the summary statistics.
sims.describe()

Unnamed: 0,0
count,200.0
mean,0.649289
std,0.161408
min,0.161233
25%,0.553146
50%,0.665742
75%,0.775409
max,0.967906


ROUGE and METEOR results

In [None]:
# Compute ROUGE and METEOR over every prediction/reference pair added during the inference loop.
metrics.compute()

{'rouge1': 0.23746285790317642,
 'rouge2': 0.15850905081486888,
 'rougeL': 0.20873611382827575,
 'rougeLsum': 0.23081417919447733,
 'meteor': 0.35096813813331496}