In [1]:
from datasets import load_dataset
import random
from transformers import pipeline
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
#from tqdm import tqdm
from tqdm.notebook import tqdm
import evaluate

In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
#os.environ["TORCH_USE_CUDA_DSA"] = "1"

In [None]:
#dataset = load_dataset('code-search-net/code_search_net')
dataset = load_dataset("Fsoft-AIC/the-vault-function", split_set=["test"], trust_remote_code=True)

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
sample_df = pd.DataFrame(dataset['test'])
sample_df.columns

Index(['hexsha', 'repo', 'path', 'license', 'language', 'identifier',
       'return_type', 'original_string', 'original_docstring', 'docstring',
       'docstring_tokens', 'code', 'code_tokens', 'short_docstring',
       'short_docstring_tokens', 'comment', 'parameters', 'docstring_params'],
      dtype='object')

In [5]:
dataset_sample = sample_df.groupby('language', group_keys=False).sample(n=20)
dataset_sample.columns

Index(['hexsha', 'repo', 'path', 'license', 'language', 'identifier',
       'return_type', 'original_string', 'original_docstring', 'docstring',
       'docstring_tokens', 'code', 'code_tokens', 'short_docstring',
       'short_docstring_tokens', 'comment', 'parameters', 'docstring_params'],
      dtype='object')

# Testing the Baseline model

Here we define the system prompt for the Llama 2 model.

In [6]:
def test_prompt(language, documentation, code):
    return \
    f'''You are a helpful agent designed to simplify code documentation for beginner programmers.
    You will be provided with a block of {language} code and the existing doucmentation that accompanies it.
    Simplify the given documentation, using the provided code as context, so that it is understandable
    to beginner programmers. Output absolutely nothing else besides the simplified documentation.
    Make sure to keep any documentation formatting codes present in the simplified documentation.
    If you feel that the existing documentation is simple enough and meaning would be lost by simplifying
    it further, feel free to keep the documentation as is. Here is the original documentation and code:\n
    Documentation:\n{documentation}\n\nCode:\n{code}'''

Creating the pipeline for the Gemma 3 model using the HuggingFace transformers library. Modified from the example here: https://huggingface.co/docs/transformers/v4.51.3/en/model_doc/gemma#gemma

In [7]:
"""pipe_gemma = pipeline(
    "text2text-generation",
    model="google/gemma-3-4b-it",
    torch_dtype=torch.float32,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)"""
pipe_gemma = pipeline(
    "text2text-generation",
    model="google/gemma-3-4b-it",
    torch_dtype=torch.float32,
    device="cpu"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu
The model 'Gemma3ForConditionalGeneration' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditio

In [8]:
eval_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [9]:
"""message = [
    {"role": "system", "content": test_prompt(dataset['test'][0]['language'])},
    {"role": "user", "content": f"Documentation:\n{dataset['test'][0]['original_docstring']}\n\nCode:\n{dataset['test'][0]['original_string']}"}
]"""
lan = dataset['test'][0]['language']
code = dataset['test'][0]['original_string']
doc = dataset['test'][0]['original_docstring']
print(f"Original Documentation:\n{dataset['test'][0]['original_docstring']}\n")
print(f"Code:\n{dataset['test'][0]['original_string']}\n")
print("***********************Result************************************")
print(pipe_gemma(test_prompt(lan, doc, code), pad_token_id=pipe_gemma.tokenizer.eos_token_id, max_new_tokens=30)[0]['generated_text'])

Original Documentation:
Build the library mappings tables.

Code:
def build_mapping_tables(app):
    """Build the library mappings tables."""
    env = Environment(loader=FileSystemLoader(f"{DIR_PATH}"))
    template_file = env.get_template("table_template.j2")

    LIST_OF_MAP_DICTS = []
    for attr in dir(lib_mapper):
        if (attr.endswith("MAPPER_REVERSE") or attr.endswith("_MAPPER")) and not (
            attr.startswith("_") or attr.startswith("NETMIKO") or attr.startswith("MAIN")
        ):
            LIST_OF_MAP_DICTS.append(attr)

    for dict_name in LIST_OF_MAP_DICTS:
        lib_name = dict_name.split("_")[0]
        filename = f"{lib_name}_reverse" if "REVERSE" in dict_name else lib_name
        headers = ["NORMALIZED", lib_name] if "REVERSE" in dict_name else [lib_name, "NORMALIZED"]
        rendered_template = template_file.render(lib_names=headers, mappings=getattr(lib_mapper, dict_name))
        with open(f"{DIR_PATH}/netutils/lib_mapping/{filename}_table.rst", "w

In [10]:
gemma_semantic_similarities_untrained = []
gemma_metrics_untrained = evaluate.combine(['rouge', 'meteor'])

for instance in tqdm(dataset_sample.itertuples()):
    lan = instance.language
    code = instance.original_docstring
    doc = instance.original_string

    message = test_prompt(lan, doc, code)

    result = pipe_gemma(message, pad_token_id=pipe_gemma.tokenizer.eos_token_id, max_new_tokens=30)[0]['generated_text']

    #result = output.replace(message, "")

    embedding_original = eval_model.encode(instance.original_docstring, convert_to_tensor=True)
    embedding_predicted = eval_model.encode(result, convert_to_tensor=True)

    gemma_semantic_similarities_untrained.append(util.pytorch_cos_sim(embedding_original, embedding_predicted).item())
    gemma_metrics_untrained.add(predictions=result, references=instance.original_docstring)

[nltk_data] Downloading package wordnet to /home/adeniji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/adeniji/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/adeniji/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0it [00:00, ?it/s]

In [None]:
print("Untrained Gemma 3: \n")
sims_untrianed = pd.DataFrame(gemma_semantic_similarities_untrained)
sims_untrianed.to_excel('results/Semantic_Similarities_Gemma_untrained.xlsx')
sims_untrianed.describe()

Untrained Gemma 3: 



Unnamed: 0,0
count,200.0
mean,0.502455
std,0.130216
min,0.128034
25%,0.418758
50%,0.515865
75%,0.604241
max,0.771373


In [None]:
import json
with open('results/rouge_meteor_gemma_untrained.json', 'w') as file:
    mr = gemma_metrics_untrained.compute()
    json.dump(mr, file, indent=4)

# Testing the FineTuned model

Here we Import the training examples from an excel file.

In [13]:
train_df = pd.read_excel("self_training_annotated.xlsx", sheet_name="Sheet1", usecols=[5, 8, 9, 16])
training_sample = train_df.groupby('language', group_keys=False).sample(n=5)
training_sample.columns

Index(['language', 'original_string', 'original_docstring',
       'modified_short_docstring'],
      dtype='object')

In [14]:
def training_prompt(language, documentation, code, modified):
    return \
    f'''You are a helpful agent designed to simplify code documentation for beginner programmers.
    You will be provided with a block of {language} code and the existing doucmentation that accompanies it.
    Simplify the given documentation, using the provided code as context, so that it is understandable
    to beginner programmers. Output absolutely nothing else besides the simplified documentation.
    Make sure to keep any documentation formatting codes present in the simplified documentation.
    If you feel that the existing documentation is simple enough and meaning would be lost by simplifying
    it further, feel free to keep the documentation as is. Here is the original documentation and code:\n
    Code:\n{code}\n\nDocumentation:\n{documentation}\n\nModified documentation:\n{modified}'''

In [16]:
for instance in tqdm(training_sample.itertuples()):
    lan = instance.language
    code = instance.original_docstring
    doc = instance.original_string
    mod = instance.modified_short_docstring

    message = training_prompt(lan, doc, code, mod)

    pipe_gemma(message, pad_token_id=pipe_gemma.tokenizer.eos_token_id, max_new_tokens=45)

0it [00:00, ?it/s]

In [22]:
pipe_gemma.device = "cpu"

In [24]:
gemma_semantic_similarities_trained = []
gemma_metrics_trained = evaluate.combine(['rouge', 'meteor'])


for instance in tqdm(dataset_sample.itertuples()):
    lan = instance.language
    code = instance.original_docstring
    doc = instance.original_string

    message = test_prompt(lan, doc, code)

    result = pipe_gemma(message, pad_token_id=pipe_gemma.tokenizer.eos_token_id, max_new_tokens=30)[0]['generated_text']

    embedding_original = eval_model.encode(instance.original_docstring, convert_to_tensor=True)
    embedding_predicted = eval_model.encode(result, convert_to_tensor=True)

    gemma_semantic_similarities_trained.append(util.pytorch_cos_sim(embedding_original, embedding_predicted).item())
    gemma_metrics_trained.add(predictions=result, references=instance.original_docstring)

[nltk_data] Downloading package wordnet to /home/adeniji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/adeniji/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/adeniji/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0it [00:00, ?it/s]

AttributeError: 'str' object has no attribute 'type'

# Summary statistics results
Semantic Similarity

In [None]:
print("Trained Gemma 3: \n")
sims_trained = pd.DataFrame(gemma_semantic_similarities_trained)
sims_trained.to_excel('results/Semantic_Similarities_Gemma_trained.xlsx')
sims_trained.describe()

ROUGE AND METEOR

In [None]:
with open('results/rouge_meteor_gemma_trained.json', 'w') as file:
    mr = gemma_metrics_trained.compute()
    json.dump(mr, file, indent=4)
