In [24]:
from datasets import load_dataset
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
from PIL import Image
import requests
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
#from tqdm import tqdm
from tqdm.notebook import tqdm
import evaluate
from sklearn.model_selection import train_test_split

In [7]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
#os.environ["TORCH_USE_CUDA_DSA"] = "1"

In [8]:
#dataset = load_dataset('code-search-net/code_search_net')
dataset = load_dataset("Fsoft-AIC/the-vault-function", split_set=["test"], trust_remote_code=True)

In [9]:
sample_df = pd.DataFrame(dataset['test'])
sample_df.columns

Index(['hexsha', 'repo', 'path', 'license', 'language', 'identifier',
       'return_type', 'original_string', 'original_docstring', 'docstring',
       'docstring_tokens', 'code', 'code_tokens', 'short_docstring',
       'short_docstring_tokens', 'comment', 'parameters', 'docstring_params'],
      dtype='object')

In [12]:
dataset_sample = sample_df.groupby('language', group_keys=False).sample(n=20)
dataset_sample.columns

Index(['hexsha', 'repo', 'path', 'license', 'language', 'identifier',
       'return_type', 'original_string', 'original_docstring', 'docstring',
       'docstring_tokens', 'code', 'code_tokens', 'short_docstring',
       'short_docstring_tokens', 'comment', 'parameters', 'docstring_params'],
      dtype='object')

# Testing the Baseline model

Here we define the system prompt for the Llama 2 model.

In [15]:
def llm_prompt(language, comment, code):
    prompt_gemma = \
    '''You are a helpful agent designed to simplify code documentation for beginner programmers.
    You will be provided with a block of C code and the existing doucmentation that accompanies it.
    Using the provided code as context, give a simplified explanation of the code so that it is understandable
    to beginner programmers. Output absolutely nothing else besides the simplified explanation.
    Make sure to keep any documentation formatting codes present in the simplified explanation.
    If you feel that the existing documentation is simple enough and meaning would be lost by simplifying
    it further, feel free to keep the documentation as is. Here is the original documentation and code:'''
    
    return prompt_gemma

Creating the pipeline for the Gemma 3 model using the HuggingFace transformers library. Modified from the example here: https://huggingface.co/docs/transformers/v4.51.3/en/model_doc/gemma#gemma

In [10]:
# Load model and tokenizer
model_name = "google/gemma-3-4b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
eval_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [16]:
lan = dataset['test'][0]['language']
code = dataset['test'][0]['original_string']
doc = dataset['test'][0]['original_docstring']
print(f"Original Documentation:\n{dataset['test'][0]['original_docstring']}\n")
print(f"Code:\n{dataset['test'][0]['original_string']}\n")
print("***********************Result************************************")
gemma_raw = llm_prompt(lan, doc, code)

#Tokenize Gemma raw
gemma_inputs = tokenizer(gemma_raw, return_tensors="pt").to(model.device)

# Generate output
with torch.no_grad():
    gemma_outputs = model.generate(**gemma_inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

# Decode and print
print(tokenizer.decode(gemma_outputs[0], skip_special_tokens=True))

Original Documentation:
Build the library mappings tables.

Code:
def build_mapping_tables(app):
    """Build the library mappings tables."""
    env = Environment(loader=FileSystemLoader(f"{DIR_PATH}"))
    template_file = env.get_template("table_template.j2")

    LIST_OF_MAP_DICTS = []
    for attr in dir(lib_mapper):
        if (attr.endswith("MAPPER_REVERSE") or attr.endswith("_MAPPER")) and not (
            attr.startswith("_") or attr.startswith("NETMIKO") or attr.startswith("MAIN")
        ):
            LIST_OF_MAP_DICTS.append(attr)

    for dict_name in LIST_OF_MAP_DICTS:
        lib_name = dict_name.split("_")[0]
        filename = f"{lib_name}_reverse" if "REVERSE" in dict_name else lib_name
        headers = ["NORMALIZED", lib_name] if "REVERSE" in dict_name else [lib_name, "NORMALIZED"]
        rendered_template = template_file.render(lib_names=headers, mappings=getattr(lib_mapper, dict_name))
        with open(f"{DIR_PATH}/netutils/lib_mapping/{filename}_table.rst", "w

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
generated_documents = []
gemma_semantic_similarities_untrained = []
gemma_metrics_untrained = evaluate.combine(['rouge', 'meteor'])

for instance in tqdm(dataset_sample.itertuples()):
    language = instance.language
    original_doc = instance.original_docstring
    orignal_code = instance.original_string

    message = llm_prompt(language, original_doc, orignal_code)

    #Tokenize Gemma raw
    gemma_inputs = tokenizer(message, return_tensors="pt").to(model.device)

    # Generate output
    with torch.no_grad():
        gemma_outputs = model.generate(**gemma_inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

    # Decode and print
    result = tokenizer.decode(gemma_outputs[0], skip_special_tokens=True)

    #result = result.replace(message, "")

    embedding_original = eval_model.encode(instance.original_docstring, convert_to_tensor=True)
    embedding_predicted = eval_model.encode(result, convert_to_tensor=True)

    gemma_semantic_similarities_untrained.append(util.pytorch_cos_sim(embedding_original, embedding_predicted).item())
    gemma_metrics_untrained.add(predictions=result, references=instance.original_docstring)

[nltk_data] Downloading package wordnet to /home/adeniji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/adeniji/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/adeniji/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0it [00:00, ?it/s]

In [21]:
print("Untrained Gemma 3: \n")
sims_untrianed = pd.DataFrame(gemma_semantic_similarities_untrained)
sims_untrianed.to_excel('results/Semantic_Similarities_Gemma_untrained.xlsx')
sims_untrianed.describe()

Untrained Gemma 3: 



Unnamed: 0,0
count,200.0
mean,0.588644
std,0.214955
min,-0.035175
25%,0.49776
50%,0.63305
75%,0.726025
max,1.0


In [22]:
import json
with open('results/rouge_meteor_gemma_untrained.json', 'w') as file:
    mr = gemma_metrics_untrained.compute()
    json.dump(mr, file, indent=4)

# Testing the FineTuned model

Here we Import the training examples from an excel file.

In [20]:
train_df = pd.read_excel("self_training_annotated.xlsx", sheet_name="Sheet1", usecols=[5, 8, 9, 14, 16])
training_sample = train_df.groupby('language', group_keys=False).sample(n=5)
training_sample.columns

Index(['language', 'original_string', 'original_docstring', 'short_docstring',
       'modified_short_docstring'],
      dtype='object')

In [21]:
llm_set:list[dict] = []

for instance in tqdm(training_sample.itertuples()):
    language = instance.language
    original_doc = instance.original_docstring
    original_code = instance.original_string
    positive = instance.modified_short_docstring
    negative = instance.short_docstring

 
    llm_set.append({
        "code": original_code,
        "original_doc": original_doc,
        "candidate_sentence": positive,
        "label": 1
    })

    llm_set.append({
        "code": original_code,
        "original_doc": original_doc,
        "candidate_sentence": negative,
        "label": 0
    })

0it [00:00, ?it/s]

In [22]:
train_df = pd.DataFrame(llm_set)
train_df = train_df.dropna()
train_df.head()

Unnamed: 0,code,original_doc,candidate_sentence,label
0,static int cpr_reduce_ceiling_voltage(struct c...,/*\n * Conditionally reduce the per-virtual-co...,/*Accepts two structs. Checks If ceiling array...,1
1,static int cpr_reduce_ceiling_voltage(struct c...,/*\n * Conditionally reduce the per-virtual-co...,Conditionally reduce the per-virtual-corner ce...,0
2,LABIRINTO* alocalabirinto(void){\n LABIRINT...,//aloca a quantidade de memoria necessaria par...,// Allocates the amount of memory needed for a...,1
3,LABIRINTO* alocalabirinto(void){\n LABIRINT...,//aloca a quantidade de memoria necessaria par...,aloca a quantidade de memoria necessaria para ...,0
4,eModuleError_t eGattCommsSend( eCommsChannel_t...,/**\n * @brief Transfering data to a remote de...,/*Accepts a eCommsChannel_t and a xUnifiedCom...,1


In [25]:
X = train_df.drop('label', axis=1)
y = train_df['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
def training_prompt(language, documentation, code, modified):
    prompt_gemma = \
    '''You are a helpful agent designed to help beginner programmers learn the fundamentals of programming.
    You will be provided with a block of {LANGUAGE} code and the existing doucmentation that accompanies it.
    You will also be provided a simplified version of the documentation. Using the provided code as context,
    your goal is to understand how the documentation is simplified while being contextually relevant to the 
    original code. Output absolutely nothing. If you feel that the simplified documentation is simple and contextually 
    relevant learn from the example. If you do feel  the simplified documentaiton is not simple or not contextually relevant
    forget about the example. Here is the original documentation and code:\n 
    Code:\n{CODE}\n\nDocumentation:\n{DOCUMENTATION}\n\nModified documentation:\n{MODIFIED}'''.format(LANGUAGE=language, DOCUMENTATION=documentation, CODE=code, MODIFIED=modified)

In [None]:
for instance in tqdm(training_sample.itertuples()):
    language = instance.language
    original_doc = instance.original_docstring
    orignal_code = instance.original_string
    mod = instance.modified_short_docstring

    message = training_prompt(language, original_doc, orignal_code, mod)

    pipe_gemma(message, pad_token_id=pipe_gemma.tokenizer.eos_token_id, max_new_tokens=45)

0it [00:00, ?it/s]

In [None]:
gemma_semantic_similarities_trained = []
gemma_metrics_trained = evaluate.combine(['rouge', 'meteor'])


for instance in tqdm(dataset_sample.itertuples()):
    language = instance.language
    original_doc = instance.original_docstring
    orignal_code = instance.original_string

    message = llm_prompt(language, original_doc, orignal_code)

    #Tokenize Gemma raw
    gemma_inputs = tokenizer(message, return_tensors="pt").to(model.device)

    # Generate output
    with torch.no_grad():
        gemma_outputs = model.generate(**gemma_inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

    # Decode and print
    result = tokenizer.decode(gemma_outputs[0], skip_special_tokens=True)

    #result = result.replace(message, "")

    embedding_original = eval_model.encode(instance.original_docstring, convert_to_tensor=True)
    embedding_predicted = eval_model.encode(result, convert_to_tensor=True)

    gemma_semantic_similarities_trained.append(util.pytorch_cos_sim(embedding_original, embedding_predicted).item())
    gemma_metrics_trained.add(predictions=result, references=instance.original_docstring)

[nltk_data] Downloading package wordnet to /home/adeniji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/adeniji/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/adeniji/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0it [00:00, ?it/s]

# Summary statistics results
Semantic Similarity

In [19]:
print("Trained Gemma 3: \n")
sims_trained = pd.DataFrame(gemma_semantic_similarities_trained)
sims_trained.to_excel('results/Semantic_Similarities_Gemma_trained.xlsx')
sims_trained.describe()

Trained Gemma 3: 



Unnamed: 0,0
count,200.0
mean,0.504685
std,0.141379
min,0.051231
25%,0.409741
50%,0.516998
75%,0.609724
max,0.840501


ROUGE AND METEOR

In [20]:
with open('results/rouge_meteor_gemma_trained.json', 'w') as file:
    mr = gemma_metrics_trained.compute()
    json.dump(mr, file, indent=4)
