In [1]:
from datasets import load_dataset, Dataset
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm import tqdm
import evaluate
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from trl import SFTConfig, SFTTrainer, setup_chat_format
from accelerate import PartialState
from sklearn.model_selection import train_test_split

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Loading Dataset

Load annotated training sample. We will be using `original_docstring` for original documentation, `modified_short_docstring` for simplified documentation, and `original_string` for code.

In [3]:
df = pd.read_csv('annotations_labeled.csv')
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Fine-Tuning the Baseline model

Here we define the system prompt for the Llama 2 model.

In [4]:
prompt = \
'''You are a helpful agent designed to determine whether code documentation is simple enough for 
beginner programmers to understand. You will be provided with a block of code and the 
doucmentation that accompanies it. Using the provided code as context, output "1" if you believe
the documentation is simple enough for beginner programmers to understand or "0" if you belive it 
is too difficult for beginner programmers to understand. Output only a single digit "0" or "1" and 
absolutely nothing else. Here is the original documentation and code:'''

Fine tune Llama 2 model using the HuggingFace transformers library. Modified from the example here: https://colab.research.google.com/github/dvgodoy/FineTuningLLMs/blob/main/Chapter0.ipynb#scrollTo=f5ad7668

In [5]:
device_string = PartialState().process_index
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", device_map={'':device_string})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# model = prepare_model_for_kbit_training(model)

# config = LoraConfig(
#     r=8,                   # the rank of the adapter, the lower the fewer parameters you'll need to train
#     lora_alpha=16,         # multiplier, usually 2*r
#     bias="none",           # BEWARE: training biases *modifies* base model's behavior
#     lora_dropout=0.05,
#     task_type="CAUSAL_LM",
#     # Newer models, such as Phi-3 at time of writing, may require
#     # manually setting target modules
#     modules_to_save=["lm_head", "embed_tokens"],
#     target_modules="all-linear",
# )

# model = get_peft_model(model, config)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

In [28]:
# Adapted from trl.extras.dataset_formatting.instructions_formatting_function
# Converts dataset to the conversational format
dataset_train = Dataset.from_pandas(train_df)
def format_dataset(example):
    message = [
        # {"role": "system", "content": prompt},
        {"role": "user", "content": f"Documentation:\n{example['original_docstring']}\n\nCode:\n{example['original_string']}"},
        {"role": "assistant", "content": f"{example['label']}"}
    ]
    return {'messages': message}
# train_cols.remove('modified_short_docstring')
dataset_train = dataset_train.map(format_dataset).select_columns('messages')

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [9]:
dataset_train

Dataset({
    features: ['messages'],
    num_rows: 140
})

In [10]:
instance = dataset_train[0]['messages'][:-1]
inputs = tokenizer.apply_chat_template(instance, return_tensors='pt').to(model.device)
output = model.generate(inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
response = tokenizer.decode(output[0], skip_special_tokens=True)
response

'[INST] <<SYS>>\nYou are a helpful agent designed to determine whether code documentation is simple enough for \nbeginner programmers to understand. You will be provided with a block of code and the \ndoucmentation that accompanies it. Using the provided code as context, output "1" if you believe\nthe documentation is simple enough for beginner programmers to understand or "0" if you belive it \nis too difficult for beginner programmers to understand. Output only a single digit "0" or "1" and \nabsolutely nothing else. Here is the original documentation and code:\n<</SYS>>\n\nDocumentation:\n// Search Book with suggest typeahead.js\n\nCode:\npublic function fetch_book_suggest()\r\n    {\r\n        $query = $this->request->getVar(\'query\');\r\n\r\n        $result = $this->book_model->like(\'title\', $query)->where(\'status\', \'Available\')->get()->getResultArray();\r\n\r\n        if (count($result) > 0) {\r\n            foreach ($result as $row) {\r\n            $output[] = $row[\'tit

In [29]:
# Function to get length of each sample in tokens
def get_token_length(example):
    # tokens = tokenizer(example["text"], truncation=False)["input_ids"]
    # return {"length": len(tokens)}
    tokens = tokenizer.apply_chat_template(example["messages"])
    return {"length": len(tokens)}

# Map through dataset to compute token lengths
dataset_with_lengths = dataset_train.map(get_token_length)

# Find the maximum
max_tokens = max(dataset_with_lengths["length"])
print(f"Maximum number of tokens in the dataset: {max_tokens}")

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Maximum number of tokens in the dataset: 2296


In [37]:
sft_config = SFTConfig(
    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,
    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # Gradient Accumulation / Batch size
    # Actual batch (for updating) is same (1x) as micro-batch size
    gradient_accumulation_steps=1,
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=16,
    # If batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,
    
    ## GROUP 2: Dataset-related
    # max_length=2300,
    max_seq_length=2300,
    # Dataset
    # packing a dataset means no padding is needed
    packing=True,
    # label_names=["messages"],
    eos_token=tokenizer.eos_token,
    
    ## GROUP 3: These are typical training parameters
    num_train_epochs=10,
    learning_rate=1e-4,
    # Optimizer
    # 8-bit Adam optimizer - doesn't help much if you're using LoRA!
    optim='paged_adamw_8bit',
    
    ## GROUP 4: Logging parameters
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./baseline-finetuned',
    report_to='none'
)    

In [38]:
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=dataset_train,
    compute_metrics=None
)

Converting train dataset to ChatML:   0%|          | 0/140 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/140 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/140 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/140 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [39]:
trainer.train()

RuntimeError: The size of tensor a (3) must match the size of tensor b (23) at non-singleton dimension 0

Loading the evaluation model used for computing semantic similarity. Taken from example here: https://huggingface.co/tasks/sentence-similarity

In [None]:
eval_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Running inference on the dataset sample

In [None]:
semantic_similarities = []
metrics = evaluate.combine(['rouge', 'meteor'])

for instance in tqdm(dataset_sample.itertuples()):
    message = [
        {"role": "system", "content": prompt(instance.language)},
        {"role": "user", "content": f"Documentation:\n{instance.original_docstring}\n\nCode:\n{instance.original_string}"}
    ]

    result = pipe(message, pad_token_id=pipe.tokenizer.eos_token_id)[0]['generated_text'][-1]['content']

    embedding_original = eval_model.encode(instance.original_docstring, convert_to_tensor=True)
    embedding_predicted = eval_model.encode(result, convert_to_tensor=True)

    semantic_similarities.append(util.pytorch_cos_sim(embedding_original, embedding_predicted).item())
    metrics.add(predictions=result, references=instance.original_docstring)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/j/jwoods03/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
200it [26:54,  8.07s/it]


Summary statistics for semantic similarity results

In [None]:
sims = pd.DataFrame(semantic_similarities)
sims.to_excel('semantic_similarities_baseline.xlsx')
sims.describe()

Unnamed: 0,0
count,200.0
mean,0.663492
std,0.178473
min,-0.001194
25%,0.569443
50%,0.687993
75%,0.789031
max,0.946113


ROUGE and METEOR results

In [None]:
import json
with open('rouge_meteor_baseline.json', 'w') as file:
    mr = metrics.compute()
    json.dump(mr, file, indent=4)