In [1]:
from datasets import load_dataset, Dataset, DatasetDict
import re
from copy import deepcopy
from tqdm.notebook import tqdm
from radon.metrics import mi_visit

# Ignore SyntaxWarnings that can occur during dynamic code evaluation
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)

In [2]:
# Matches a single-line ``def name(...)`` header. ``.*`` is greedy, so the match
# runs up to the last ``)`` on that line; the trailing ``:`` is not captured.
# ``\b`` keeps identifiers that merely end in "def" from being matched.
_DEF_SIGNATURE_RE = re.compile(r"\bdef\s+\w+\(.*\)\s*")


def add_function_signature(custom_dataset):
    """Append the main function's signature to every example's prompt text.

    The hint tells the model which function name and parameters it is expected
    to define. The "main" function is taken to be the last ``def`` that starts
    at the beginning of a line (i.e. a top-level definition), so helpers nested
    inside another function are not mistaken for it. If the code has no
    top-level ``def`` at all, the last ``def`` found anywhere is used instead.

    Args:
        custom_dataset: Object exposing ``to_list()`` that yields one dict per
            example, each with a ``'code'`` and a ``'text'`` entry.

    Returns:
        A new dataset built with ``Dataset.from_list``. Examples in which a
        signature was found have the hint appended to ``'text'``; examples
        without any ``def`` are passed through unchanged.
    """
    augmented_rows = []

    for example in custom_dataset.to_list():
        code = example["code"]
        matches = list(_DEF_SIGNATURE_RE.finditer(code))

        # A match is top-level when it sits at column 0 of its line.
        top_level = [
            m for m in matches
            if m.start() == 0 or code[m.start() - 1] in "\r\n"
        ]
        candidates = top_level or matches

        if candidates:  # Expected to hold for every example.
            function_signature = candidates[-1].group(0).strip()
            additional_text = f"\nThe main function is defined by the function signature: {function_signature}\n"
            # Build a fresh dict instead of mutating the row in place, so the
            # caller's data is never modified and no deepcopy is required.
            example = {**example, "text": example["text"] + additional_text}

        augmented_rows.append(example)

    return Dataset.from_list(augmented_rows)

In [3]:
# Load the MBPP dataset
mbpp = load_dataset("mbpp")

# Keep only training examples whose code does not use the `class` keyword, so
# the remaining solutions are plain functions without unnecessary complexity.
# The check uses word boundaries: a bare substring test would also discard code
# that merely contains "class" inside another identifier (e.g. `classify`).
# The substring test this replaces removed just 2 train examples, so at most 2
# are dropped here - a negligible change.
filtered_data = [
    example
    for example in mbpp["train"]
    if re.search(r"\bclass\b", example["code"]) is None
]
custom_dataset = Dataset.from_list(filtered_data)

# Append the main-function signature hint to every prompt
added_signature_dataset = add_function_signature(custom_dataset)

# Persist the preprocessed train split to disk
added_signature_dataset.save_to_disk("mbpp_preprocessed_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/372 [00:00<?, ? examples/s]

In [4]:
# Prepare the MBPP test split: append the function-signature hint to every
# prompt and write the result to disk. No 'class' filtering is applied to this
# split, so every test example is kept.
mbpp_test = mbpp["test"]
added_signature_test_dataset = add_function_signature(mbpp_test)
added_signature_test_dataset.save_to_disk("mbpp_test_with_signatures")

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]