In [1]:
# Part A of the script

# Load the dataset.
# Load and prepare the model and tokenizer.
# Define the generate_code helper.
# Save the dataset, model, tokenizer, and generate_code helper for later use.

In [2]:
pip install transformers datasets

Collecting transformers
  Downloading transformers-4.42.4-py3-none-any.whl (9.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.3/9.3 MB 167.2 MB/s eta 0:00:00
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 547.8/547.8 kB 330.6 MB/s eta 0:00:00
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 323.1 MB/s eta 0:00:00
Collecting regex!=2019.12.17
  Downloading regex-2024.5.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (774 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 774.6/774.6 kB 323.1 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2
  Downloading huggingface_hub

In [11]:
import torch
# Probe CUDA once and reuse the answer for both the report and the device choice.
cuda_available = torch.cuda.is_available()
if not cuda_available:
    print("CUDA is not available.")
else:
    print("CUDA is available. GPU:", torch.cuda.get_device_name(0))

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if cuda_available else torch.device('cpu')
print(f"Using device: {device}")

CUDA is available. GPU: Tesla T4
Using device: cuda


In [12]:
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, concatenate_datasets

In [5]:
# Load the HumanEval-X dataset for every language configuration used here.
print("Loading the HumanEval-X datasets...")
(
    humaneval_x_dataset_py,
    humaneval_x_dataset_cpp,
    humaneval_x_dataset_go,
    humaneval_x_dataset_java,
    humaneval_x_dataset_js,
) = (
    load_dataset("THUDM/humaneval-x", config_name)
    for config_name in ("python", "cpp", "go", "java", "js")
)
# Not loaded: the "rust" configuration.

# Combine the "test" split of each language, in the same order as loaded above.
print("Combining dataset...")
combined_dataset = concatenate_datasets([
    language_dataset["test"]
    for language_dataset in (
        humaneval_x_dataset_py,
        humaneval_x_dataset_cpp,
        humaneval_x_dataset_go,
        humaneval_x_dataset_java,
        humaneval_x_dataset_js,
    )
])

Loading the HumanEval-X datasets...
Combining dataset...


In [39]:
# Load the HumanEval-XL dataset for every programming language listed below.
print("Loading the HumanEval-XL datasets...")

programming_languages = [
    "python", "go", "java", "javascript", "scala", "csharp",
    "kotlin", "php", "perl", "ruby", "swift", "typescript",
]
# 23 natural languages x 80 rows each = 1840 rows per programming language.

# One entry per programming language: every split returned by load_dataset,
# concatenated into a single dataset.
datasets = [
    concatenate_datasets(
        list(load_dataset("FloatAI/HumanEval-XL", language_config).values())
    )
    for language_config in programming_languages
]

# Merge the per-language datasets into one and show its summary.
combined_dataset = concatenate_datasets(datasets)
print(combined_dataset)

Loading the HumanEval-XL datasets...
Dataset({
    features: ['task_id', 'language', 'prompt', 'description', 'test', 'entry_point', 'canonical_solution', 'natural_language'],
    num_rows: 22080
})


In [32]:
# Persist the combined dataset to "combined_dataset.pt" via torch.save.
# NOTE(review): torch.save pickles the object, so whoever reads this file back
# has to unpickle a full Python object rather than plain tensors - confirm the
# loading side is set up for that.
print("Saving combined dataset...")
torch.save(obj=combined_dataset, f="combined_dataset.pt")


Saving combined dataset...


In [33]:
# Checkpoint identifier passed to both from_pretrained calls.
model_name = "Salesforce/codegen-350M-mono"

# Tokenizer first, then the causal LM, which is moved onto `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [34]:
# Write the tokenizer and then the model to the "tokenizer" and "model" paths.
print("Saving model and tokenizer...")
for artifact, target_dir in ((tokenizer, "tokenizer"), (model, "model")):
    artifact.save_pretrained(target_dir)

Saving model and tokenizer...


In [35]:
import re

def generate_code(prompt, model, tokenizer, device, max_new_tokens=200):
    """Generate text for ``prompt`` and trim it down to the first function.

    The prompt is tokenized, moved to ``device`` and passed to
    ``model.generate``; the first returned sequence is decoded with special
    tokens skipped. If the decoded text contains ``"def "``, everything before
    its first occurrence is dropped and only that function is returned.
    Otherwise the decoded text is returned unchanged.

    The function is considered finished at the first of:

    * a later line that, once stripped, starts with ``"def "``; or
    * a non-blank line with no leading whitespace, i.e. code that has
      dedented back to the top level.

    Lines belonging to a signature whose brackets are still open are always
    kept, so a closing ``):`` written at column 0 does not cut the function
    short. Blank lines inside (or directly after) the body are kept.

    Everything is kept inside this one function, with no module-level helpers,
    because the function object is serialized with dill later in the notebook.

    Args:
        prompt: Text given to the tokenizer.
        model: Object exposing ``generate(**inputs, max_new_tokens=...,
            num_return_sequences=...)``.
        tokenizer: Callable returning an object with ``.to(device)``, and
            exposing ``decode(sequence, skip_special_tokens=True)``.
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        str: The extracted function source, or the full decoded text when it
        contains no ``"def "``.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, num_return_sequences=1)
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "def " not in generated_code:
        return generated_code

    # Drop everything before the first "def "; the kept text therefore always
    # starts with the def keyword at column 0.
    main_function = "def " + generated_code.split("def ", 1)[1]

    function_lines = []
    open_brackets = 0
    header_done = False

    for line in main_function.split('\n'):
        if not header_done:
            # Still inside the (possibly multi-line) signature: keep the line
            # and track bracket nesting until the signature's brackets close.
            open_brackets += sum(line.count(ch) for ch in "([{")
            open_brackets -= sum(line.count(ch) for ch in ")]}")
            function_lines.append(line)
            header_done = open_brackets <= 0
            continue

        stripped_line = line.strip()
        if stripped_line.startswith("def "):
            break  # New function starts, stop collecting lines
        if stripped_line and not line[0].isspace():
            # A non-blank line at column 0 is no longer part of the body.
            # (The previous check compared against zero spaces of indentation,
            # which matched every line and so never ended the function.)
            break
        function_lines.append(line)

    return "\n".join(function_lines)


In [36]:
# Persist the generate_code function to "generate_code.pkl" using dill.
import dill

with open("generate_code.pkl", mode="wb") as pickle_file:
    dill.dump(generate_code, pickle_file)