In [1]:
!pip install 

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
from tree_sitter import Language

from language_data import LANGUAGE_METADATA
from process import DataProcessor

In [3]:
# Key used below to pick the tree-sitter grammar and to index LANGUAGE_METADATA.
language = 'java'

In [4]:
# Load the grammar for `language` from the prebuilt shared library and set it on the
# parser stored on the DataProcessor class.
DataProcessor.PARSER.set_language(Language('../src/build/py-tree-sitter-languages.so', language))

# Create the processor for this language, handing it the 'language_parser' entry from the
# metadata table.
# NOTE(review): what that entry contains is defined in language_data, not visible here.
processor = DataProcessor(language=language,
                          language_parser=LANGUAGE_METADATA[language]['language_parser'])

In [5]:
# Run the processor on "JetBrains/intellij-community", passing the 'ext' entry configured
# for this language.
# NOTE(review): presumably returns one dict per function definition (later cells read the
# "identifier" and "function" keys) — confirm against DataProcessor.process_dee.
definitions = processor.process_dee("JetBrains/intellij-community", ext=LANGUAGE_METADATA[language]['ext'])

In [6]:
# Sanity check: how many items `definitions` contains.
len(definitions)

205557

In [7]:
import re
# TODO: parsing the function with an AST would let us remove the name more precisely,
# but there was no time for that.
# TODO: filtering out @Override-annotated methods would probably also be a good idea.
def obfuscate_function_java(source_code, old_name, new_name=""):
    """Replace every whole-identifier occurrence of ``old_name`` in ``source_code``.

    Only standalone identifiers are replaced: with ``old_name="get"`` the call
    ``get()`` is rewritten but ``getName()`` is left alone. ``old_name`` is
    matched literally (regex metacharacters such as ``$`` are escaped) and
    ``new_name`` is inserted literally (backslashes are not treated as
    group references).

    Args:
        source_code: Java source text of the function.
        old_name: Identifier to hide. An empty name leaves the source unchanged.
        new_name: Replacement text; defaults to removing the name entirely.

    Returns:
        The source text with the identifier replaced.
    """
    if not old_name:
        # An empty pattern would match between every character.
        return source_code
    # Java identifiers may contain letters, digits, '_' and '$', so treat all of
    # those as "part of a longer identifier" on either side of the match.
    pattern = rf"(?<![\w$]){re.escape(old_name)}(?![\w$])"
    return re.sub(pattern, lambda _match: new_name, source_code)

def non_class_name(name):
    """Return the part of ``name`` after its last '.', or all of ``name`` if it has none."""
    _, _, tail = name.rpartition('.')
    return tail


In [8]:
from datasets import Dataset
# Wrap the list of definitions in a Dataset so the following cells can use map/filter/select.
dataset = Dataset.from_list(definitions)

In [9]:
def prepare(x):
    """Derive the training fields for a single example.

    Reads ``x["identifier"]`` and ``x["function"]`` and returns a dict with
    the bare function name ("function_name") and the function source in which
    that name has been replaced by "x" ("source_code").
    """
    bare_name = non_class_name(x["identifier"])
    masked_source = obfuscate_function_java(x["function"], bare_name, new_name="x")
    return {"function_name": bare_name, "source_code": masked_source}

# Apply prepare to every example, using 8 worker processes.
dataset = dataset.map(prepare, num_proc=8)

Map (num_proc=8):   0%|          | 0/205557 [00:00<?, ? examples/s]

In [10]:
# Leave some room for the prompt and the function name: we will be training with 1024 tokens.
# We could go higher, but with only a 3090 a shorter limit allows a larger batch size.
def remove_lengthy_examples(example, max_tokens=950):
    """Filter predicate: keep examples whose tokenized source is short enough.

    Args:
        example: Mapping with an "input_ids" sequence.
        max_tokens: Exclusive upper bound on ``len(example["input_ids"])``.
            Defaults to 950.

    Returns:
        True when the example has fewer than ``max_tokens`` input ids.
    """
    return len(example["input_ids"]) < max_tokens

from transformers import AutoTokenizer
# Load the Llama-2 chat tokenizer; example lengths are measured in its tokens.
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")
# Tokenize each example's source_code without padding; the filter below reads "input_ids".
dataset = dataset.map(lambda example: tokenizer(example["source_code"], padding=False))

# Drop the examples that remove_lengthy_examples rejects as too long, using 8 worker processes.
dataset = dataset.filter(remove_lengthy_examples, num_proc=8)

Map:   0%|          | 0/205557 [00:00<?, ? examples/s]

Filter (num_proc=8):   0%|          | 0/205557 [00:00<?, ? examples/s]

In [11]:
from datasets import Dataset

def remove_duplicates(dataset: Dataset) -> Dataset:
    """Drop examples whose "source_code" repeats, keeping the last occurrence of each.

    The surviving examples keep their original relative order.
    """
    # Later rows overwrite earlier ones, so every source maps to its last index.
    last_index_by_source = {
        row["source_code"]: position for position, row in enumerate(dataset)
    }

    # Select the kept rows in ascending index order.
    kept_positions = sorted(last_index_by_source.values())
    return dataset.select(kept_positions)


# Deduplicate the dataset on its source_code column.
dataset = remove_duplicates(dataset)

In [21]:
# Create the template for CodeLlama.
# We will use the sharegpt format in axolotl.
def add_messages(example):
    """Build the sharegpt-style conversation for one example.

    Args:
        example: Mapping with "source_code" and "function_name" entries.

    Returns:
        A dict with a single "conversations" key holding three turns:
        a fixed system prompt, the source code as the human turn, and the
        function name as the gpt turn.
    """
    # Plain string: the prompt has no placeholders, so no f-string is needed.
    system_prompt = "Given the source code of a java function, suggest a fitting name for the function."
    return {
        "conversations": [
            {"from": "system", "value": system_prompt},
            {"from": "human", "value": example["source_code"]},
            {"from": "gpt", "value": example["function_name"]},
        ]
    }

# Add the "conversations" field to every example, using 8 worker processes.
dataset = dataset.map(add_messages, num_proc=8)

Map (num_proc=8):   0%|          | 0/178628 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [13]:
# Now separate into train and test.
# The split is positional: the last 5000 examples become the test set and everything
# before them the train set; no shuffling is done here.
from datasets import DatasetDict
dataset = DatasetDict({
    "train": dataset.select(range(len(dataset)-5000)),
    "test": dataset.select(range(len(dataset)-5000, len(dataset)))
})


In [23]:
# Remove the unneeded columns: everything except "conversations", "source_code" and
# "function_name" (the column list is taken from the train split).
dataset = dataset.remove_columns(list(set(dataset["train"].column_names) - set(["conversations", "source_code", "function_name"])))

In [24]:
# You will need a Hugging Face (HF) token for this: it uploads the dataset to the Hub.
dataset.push_to_hub("hynky/jetbrains-community-function_name")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/179 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/427 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/hynky/jetbrains-community-function_name/commit/1a73a649e1d2fef328f3d66ec4c2ccedd5ca3d99', commit_message='Upload dataset', commit_description='', oid='1a73a649e1d2fef328f3d66ec4c2ccedd5ca3d99', pr_url=None, pr_revision=None, pr_num=None)