In [2]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
import torch

model_path = "bigcode/starcoder2-3b"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map="auto")


In [6]:
from peft import LoraConfig

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


In [7]:
pip install datasets




In [12]:
dataset = load_dataset("code_search_net", "python", split="train")

In [13]:
dataset[0]  # show the first example


{'repository_name': 'ArabellaTech/django-basic-cms',
 'func_path_in_repository': 'basic_cms/templatetags/pages_tags.py',
 'func_name': 'show_slug_with_level',
 'whole_func_string': 'def show_slug_with_level(context, page, lang=None, fallback=True):\n    """Display slug with level by language."""\n    if not lang:\n        lang = context.get(\'lang\', pages_settings.PAGE_DEFAULT_LANGUAGE)\n\n    page = get_page_from_string_or_id(page, lang)\n    if not page:\n        return \'\'\n\n    return {\'content\': page.slug_with_level(lang)}',
 'language': 'python',
 'func_code_string': 'def show_slug_with_level(context, page, lang=None, fallback=True):\n    """Display slug with level by language."""\n    if not lang:\n        lang = context.get(\'lang\', pages_settings.PAGE_DEFAULT_LANGUAGE)\n\n    page = get_page_from_string_or_id(page, lang)\n    if not page:\n        return \'\'\n\n    return {\'content\': page.slug_with_level(lang)}',
 'func_code_tokens': ['def',
  'show_slug_with_level',


In [15]:
from datasets import load_dataset, DatasetDict
dataset = dataset.shuffle(seed=42)
split_dataset = dataset.train_test_split(test_size=0.2)

# Optional: split the test set again into val + test
split_dataset = DatasetDict({
    "train": split_dataset["train"],
    "val": split_dataset["test"].train_test_split(test_size=0.5, seed=42)["train"],
    "test": split_dataset["test"].train_test_split(test_size=0.5, seed=42)["test"]
})

In [16]:
split_dataset["train"]  # 80%
split_dataset["val"]    # 10%
split_dataset["test"]   # 10%


Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 41218
})

In [21]:
import random

def format_multitask(example):
    code = example.get("func_code_string")
    doc = example.get("func_documentation_string")

    if not code or not doc:
        return {"text": ""}  # skip incomplete samples

    first_line = code.splitlines()[0] if code else ""

    tasks = [
        f"### Instruction:\nWrite a function for this description:\n{doc}\n\n### Response:\n{code}",
        f"### Instruction:\nExplain what this function does:\n{code}\n\n### Response:\n{doc}",
        f"### Instruction:\nComplete the following function:\n{first_line}\n\n### Response:\n{code}",
        f"### Instruction:\nImprove the readability of this function:\n{code}\n\n### Response:\n{code}"
    ]

    return {"text": random.choice(tasks)}


In [22]:
print(split_dataset["train"].column_names)


['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']


In [23]:
formatted_dataset = split_dataset.map(format_multitask)


Map:   0%|          | 0/329742 [00:00<?, ? examples/s]

Map:   0%|          | 0/41218 [00:00<?, ? examples/s]

Map:   0%|          | 0/41218 [00:00<?, ? examples/s]

In [24]:
from datasets import DatasetDict

formatted_dataset = DatasetDict({
    split: formatted_dataset[split].remove_columns(
        [col for col in formatted_dataset[split].column_names if col != "text"]
    )
    for split in formatted_dataset
})


In [25]:
print(formatted_dataset["train"][0]["text"])


### Instruction:
Write a function for this description:
set general brightness in range 0...1

### Response:
def set_brightness(self, brightness):
        """set general brightness in range 0...1"""
        brightness = min([1.0, max([brightness, 0.0])]) # enforces range 0 ... 1
        self.state.brightness = brightness
        self._repeat_last_frame()
        sequence_number = self.zmq_publisher.publish_brightness(brightness)
        logging.debug("Set brightness to {brightPercent:05.1f}%".format(brightPercent=brightness*100))
        return (True, sequence_number, "OK")


In [27]:
from transformers import AutoTokenizer

model_path = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token


def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=1024  # or 2048 if you're feeling adventurous
    )

tokenized_dataset = formatted_dataset.map(tokenize, batched=True)


Map:   0%|          | 0/329742 [00:00<?, ? examples/s]

Map:   0%|          | 0/41218 [00:00<?, ? examples/s]

Map:   0%|          | 0/41218 [00:00<?, ? examples/s]

In [28]:
print(tokenizer.pad_token, tokenizer.pad_token_id)


<|endoftext|> 0


In [29]:
small_tokenized_dataset = DatasetDict({
    split: tokenized_dataset[split].select(range(min(len(tokenized_dataset[split]), size)))
    for split, size in {"train": 1000, "val": 200, "test": 200}.items()
})


In [31]:
print(tokenizer.pad_token, tokenizer.pad_token_id)

<|endoftext|> 0


In [32]:
print(small_tokenized_dataset["train"][0]["input_ids"])


[1502, 21052, 63, 222, 2553, 331, 686, 456, 477, 3066, 63, 222, 489, 8108, 29331, 347, 2189, 244, 53, 1198, 54, 222, 222, 1502, 5178, 63, 222, 610, 758, 100, 29057, 45, 803, 49, 29331, 731, 310, 1547, 489, 8108, 29331, 347, 2189, 244, 53, 1198, 54, 3012, 310, 29331, 299, 1865, 2034, 54, 51, 53, 49, 1788, 2034, 29057, 49, 244, 53, 51, 53, 1156, 1156, 607, 949, 39186, 2189, 244, 53, 2437, 244, 54, 310, 649, 51, 1311, 51, 29057, 299, 29331, 310, 649, 1132, 8116, 100, 2153, 100, 1763, 365, 310, 6645, 100, 2188, 299, 649, 51, 47001, 100, 15076, 51, 7226, 100, 29057, 45, 29057, 46, 310, 5751, 51, 2824, 459, 903, 29331, 391, 320, 28838, 12270, 63, 53, 58, 51, 54, 107, 16242, 2316, 1664, 45, 28838, 12270, 66, 29057, 47, 54, 53, 53, 509, 310, 461, 327, 1844, 49, 6645, 100, 2188, 49, 332, 2966, 678, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [33]:
pip install --upgrade transformers




In [34]:
import transformers
print(transformers.__version__)  # should be >= 4.40


4.52.4


In [49]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./codex-finetune",
    label_names=["input_ids"],
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    logging_dir="./logs",
    save_steps=100,
    save_total_limit=2,
    fp16=True,
    gradient_checkpointing=True,
    learning_rate=2e-4,
)


In [50]:
from transformers import Trainer, DataCollatorForLanguageModeling

# Collator for Causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Not masked LM, it's causal
)


# Boom. Here's the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_tokenized_dataset["train"],
    eval_dataset=small_tokenized_dataset["val"],
    data_collator=data_collator,

)


In [51]:
model.gradient_checkpointing_enable()


In [53]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [54]:
trainer.train()


Step,Training Loss
500,1.7007
1000,1.3051
1500,1.2345


Step,Training Loss
500,1.7007
1000,1.3051
1500,1.2345
2000,1.2294
2500,1.1852
3000,1.2034


TrainOutput(global_step=3000, training_loss=1.3097034505208334, metrics={'train_runtime': 4274.3363, 'train_samples_per_second': 0.702, 'train_steps_per_second': 0.702, 'total_flos': 5.3114566606848e+16, 'train_loss': 1.3097034505208334, 'epoch': 3.0})

In [55]:
trainer.save_model("codex-finetune-final")
tokenizer.save_pretrained("codex-finetune-final")


('codex-finetune-final/tokenizer_config.json',
 'codex-finetune-final/special_tokens_map.json',
 'codex-finetune-final/vocab.json',
 'codex-finetune-final/merges.txt',
 'codex-finetune-final/added_tokens.json',
 'codex-finetune-final/tokenizer.json')

In [57]:
trainer.push_to_hub("khushimalik53/coding_copilot")
tokenizer.push_to_hub("khushimalik53/coding_copilot")


Uploading...:   0%|          | 0.00/9.12M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/khushimalik53/coding_copilot/commit/f5978c6f8c8790e76ce1fdcaa843f83c51f1bb83', commit_message='Upload tokenizer', commit_description='', oid='f5978c6f8c8790e76ce1fdcaa843f83c51f1bb83', pr_url=None, repo_url=RepoUrl('https://huggingface.co/khushimalik53/coding_copilot', endpoint='https://huggingface.co', repo_type='model', repo_id='khushimalik53/coding_copilot'), pr_revision=None, pr_num=None)

In [None]:
metrics = trainer.evaluate(eval_dataset=formatted_dataset["test"])
print(metrics)

In [None]:
input_text = "### Instruction:\nExplain what this function does:\ndef add(x, y):\n    return x + y\n\n### Response:\n"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
