In [7]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


In [8]:
pip install datasets



In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

In [25]:
def prepare_pretrained_model(model_name="Salesforce/codegen-350M-mono"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    return tokenizer, model


def preprocess(example, tokenizer):
    # causal masking internally blocks to know the information of output with the algorithm model can see tokens up to the current token
    prompt = f"Generate Python code: {example['func_documentation_string']}"
    code = example["func_code_string"]

    # tokenize, with ratio 150:874 for prompt, code respectively

    prompt_inputs = tokenizer(prompt, truncation=True, max_length=150, padding="max_length")
    code_inputs = tokenizer(code, truncation=True, max_length=874, padding="max_length")

    inputs = {
        "input_ids": prompt_inputs["input_ids"] + code_inputs["input_ids"],
        "attention_mask": prompt_inputs["attention_mask"] + code_inputs["attention_mask"]
    }

    # Ensure the total length does not exceed 1024
    inputs["input_ids"] = inputs["input_ids"][:1024]
    inputs["attention_mask"] = inputs["attention_mask"][:1024]

    # Pad to 1024 if necessary
    padding_length = 1024 - len(inputs["input_ids"])
    inputs["input_ids"] += [tokenizer.pad_token_id] * padding_length
    inputs["attention_mask"] += [0] * padding_length

    labels = inputs["input_ids"].copy()  # ~1024 tokens

    # mask the prompt to only generate the coding part
    prompt_len = len(prompt_inputs.input_ids)  # ~150
    labels[:prompt_len] = [-100]*prompt_len
    inputs["labels"] = labels

    return inputs


def prepare_dataset(model_name="Salesforce/codegen-350M-mono"):
    dataset = load_dataset("code_search_net", "python", trust_remote_code=True)
    tokenizer, model = prepare_pretrained_model(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    train = dataset["train"].shuffle(seed=42).select(range(100000)).map(preprocess, fn_kwargs={"tokenizer": tokenizer}, remove_columns=dataset["train"].column_names)
    valid = dataset["validation"].shuffle(seed=42).select(range(20000)).map(preprocess, fn_kwargs={"tokenizer": tokenizer}, remove_columns=dataset["validation"].column_names)

    return tokenizer, model, train, valid


def fine_tune_pretrained_model(tokenizer, model, train, valid):
    # tokenizer, model = prepare_pretrained_model(model_name)
    # tokenizer.pad_token = tokenizer.eos_token
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # train = dataset["train"].map(preprocess, fn_kwargs={"tokenizer": tokenizer}, remove_columns=dataset["train"].column_names)
    # valid = dataset["validation"].map(preprocess, fn_kwargs={"tokenizer": tokenizer}, remove_columns=dataset["validation"].column_names)

    training_args = TrainingArguments(
        output_dir="./codegen-finetuned",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=2,
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=1,
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=100,
        fp16=torch.cuda.is_available(),
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid,
        tokenizer=tokenizer
    )

    # train
    trainer.train()

    # save
    trainer.save_model("./codegen-finetuned")
    tokenizer.save_pretrained("./codegen-finetuned")


In [26]:
tokenizer, model, train, valid = prepare_dataset()

Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [27]:
fine_tune_pretrained_model(tokenizer, model, train, valid)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.2887,0.315531
2,0.1965,0.316068
