In [1]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
# # !pip install werkzeug>=1.0.1
# # !pip install markdown>=2.6.8
# # !pip install trl
# # !pip install tf-keras
# # !pip install wandb
# !pip install -U peft
# !pip install -U datasets
# !pip install -U bitsandbytes
# !pip install -U transformers
# !pip install -U accelerate

# for google colab
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import os

from huggingface_hub import login

# Never commit an access token inside the notebook: read it from the HF_TOKEN
# environment variable instead. An unset or empty variable becomes None, in
# which case `login` falls back to prompting for the token interactively.
hf_token = os.environ.get("HF_TOKEN") or None
login(token=hf_token)

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\jc\.cache\huggingface\token
Login successful


In [3]:
import os
import sys
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig
)




In [4]:
# Abort early when PyTorch cannot see a CUDA device; otherwise release any
# cached GPU memory before the model is loaded.
if not torch.cuda.is_available():
    raise Exception('GPU not detected')
torch.cuda.empty_cache()

In [5]:
# Turn off the Weights & Biases integration for this process.
os.environ.update({"WANDB_DISABLED": "true"})

# Hugging Face Hub id of the checkpoint that gets fine-tuned.
BASE_MODEL = "meta-llama/Meta-Llama-3-8B"

# Folder that receives checkpoints and the final adapter.
# (Google Colab alternative: "/content/drive/MyDrive/TFM/fine-tuning")
OUTPUT_DIR = "./fine-tuning"


In [6]:
# Tokenizer for the base checkpoint, configured for EOS insertion and left padding.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.add_eos_token = True
# NOTE(review): id 0 is used as the padding id — confirm it is not a meaningful
# token in this checkpoint's vocabulary.
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# NOTE(review): torch_dtype is float16 while the quantization compute dtype
# above is bfloat16 — confirm that this mismatch is intentional.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

Loading checkpoint shards: 100%|██████████| 4/4 [01:24<00:00, 21.18s/it]


In [7]:
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")

# Split exactly ONCE. Calling train_test_split() separately for the "train" and
# the "test" part shuffles the data independently each time, so the two subsets
# would come from different partitions and could share examples (eval leakage).
# The fixed seed makes the 80/20 partition reproducible across kernel restarts.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

In [8]:
def tokenize(prompt, max_length=512, add_eos_token=True):
    """Tokenize one prompt for causal-LM fine-tuning.

    Args:
        prompt: Text to encode with the module-level ``tokenizer``.
        max_length: Maximum number of tokens kept; longer prompts are truncated.
        add_eos_token: When true, make sure a single-string prompt ends with the
            tokenizer's EOS id whenever it was not truncated to ``max_length``.

    Returns:
        The tokenizer output (unpadded, plain Python lists) with an extra
        ``"labels"`` entry holding a copy of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    input_ids = result["input_ids"]
    eos_id = tokenizer.eos_token_id

    # Setting `tokenizer.add_eos_token` is not guaranteed to be honoured by
    # every tokenizer implementation, so terminate the sample explicitly.
    # Without an EOS target the model gets no signal for when to stop.
    # Skipped when EOS is already present, when the sequence already fills
    # `max_length`, or when `prompt` is not a single string (batched input
    # yields nested lists, which must not receive a stray int).
    if (
        add_eos_token
        and isinstance(prompt, str)
        and eos_id is not None
        and len(input_ids) < max_length
        and (not input_ids or input_ids[-1] != eos_id)
    ):
        input_ids.append(eos_id)
        if "attention_mask" in result:
            result["attention_mask"].append(1)

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = input_ids.copy()

    return result

In [9]:
def generate_and_tokenize_prompt(data_point):
    """Render one dataset record into the training prompt and tokenize it.

    Args:
        data_point: Mapping that provides the ``"instruction"`` and
            ``"output"`` fields of a single example.

    Returns:
        Whatever ``tokenize`` returns for the rendered prompt.
    """
    question, answer = data_point["instruction"], data_point["output"]

    # The leading and trailing empty entries reproduce the newline that opens
    # and the newline that closes the prompt text.
    prompt_lines = [
        "",
        "You are a powerful text-to-Python model.",
        "Your job is to answer questions about a Python.",
        "You are given a question regarding Python code.",
        "You must output the Python code that answers the question tabulated correctly.",
        "Only one response was allowed.",
        "### Input:",
        f"{question}",
        "",
        "### Response:",
        f"{answer}",
        "",
    ]
    return tokenize("\n".join(prompt_lines))

In [10]:
# Apply the prompt builder / tokenizer to every example of both splits
# (training split first, then the evaluation split).
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
)

Map: 100%|██████████| 14889/14889 [00:06<00:00, 2462.44 examples/s]
Map: 100%|██████████| 3723/3723 [00:01<00:00, 2446.12 examples/s]


In [11]:
# Switch to training mode, then let PEFT prepare the quantized model for training.
model.train()
model = prepare_model_for_kbit_training(model)

# LoRA adapters (rank 16, alpha 16, 5% dropout) on the four attention
# projection layers; bias terms are left untouched.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Wrap the base model with the adapters and turn off the generation cache.
model = get_peft_model(model, config)
model.config.use_cache = False

In [12]:
# With several GPUs visible, flag the model as already parallelised so that
# Trainer does not try its own DataParallelism on top of it.
if torch.cuda.device_count() > 1:
    model.is_parallelizable = model.model_parallel = True

In [13]:
# Target number of samples per optimizer step, reached by accumulating
# gradients over several smaller per-device batches.
batch_size = 128
per_device_train_batch_size = 8  # increase when GPU RAM is higher
gradient_accumulation_steps = batch_size // per_device_train_batch_size

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # --- batching ---
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,  # batch sequences of similar length together to speed up training
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=3e-4,
    warmup_steps=100,
    max_steps=400,
    fp16=True,
    # --- logging / evaluation / checkpoint cadence (in optimizer steps) ---
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    load_best_model_at_end=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    # Pads each batch dynamically, to a length that is a multiple of 8.
    data_collator=DataCollatorForSeq2Seq(
        tokenizer,
        padding=True,
        pad_to_multiple_of=8,
        return_tensors="pt",
    ),
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [14]:
# Compile the model on PyTorch 2+ everywhere except Windows.
# NOTE(review): `trainer` was constructed from the previous `model` object, so
# rebinding the name here does not change what `trainer.train()` runs — confirm
# whether compilation is actually meant to apply to training.
if sys.platform != "win32" and torch.__version__ >= "2":
    print("compiling the model")
    model = torch.compile(model)

In [None]:
# Start fine-tuning with the Trainer configured earlier in the notebook.
trainer.train()
# Alternative for continuing an interrupted run from a saved checkpoint:
# trainer.train(resume_from_checkpoint=True)

  return fn(*args, **kwargs)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


In [None]:
# Save the fine-tuned model and tokenizer.
# Both are written to the same "fine_tuned_llama" sub-folder of OUTPUT_DIR so
# that they can be reloaded together from one path.
model.save_pretrained(f"{OUTPUT_DIR}/fine_tuned_llama")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/fine_tuned_llama")