- The `%%capture` cell magic suppresses the output of the cell in a Jupyter notebook.
- The `%pip` magic command installs Python packages from within a Jupyter notebook; here it installs accelerate, peft, bitsandbytes, tensorboard, trl, huggingface_hub, transformers (pinned to 4.38.2), and flash-attn.
- These packages are installed in the current Python environment running the Jupyter notebook.

In [1]:
%%capture
# Uncomment if you haven't these packages
%pip install --upgrade accelerate peft bitsandbytes tensorboard trl huggingface_hub
%pip install "transformers==4.38.2" # Bug occured in v4.39.1 - AttributeError: 'torch.dtype' object has no attribute 'itemsize'
%pip install flash-attn --no-build-isolation #Nvidia download guide - https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2

In [2]:
# Log in to the Hugging Face Hub; in the notebook this renders a login widget.
# Presumably required for the push_to_hub calls at the end of the notebook — confirm.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [4]:
def format_prompts_func(dataset):
    """Render every instruction/answer pair of a batch as one prompt string.

    Each pair becomes ``<s>[INST] {instruction} [/INST] {answer} </s>``.

    Args:
        dataset: Mapping with an ``'instruction'`` sequence and an
            ``'answer'`` sequence that is indexed position by position.

    Returns:
        list: The formatted prompt strings, in the order of the instructions.
    """
    instructions = dataset['instruction']
    answers = dataset['answer']
    # Answers are looked up by position so a too-short answer list still fails loudly.
    return [
        f"<s>[INST] {instruction} [/INST] {answers[position]} </s>"
        for position, instruction in enumerate(instructions)
    ]

In [5]:
# Model from Hugging Face hub
base_model = "mistralai/Mixtral-8x7B-Instruct-v0.1" #"mistralai/Mistral-7B-Instruct-v0.2"
# Fine-tuned model
new_model = "mixtral-8x7B-dockerfile-generation" #"mixtral-7Bv0.2-dockerfile-generation"
# Load the dataset and split it 80/20 into a training and an evaluation set.
# The split is seeded so it is identical after a kernel restart; an unseeded
# split would hand a different training subset to a run resumed from a checkpoint.
# (Earlier local variant: load_dataset('json', data_files='dataset.jsonl', split='train').shuffle(seed=42))
tmp = load_dataset(
    'mesolitica/mixtral-magicoder',
    data_files='data/dockerfile-00000-of-00001.jsonl',
    split='train',
).train_test_split(test_size=0.2, seed=42)
dataset = tmp['train']
dataset_eval = tmp['test']

In [6]:
# Quantization settings used when the base model is loaded.
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit
    bnb_4bit_compute_dtype=compute_dtype,  # dtype used for computation
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [7]:
# Load the base model with the quantization config, FlashAttention-2 attention
# and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    attn_implementation="flash_attention_2",
    device_map="auto",
    trust_remote_code=True,
)
# Config tweaks applied before training.
model.config.pretraining_tp = 1
model.config.use_cache = False
# Original note: added because of a warning about gradients during generation.
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [8]:
# Load the tokenizer from Hugging Face and set padding_side to "right" to fix the issue with fp16.
# `device_map` is a model-loading option and has no meaning for a tokenizer, so it is not passed.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Reuse the EOS token as the padding token.
# NOTE(review): with pad == eos, a collator that masks padding may also mask the
# real end-of-sequence token — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

List of hyperparameters that can be used to optimize the training process:

- **output_dir**: The output directory is where the model predictions and checkpoints will be stored.
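- **num_train_epochs**: Number of training epochs (one epoch is used here).
- **fp16/bf16**: Enable or disable fp16/bf16 mixed-precision training (fp16 is enabled and bf16 is disabled here).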
- **per_device_train_batch_size**: Batch size per GPU for training.
- **per_device_eval_batch_size**: Batch size per GPU for evaluation.
- **gradient_accumulation_steps**: This refers to the number of steps required to accumulate the gradients during the update process.
- **gradient_checkpointing**: Enabling gradient checkpointing.
- **max_grad_norm**: Gradient clipping.
- **learning_rate**: Initial learning rate.
- **weight_decay**: Weight decay is applied to all layers except bias/LayerNorm weights.
- **optim**: Model optimizer (the paged 8-bit AdamW optimizer, `paged_adamw_8bit`, is used here).
- **lr_scheduler_type**: Learning rate schedule.
- **max_steps**: Number of training steps.
- **warmup_ratio**: Ratio of steps for a linear warmup.
- **group_by_length**: This can significantly improve performance and accelerate the training process.
- **save_steps**: Save a checkpoint every N update steps (500 here).
- **logging_steps**: Log every N update steps (250 here).

In [9]:
output_dir = "./results_test_mixstral_70b"
# The trainer is only built when no fine-tuned model directory exists yet.
# NOTE(review): when `new_model` already exists, nothing is loaded into `model`
# here — confirm whether the saved adapter should be loaded in that case.
if not os.path.isdir(new_model):
    # LoRA adapter configuration.
    peft_params = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules="all-linear"
    )
    # Attach the adapter to the model exactly once and make it the active one.
    model.add_adapter(peft_params, adapter_name="PEFT_CUSTOM")
    model.set_adapter("PEFT_CUSTOM")

    training_params = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        optim="paged_adamw_8bit",
        save_steps=500,
        logging_steps=250,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=True,
        bf16=False,
        max_grad_norm=0.3,
        max_steps=-1,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="constant",
        report_to="none",
    )
    # `peft_config` is deliberately not passed: the adapter is already attached to
    # `model` above. Passing the config as well would make SFTTrainer wrap the
    # model in a second adapter, costing extra GPU memory and training a different
    # adapter than the "PEFT_CUSTOM" one that `model` exposes afterwards.
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        max_seq_length=1024,
        tokenizer=tokenizer,
        formatting_func=format_prompts_func,
        args=training_params,
        packing=False,
    )

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 31.74 GiB total capacity; 30.33 GiB already allocated; 8.88 MiB free; 30.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
if not os.path.isdir(new_model):
    # Train the model.
    # Resume only when output_dir holds at least one `checkpoint-<step>` directory.
    # Checking merely for a non-empty folder would request a resume whenever any
    # other file is present, and the trainer then fails for lack of a valid checkpoint.
    rfc = os.path.isdir(output_dir) and any(
        entry.startswith("checkpoint-")
        and entry[len("checkpoint-"):].isdigit()
        and os.path.isdir(os.path.join(output_dir, entry))
        for entry in os.listdir(output_dir)
    )
    trainer.train(resume_from_checkpoint=rfc)
    # Save the model and the tokenizer into the fine-tuned model directory.
    trainer.model.save_pretrained(save_directory=new_model)
    trainer.tokenizer.save_pretrained(save_directory=new_model)

In [None]:
def generate_text(tokenizer, model, prompt: str) -> str:
    """Generate text for ``prompt`` with beam search and return it decoded.

    Args:
        tokenizer: Callable tokenizer used to encode the prompt and decode the output.
        model: Object exposing ``generate``.
        prompt: Text handed to the tokenizer.

    Returns:
        The first decoded sequence, with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # Only the input ids are moved to the GPU and forwarded
    # (original note: added to avoid a crash with KeyError: 'shape').
    input_ids = encoded.input_ids.to("cuda")
    # Sampling alternative that was tried before:
    #   max_new_tokens=512, do_sample=True, temperature=0.3, penalty_alpha=0.6, top_k=4
    generation_kwargs = dict(
        max_new_tokens=512,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    gen_tokens = model.generate(input_ids, **generation_kwargs)
    decoded = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
    # batch_decode yields a list; the single response is its first element.
    return decoded[0]

In [None]:
def clean_response(input: str) -> str:
    """Strip the leading header lines of a response and cut it at the first repetition.

    The first two lines are always dropped, plus the third one when it is blank.
    If the first remaining line occurs again later on, everything from that
    second occurrence onwards is discarded.

    Args:
        input: Raw generated text, with lines separated by ``"\\n"``.

    Returns:
        The cleaned text, or an empty string when nothing is left after the
        header lines.

    Raises:
        ValueError: If the response has fewer than three lines.
    """
    lines = input.split("\n")
    if len(lines) < 3:
        raise ValueError("Returned response isn't consistent")
    # Skip the third line too when it is blank, otherwise keep it.
    body = lines[3:] if lines[2] == "" else lines[2:]
    if not body:
        # Only header lines were present (e.g. "a\nb\n"); looking up body[0]
        # would raise an IndexError that the handler below does not catch.
        return ""
    try:
        # Find a later occurrence of the first remaining line.
        idx = body.index(body[0], 1)
    except ValueError:
        # No repetition found: return everything.
        return '\n'.join(body)
    # Repetition found: keep only the part before it.
    return '\n'.join(body[:idx])

In [None]:
# Python 2.7 prompt that explicitly asks for the Dockerfile only:
# show the raw generation, a separator, then the cleaned response.
text1 = generate_text(
    tokenizer,
    model,
    "Generate a Dockerfile of Python 2.7, I want you to write only the dockerfile itself",
)
print(text1, "-------------------", sep="\n")
print(clean_response(text1))

In [None]:
# Python 2.7 prompt without the extra instruction:
# show the raw generation, a separator, then the cleaned response.
text1 = generate_text(
    tokenizer,
    model,
    "Generate a Dockerfile of Python 2.7",
)
print(text1, "-------------------", sep="\n")
print(clean_response(text1))

In [None]:
# Wordpress 6.3.2 prompt that explicitly asks for the Dockerfile only:
# show the raw generation, a separator, then the cleaned response.
text2 = generate_text(
    tokenizer,
    model,
    "Generate a Dockerfile of Wordpress 6.3.2, I want you to write only the dockerfile itself",
)
print(text2, "-------------------", sep="\n")
print(clean_response(text2))

In [None]:
# Wordpress 6.3.2 prompt without the extra instruction:
# show the raw generation, a separator, then the cleaned response.
text2 = generate_text(
    tokenizer,
    model,
    "Generate a Dockerfile of Wordpress 6.3.2",
)
print(text2, "-------------------", sep="\n")
print(clean_response(text2))

In [None]:
# Ruby 3.2.1 prompt that explicitly asks for the Dockerfile only:
# show the raw generation, a separator, then the cleaned response.
text3 = generate_text(
    tokenizer,
    model,
    "Generate a Dockerfile of Ruby 3.2.1, I want you to write only the dockerfile itself",
)
print(text3, "-------------------", sep="\n")
print(clean_response(text3))

In [None]:
# Ruby 3.2.1 prompt without the extra instruction:
# show the raw generation, a separator, then the cleaned response.
text3 = generate_text(
    tokenizer,
    model,
    "Generate a Dockerfile of Ruby 3.2.1",
)
print(text3, "-------------------", sep="\n")
print(clean_response(text3))

In [None]:
# Upload the model to the Hub repository named after `new_model` under the Tony177 account.
model.push_to_hub(f"Tony177/{new_model}")

In [None]:
# Upload the tokenizer to the Hub repository named after `new_model` under the Tony177 account.
tokenizer.push_to_hub(f"Tony177/{new_model}")