In [1]:
! pip install bitsandbytes transformers peft accelerate
! pip install datasets trl ninja packaging

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from tor

In [2]:
import torch
import os
import sys
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from trl import SFTTrainer

In [3]:
# Hub id of the checkpoint to fine-tune, and the device used for inference inputs.
model_name = "Hugofernandez/Mistral-7B-v0.1-colab-sharded"
device = "cuda"

# Fast tokenizer for the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Reuse the unknown token as the padding token (both the string and its id).
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/918 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [4]:
# Quantization settings: 4-bit NF4 weights with double quantization, and
# float16 as the compute dtype.
compute_dtype = torch.float16
print(compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

print(bnb_config)

torch.float16
BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "float16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}



In [5]:
# Load the checkpoint with the 4-bit quantization config. The device map
# {"": 0} assigns the whole model to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
    quantization_config=bnb_config,
    use_flash_attention_2=False,
)

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

pytorch_model-00001-of-00006.bin:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

pytorch_model-00002-of-00006.bin:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

pytorch_model-00003-of-00006.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00004-of-00006.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00005-of-00006.bin:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

pytorch_model-00006-of-00006.bin:   0%|          | 0.00/4.25G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [6]:
# Print the module tree of the loaded model (the captured output shows the
# projection layers as Linear4bit).
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [7]:
# LoRA adapter settings: rank 16, alpha 16, 5% dropout, no bias terms.
# Adapters are attached to the attention and MLP projections listed below,
# plus `lm_head`.
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        'k_proj',
        'q_proj',
        'v_proj',
        'o_proj',
        'gate_proj',
        'down_proj',
        'up_proj',
        'lm_head',
    ],
)

In [8]:
# Prepare the quantized model for k-bit training using PEFT's helper.
# NOTE(review): per the original author's comment this casts modules of the
# model to fp32 — confirm the exact set against the installed PEFT version.
model = prepare_model_for_kbit_training(model)

# These settings belong on `model.config`. The previous
# `model.config_pad_token_id` / `model.config_use_cache` assignments only
# created new, unused attributes on the model object, so the padding id was
# never set and the KV cache was never disabled.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

In [9]:
# Hyperparameters for the fine-tuning run: one epoch, batch size 4 for both
# training and evaluation, evaluation once per epoch, 8-bit paged AdamW.
training_arguments = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy = "epoch",
    optim = "paged_adamw_8bit",
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 1,
    log_level = "debug",
    save_steps = 500,
    logging_steps = 20,
    learning_rate = 4e-4,
    num_train_epochs = 1,
    warmup_steps = 100,
    # The plain "constant" schedule applies no warmup, so `warmup_steps` was
    # silently ignored. "constant_with_warmup" ramps up over `warmup_steps`
    # and then holds `learning_rate`.
    lr_scheduler_type = "constant_with_warmup",
)



In [12]:
# Download the SQuAD dataset (the captured output shows 87,599 train and
# 10,570 validation examples).
# NOTE(review): the name `dataset` is rebound later to the CSV-backed splits,
# so the conversion cell only works while `dataset` still holds this object.
dataset = load_dataset('squad')

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [16]:
def convert_to_chat_format(example):
    """Flatten one SQuAD-style record into a single tagged training string.

    Reads ``example['question']``, ``example['context']`` and the first entry
    of ``example['answers']['text']`` and joins them with the
    ``<|user|>`` / ``<|context|>`` / ``<|answer|>`` / ``<|endoftext|>`` markers.

    Returns a dict with a single ``'text'`` key holding the formatted string.
    Raises ``IndexError`` if the record has no answers.
    """
    user_question = example['question']
    passage = example['context']
    # Only the first listed answer is used.
    first_answer = example['answers']['text'][0]
    formatted = (
        f"<|user|> {user_question} "
        f"<|context|> {passage} "
        f"<|answer|> {first_answer} <|endoftext|>"
    )
    return {'text': formatted}

# Convert every split to the chat format, dropping the original SQuAD columns
# so that only the new 'text' column remains.
squad_chat_format = dataset.map(
    convert_to_chat_format,
    remove_columns=['id', 'title', 'context', 'question', 'answers'],
)

# Show the first converted training record as a sanity check.
print(squad_chat_format['train'][0])


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

{'text': '<|user|> To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? <|context|> Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. <|answer|> Saint Bernadette Soubirous <|endoftext|>'}


In [17]:
from sklearn.model_selection import train_test_split

# Carve a held-out test set out of the SQuAD train split (80% train / 20% test,
# fixed seed). The split is done on pandas DataFrames.
train_df, test_df = train_test_split(
    squad_chat_format['train'].to_pandas(),
    test_size=0.2,
    random_state=42,
)

# The original validation split is kept as-is.
validation_df = squad_chat_format['validation'].to_pandas()

# Write each split to its CSV file, without the DataFrame index.
for frame, csv_path in (
    (train_df, '/content/train.csv'),
    (validation_df, '/content/validation.csv'),
    (test_df, '/content/test.csv'),
):
    frame.to_csv(csv_path, index=False)

In [18]:
# Map each split name to its CSV file and load all three as one DatasetDict.
# Note that this rebinds `dataset`, replacing the original SQuAD object.
data_files = dict(
    train="/content/train.csv",
    validation="/content/validation.csv",
    test="/content/test.csv",
)
dataset = load_dataset('csv', data_files=data_files)
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 70079
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 10570
    })
    test: Dataset({
        features: ['text'],
        num_rows: 17520
    })
})


In [19]:
# Echo the split-name -> CSV-path mapping used to build `dataset`.
print(data_files)

{'train': '/content/train.csv', 'validation': '/content/validation.csv', 'test': '/content/test.csv'}


In [20]:
# Supervised fine-tuning trainer. It trains on the 'train' CSV split and
# evaluates on the 'test' split; the 'validation' split is not used here.
# Examples are read from the 'text' column and packed into sequences of up to
# 512 tokens.
# NOTE(review): the captured output warns that some of these arguments are
# deprecated on SFTTrainer and should move to SFTConfig.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    max_seq_length=512,
    packing=True,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]



In [21]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Prints one line containing the trainable parameter count, the total
    returned by ``model.num_parameters()``, and the trainable share as a
    percentage of that total.
    """
    total_count = model.num_parameters()
    trainable_count = sum(
        tensor.numel()
        for _name, tensor in model.named_parameters()
        if tensor.requires_grad
    )
    share = 100 * trainable_count / total_count
    print(
        f"trainable params: {trainable_count} || all params: {total_count} || trainable%: {share}"
    )
# Report the trainable share of `model` (about 0.58% in the captured output).
print_trainable_parameters(model)

trainable params: 42520576 || all params: 7284252672 || trainable%: 0.583732853796316


In [None]:
# Evaluate on the trainer's eval dataset. In this notebook the cell sits
# before `trainer.train()`, so it measures the model before fine-tuning.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 7643
  Batch size = 4


In [None]:
# Run the fine-tuning loop with the arguments configured on `trainer`.
trainer.train()

In [None]:
# Qualitative check: generate a completion for one hand-written prompt.
eval_prompt = """<s>[INST]What is a Neural Network, and how does it work?[/INST]"""

# The prompt already spells out the BOS token (<s>), so skip the tokenizer's
# automatic special tokens; otherwise a second BOS may be prepended.
# Inputs go to the same device string configured at the top of the notebook.
model_input = tokenizer(
    eval_prompt,
    return_tensors="pt",
    add_special_tokens=False,
).to(device)

# Switch to eval mode for generation, then restore train mode afterwards.
model.eval()
with torch.no_grad():
    generated_ids = model.generate(
        **model_input,
        max_new_tokens=256,
        # Pad with the tokenizer's EOS id instead of the hard-coded literal 2.
        # NOTE(review): assumes EOS has id 2 for this tokenizer, as the
        # original literal implied — confirm.
        pad_token_id=tokenizer.eos_token_id,
    )
    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
model.train()

In [None]:
# Directory name for the fine-tuned weights, relative to the working directory.
new_model = 'MistralAI_QLORA'
# Save the trainer's model there.
# NOTE(review): presumably this writes only the LoRA adapter, since the trainer
# was built with a peft_config — confirm by inspecting the output directory.
trainer.model.save_pretrained(new_model)

In [None]:
# Reload the un-quantized base model so the LoRA weights can be merged into it.
# With default arguments `from_pretrained` materialises float32 weights (the
# checkpoint shards downloaded earlier total roughly 29 GB), which can exhaust
# host memory. Loading in float16 — the compute dtype used for training —
# halves that, and `low_cpu_mem_usage=True` avoids a second full copy while
# the shards are being loaded.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

In [None]:
# Attach the saved adapter in `new_model` to the reloaded base model...
peft_model = PeftModel.from_pretrained(base_model, new_model)
# ...then merge the adapter into the base weights and drop the PEFT wrapper.
merged_model = peft_model.merge_and_unload()

In [None]:
# Destination for the merged model and its tokenizer.
output_merged_dir = "/content/MistralAI_finetuned"
os.makedirs(output_merged_dir, exist_ok=True)

# Save the merged weights with safetensors serialization disabled, and the
# tokenizer alongside them.
merged_model.save_pretrained(
    output_merged_dir,
    safe_serialization=False,
)
tokenizer.save_pretrained(output_merged_dir)