In [1]:
!pip install pandas transformers[torch] datasets evaluate rouge_score POT torch peft bitsandbytes



In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

MODEL_NAME = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
icl_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
icl_model.to(device)

prompt = "The sneaky fox jumped over"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
output = icl_model.generate(input_ids, do_sample=True, max_length=30, top_p=0.9)
text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [3]:
torch.cuda.empty_cache()

In [2]:
from datasets import load_dataset
dataset = load_dataset("gpt3mix/sst2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from transformers import AutoTokenizer

MODEL_NAME = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token
def tokenize(example):
    return tokenizer(example['text'], truncation=True, max_length=512, padding="max_length")
tokenized = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [4]:
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

labels = ['POSITIVE', 'NEGATIVE']
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in id2label.items()}

config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = len(labels)
config.id2label = id2label
config.label2id = label2id
config.pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16)

lora_config = LoraConfig(
    #r=8,
    r=64,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type=TaskType.SEQ_CLS)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ft_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, config=config)#, quantization_config = quantization_config)
ft_model.gradient_checkpointing_enable()
ft_model = prepare_model_for_kbit_training(ft_model)
ft_model = get_peft_model(ft_model, lora_config)
ft_model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): MistralForSequenceClassification(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=2)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear

In [None]:
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load('accuracy')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir='mistral_lora_fine_tuned_classification',
    learning_rate=2e-5, per_device_train_batch_size=8, per_device_eval_batch_size=8,
    gradient_accumulation_steps=4, bf16=True, save_total_limit=3,
    num_train_epochs=2, weight_decay=0.01, evaluation_strategy='epoch', save_strategy='epoch',
    load_best_model_at_end=True,)

trainer = Trainer(model=ft_model, args=training_args,
    train_dataset=tokenized['train'], eval_dataset=tokenized['test'],
    tokenizer=tokenizer, data_collator=collator, compute_metrics=compute_metrics,)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
