In [1]:
# Optional dependency install for the inference section: uncomment the lines
# below when running on a fresh environment.
# NOTE(review): `-U` pulls the latest releases; consider pinning versions
# (e.g. `pkg==X.Y.Z`) so the notebook stays reproducible.
# %pip install -q -U transformers
# %pip install -q -U accelerate
# %pip install -q -U bitsandbytes

In [2]:
# Accessing, quantizing, fine-tuning, merging, and saving mistral-7b.

## Mistral 7B Inference

We create a 4-bit NF4 quantization configuration with BitsAndBytes so that the model is loaded in 4-bit precision. This speeds up loading and reduces the memory footprint, so the model can run on Google Colab, Kaggle Notebooks, or consumer GPUs.

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

# 4-bit quantization settings used when loading the model for inference.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,               # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",       # NF4 quantization data type
    bnb_4bit_use_double_quant=True,  # nested quantization of the quantization constants
    # Compute in bfloat16 to match the `torch_dtype` the model is loaded with;
    # without this the compute dtype falls back to the library default
    # (float32), which is slower.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

Loading the model and tokenizer using the `transformers` library.

In [4]:
# Hub checkpoint used for the inference demo.
model_name = "mistralai/Mistral-7B-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the checkpoint quantized according to `bnb_config`, letting accelerate
# place the weights on the available device(s).
# `trust_remote_code` is intentionally not set: the Mistral architecture is
# implemented natively in transformers, so there is no need to opt in to
# executing code shipped inside the Hub repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Building the pipeline (to generate a response based on the prompt).

In [5]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Sample inference.

In [13]:
prompt = "As a data scientist, can you explain the concept of regularization in machine learning?"

# Sample a single completion of at most 100 new tokens.
# Sampling is stochastic, so the text differs from run to run.
sequences = pipe(
    prompt,
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=50,
)

# The pipeline returns one dict per returned sequence; show the first one.
print(sequences[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


As a data scientist, can you explain the concept of regularization in machine learning?

Answer: In machine learning, regularization is a technique used to prevent overfitting in the model and improve generalization. Overfitting occurs when a model fits too closely to the training data, resulting in poor performance on new, unseen data. Regularization adds a penalty term to the loss function, which encourages simpler models that generalize better. This can be achieved through various techniques such as L1/L2 regularization, dropout, and early stopping.



## Mistral 7B Fine-tuning

In [14]:
# %%capture
# %pip install -U bitsandbytes
# %pip install -U transformers
# %pip install -U peft
# %pip install -U accelerate
# %pip install -U trl
# The lines above are an optional dependency install for the fine-tuning
# section; uncomment them on a fresh environment. This note sits below them
# because a cell magic such as `%%capture` must be the first line of its cell.
# NOTE(review): versions are unpinned; consider pinning for reproducibility.

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb
from datasets import load_dataset
from trl import SFTTrainer

In [16]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment, then read the credentials this notebook uses. Each value is
# None when the corresponding variable is not set.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
HUGGING_FACE_FINE_TUNING_WRITE_ACCESS_TOKEN = os.environ.get(
    "HUGGING_FACE_FINE_TUNING_WRITE_ACCESS_TOKEN"
)
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")

In [17]:
# Log in to the Hugging Face Hub through the CLI. IPython substitutes the
# Python variable HUGGING_FACE_FINE_TUNING_WRITE_ACCESS_TOKEN into the command.
# NOTE(review): the token is passed as a command-line argument, which can
# expose it to other processes on the machine; this also spawns a subprocess
# (see the tokenizers fork warning in the cell output). Consider an in-process
# login instead — TODO confirm which API the project prefers.
!huggingface-cli login --token $HUGGING_FACE_FINE_TUNING_WRITE_ACCESS_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Token is valid (permission: write).
Your token has been saved to /home/marx/.cache/huggingface/token
Login successful


In [18]:
# Authenticate with Weights & Biases using the key read from the environment.
wandb.login(key=WANDB_API_KEY)

# Open the tracking run for this fine-tuning session.
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project='Fine-tuning Mistral-7B-Chapterizaton',
)

[34m[1mwandb[0m: Currently logged in as: [33mmajestic-shawarma[0m ([33mnesvier[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/marx/.netrc


In [20]:
# Identifiers used throughout the fine-tuning section.
dataset_name = "MajesticShawarma/chapterization-v0.1"  # dataset passed to load_dataset
base_model = "mistralai/Mistral-7B-v0.1"               # checkpoint to fine-tune
new_model = "mistral_7b_chapterizaton_4bit_v0.1"       # presumably the name of the fine-tuned output — confirm downstream

In [22]:
# Importing the dataset
dataset = load_dataset(dataset_name, split="train")

# Peek at one example. Index the row first and the column second:
# `dataset["items"][100]` would materialise the entire "items" column just to
# read one element, whereas this form only decodes row 100.
dataset[100]["items"]

{'cacheHit': False,
 'durationMs': 13242,
 'input': {'max_tokens': 4096,
  'messages': [{'content': 'You are tasked with extracting relevant information or identification from the following key-value pairs. Given a piece of text, chapterize and generate a JSON format as output. Follow the instructions below to help you in generating the output:\n\n1. **Understand the Key-Value Structure:**\n   - A key-value pair consists of:\n      - `\'summary\'`: A top-level overview or description of the chunk. Must not be empty (i.e., required).\n      - `\'headers\'`: A list of headers where a header is a line of text that introduces a new section or chapter in a document. It is typically formatted distinctly from the main body text to stand out, often being bolder, in a larger font, or differently styled. Headers are concise, summarizing the content that follows, and they guide the reader through the document\'s structure. Strictly, this is a list of strings. Must not be empty (i.e., required).\n

In [24]:
# 4-bit quantization settings for the fine-tuning run: NF4 weights,
# bfloat16 compute, and no nested (double) quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Load the base checkpoint quantized according to `bnb_config`.
# `trust_remote_code` is intentionally not set: the Mistral architecture is
# implemented natively in transformers, so there is no need to opt in to
# executing code shipped inside the Hub repository.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The KV cache is only useful for generation and is incompatible with
# gradient checkpointing; disabling it here also avoids the related warning.
model.config.use_cache = False
# NOTE(review): `pretraining_tp` looks like a carry-over from Llama-style
# recipes — confirm it has any effect for this architecture.
model.config.pretraining_tp = 1
# Trade extra compute for lower activation memory during training.
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
# Tokenizer for the fine-tuning run. `trust_remote_code` is intentionally not
# set: the tokenizer for this checkpoint ships with transformers, so there is
# no need to opt in to executing repository-supplied code.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Pad on the right and reuse the EOS token as the padding token.
# NOTE(review): with pad == eos, collators that mask padding out of the loss
# may also mask the real EOS token — confirm against the training setup.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

# Append an EOS token to every encoded example.
tokenizer.add_eos_token = True

# Display the BOS/EOS settings as the cell output.
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [26]:
# Prepare the quantized base model for k-bit training before adding adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 64, alpha 16, 10% dropout, no bias training,
# applied to the attention projections plus the MLP gate projection.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)

# Wrap the prepared model so the LoRA adapters are attached.
model = get_peft_model(model, peft_config)