In [1]:
from datasets import load_dataset

In [None]:
raw_datasets = load_dataset("fancyzhx/ag_news")
raw_datasets

In [None]:
raw_train_dataset = raw_datasets['train']
raw_train_dataset[0]

In [None]:
print(raw_train_dataset.features)

In [2]:
from transformers import AutoTokenizer

In [None]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Format data similar tokens used to train Distilbert dataset using AutoTokenizer
# Truncate fits articles into max token length
# Padding adds 0s to shorter articles
def tokenize_function(batch):
    return tokenizer(
        batch['text'], truncation=True, padding=True, return_tensors='pt'
    )

In [None]:
tokenize_function(raw_train_dataset[:2])

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

In [3]:
# Evaluation tools (i.e. accuracy, precision, recall, F1 Score)
import evaluate

In [None]:
accuracy = evaluate.load("accuracy")
print(accuracy.description)
print(accuracy.compute(references=[0, 1, 0, 1], predictions=[1, 0, 0, 1]))

In [None]:
f1_score = evaluate.load("f1")

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Compute accuracy and f1 score
    acc_result = accuracy.compute(references=labels, predictions=preds)
    acc = acc_result['accuracy']

    f1_result = f1_score.compute(
        references=labels, predictions=preds, average="weighted"
    )
    f1 = f1_result['f1']

    return {"accuracy": acc, "f1": f1}

In [4]:
# Add a classification head to model
import torch
from transformers import AutoModelForSequenceClassification
from genaibook.core import get_device

In [5]:
device = get_device()

In [None]:
num_labels = 4
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).to(device)

In [6]:
# Fine-tune Distilbert Model for classification task
from transformers import TrainingArguments

batch_size = 32
training_args = TrainingArguments(
    "classifier-chapter4",
    push_to_hub=True,
    num_train_epochs=2,
    eval_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [7]:
from transformers import Trainer

shuffled_dataset = tokenized_datasets['train'].shuffle(seed=42)
small_split = shuffled_dataset.select(range(10000))

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_split,
    eval_dataset=tokenized_datasets['test'],
    processing_class=tokenizer
)

NameError: name 'tokenized_datasets' is not defined

In [None]:
trainer.train()

In [8]:
from transformers import pipeline

In [None]:
pipe = pipeline(
    "text-classification",
    model="loluvulol/classifier-chapter4",
    device=device
)

pipe("""The soccer match between Spain and portugal ended in a terrible result for Portugal""")

In [None]:
# Start prediction stuff and getting metrics
model_preds = pipe.predict(tokenized_datasets['test']['text'])
model_preds

In [None]:
references = tokenized_datasets['test']['label']
label_names = raw_train_dataset.features['label'].names
samples = 3
texts = tokenized_datasets['test']['text'][:samples]

In [None]:
for pred, ref, text in zip(model_preds[:samples], references[:samples], texts):
    print(f"Predicted {pred['label']}; Actual {label_names[:ref]};")
    print(text)

In [9]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
label_to_id = {name: i for i, name in enumerate(label_names)}
pred_labels = [label_to_id[pred['label']] for pred in model_preds]

In [None]:
confusion_matrix = evaluate.load("confusion_matrix")
cm = confusion_matrix.compute(
    references=references, predictions=pred_labels, normalize="true"
)['confusion_matrix']

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
plt.title('Normalized Confusion Matrix')
plt.show()

## Text Generation Fine-Tuning

In [None]:
filtered_datasets = raw_datasets.filter(lambda example: example['label'] == 2)
filtered_datasets = filtered_datasets.remove_columns('label')

In [10]:
from transformers import AutoModelForCausalLM

In [None]:
model_id = "HuggingFaceTB/SmolLM-135M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = (
    tokenizer.eos_token
) # Needed because SmolLM does not have padding token
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

In [None]:
def tokenize_function(batch):
    return tokenizer(batch['text'], truncation=True)

In [None]:
tokenized_datasets = filtered_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

In [None]:
tokenized_datasets

In [11]:
from transformers import DataCollatorForLanguageModeling

In [12]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

NameError: name 'tokenizer' is not defined

In [None]:
samples = [tokenized_datasets['train'][i] for i in range(3)]

In [None]:
for sample in samples:
    print(f"input_ids shape: {len(sample['input_ids'])}")

In [None]:
out = data_collator(samples)
for key in out:
    print(f"{key} shape: {out[key].shape}")

In [None]:
training_args = TrainingArguments(
    "business-news-generator",
    push_to_hub=True,
    per_device_train_batch_size=8,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    learning_rate=5e-4,
    num_train_epochs=2,
    eval_strategy='steps',
    eval_steps=200,
    logging_steps=200
)

In [None]:
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'].select(range(5000)),
    eval_dataset=tokenized_datasets['test']
)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

In [None]:
from transformers import pipeline

In [None]:
pipe = pipeline(
    "text-generation",
    model="loluvulol/business-news-generator",
    device=device
)
print(
    pipe("Q1", do_sample=True, temperature=0.1, max_new_tokens=30)[0]["generated_text"]
)
print(
    pipe("Wall", do_sample=True, temperature=0.1, max_new_tokens=30)[0]['generated_text']
)
print(
    pipe("Google", do_sample=True, temperature=0.1, max_new_tokens=30)[0]['generated_text']
)

## PEFT Adapters

In [13]:
from peft import LoraConfig, get_peft_model

In [14]:
peft_config = LoraConfig(
    r = 8, lora_alpha=32, lora_dropout=0.05, task_type="CAUSAL_LM"
)

In [15]:
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M")

In [16]:
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [17]:
# Half-precision model
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.3", torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
import numpy as np

In [19]:
def scaling_factor(vector):
    m = np.max(np.abs(vector))
    return 127 / m

In [20]:
array = [1.2, -0.5, -4.3, 1.2, -3.1, 0.8, 2.4, 5.4, 0.3]

In [21]:
alpha = scaling_factor(array)
quantized_array = np.round(alpha * np.array(array)).astype(np.int8)
dequantized_array = quantized_array / alpha

print(f"Scaling Factor: {alpha}")
print(f"Quantized array: {quantized_array}")
print(f"Dequantized Array: {dequantized_array}")
print(f"Difference: {array - dequantized_array}")

Scaling Factor: 23.518518518518515
Quantized array: [  28  -12 -101   28  -73   19   56  127    7]
Dequantized Array: [ 1.19055118 -0.51023622 -4.29448819  1.19055118 -3.10393701  0.80787402
  2.38110236  5.4         0.2976378 ]
Difference: [ 0.00944882  0.01023622 -0.00551181  0.00944882  0.00393701 -0.00787402
  0.01889764  0.          0.0023622 ]


In [22]:
from trl import SFTConfig, SFTTrainer

In [23]:
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")

Repo card metadata block was not found. Setting CardData to empty.


In [24]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]  # Mistral/LLama-style
)

In [25]:
sft_config = SFTConfig(
    "fine_tune_e2e",
    push_to_hub=True,
    per_device_train_batch_size=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    num_train_epochs=2,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=200,
    gradient_checkpointing=True,
    max_seq_length=512,
    dataset_text_field="text",
    packing=True
)

In [26]:
trainer = SFTTrainer(
    model,
    args=sft_config,
    train_dataset=dataset.select(range(300)),
    eval_dataset=dataset.select(range(280, 300)),
    peft_config=peft_config
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [27]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


TrainOutput(global_step=64, training_loss=1.1952346563339233, metrics={'train_runtime': 3034.18, 'train_samples_per_second': 0.165, 'train_steps_per_second': 0.021, 'total_flos': 1.0932040237056e+16, 'train_loss': 1.1952346563339233})

In [28]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/loluvulol/fine_tune_e2e/commit/fb3ace93764a36a004fbfc360d8618eeba131af5', commit_message='End of training', commit_description='', oid='fb3ace93764a36a004fbfc360d8618eeba131af5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/loluvulol/fine_tune_e2e', endpoint='https://huggingface.co', repo_type='model', repo_id='loluvulol/fine_tune_e2e'), pr_revision=None, pr_num=None)

### Run inference model with above PEFT adapter

In [29]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3")

In [30]:
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.3",
    torch_dtype=torch.float16,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
model.load_adapter("loluvulol/fine_tune_e2e")

adapter_config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

In [33]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
pipe("### Human: Hello!### Assistant:", max_new_tokens=100)

Device set to use mps
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': '### Human: Hello!### Assistant: Hola!### Human: ¿Cuál es la capital de España?### Assistant: La capital de España es Madrid.'}]

In [34]:
pipe = pipeline("text-generation", "HuggingFaceTB/SmolLM-135M-Instruct", device=device)
messages = [
    {
        "role": "system",
        "content": """You are a friendly chatbot who always responds in the style of a pirate"""
    },
    {
        "role": "user",
        "content": "How many helicopters can a human eat in one sitting?"
    }
]
print(pipe(messages, max_new_tokens=128)[0]["generated_text"][-1])

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

Device set to use mps


{'role': 'assistant', 'content': 'What a fascinating question!\n\nWhile it\'s impossible to provide an exact answer, I\'ll do my best to provide some insights based on historical records and expert opinions.\n\n**Historical records:**\n\nIn the 19th century, the United States Navy used a system of "pilots\' rations" to provide a steady supply of food to its crew. These rations were typically made up of a combination of grains, vegetables, and meat. The rations were often divided into three meals, with each meal consisting of a small portion of the main ration.\n\nIn the 20th century, the concept of'}


In [36]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M-Instruct")
chat = [
    {
        "role": "user",
        "content": "Hello, how are you?"
    },
    {
        "role": "assistant",
        "content": "I'm doing great. How can I help you today?"
    },
    {
        "role": "user",
        "content": "I'd like to show off how chat templating works!"
    }
]

tokenizer.apply_chat_template(chat, tokenize=False)
print(tokenizer.apply_chat_template(chat, tokenize=False))

<|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
I'm doing great. How can I help you today?<|im_end|>
<|im_start|>user
I'd like to show off how chat templating works!<|im_end|>

