# Chapter 11 - Fine-tuning Representation Models for Classification

## Data

In [6]:
from datasets import load_dataset

# Fetch the Rotten Tomatoes dataset and keep handles to the two splits used below
tomatoes = load_dataset("rotten_tomatoes")
train_data = tomatoes["train"]
test_data = tomatoes["test"]

## Supervised Classification

### HuggingFace Trainer

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name shared by the tokenizer and the classifier
model_id = "bert-base-uncased"

# Tokenizer plus a sequence-classification model configured for two labels
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Tokenize the inputs
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer; it is handed to the Trainer further down
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples with ``truncation=True``."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Run the tokenizer over both splits in batched mode (train first, then test)
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data)
)

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [9]:
# Define metrics
import numpy as np
import evaluate

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_metric(name):
    """Load an ``evaluate`` metric by name once and reuse it on later calls."""
    return evaluate.load(name)


def compute_metrics(eval_pred):
    """Compute the F1 score for one round of evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        dict: ``{"f1": <score>}`` where the score is whatever the ``"f1"``
        metric reports for the predictions against ``labels``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    # The metric object is cached, so repeated evaluations do not reload it.
    f1_metric = _load_metric("f1")
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    return {"f1": f1}

In [10]:
# Train model
from transformers import Trainer, TrainingArguments

# Hyperparameters for a single fine-tuning epoch; output goes to the "model" directory
training_args = TrainingArguments(
    output_dir="model",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    report_to="none",
)

# Wire model, data, collator and metric together
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [6]:
# Run the fine-tuning loop with the Trainer configured above
trainer.train()

  0%|          | 0/534 [00:00<?, ?it/s]

{'loss': 0.3903, 'grad_norm': 15.705974578857422, 'learning_rate': 1.2734082397003748e-06, 'epoch': 0.94}
{'train_runtime': 46.3531, 'train_samples_per_second': 184.022, 'train_steps_per_second': 11.52, 'train_loss': 0.3865355516640881, 'epoch': 1.0}


TrainOutput(global_step=534, training_loss=0.3865355516640881, metrics={'train_runtime': 46.3531, 'train_samples_per_second': 184.022, 'train_steps_per_second': 11.52, 'total_flos': 213940121334480.0, 'train_loss': 0.3865355516640881, 'epoch': 1.0})

In [None]:
# Evaluate on the Trainer's eval_dataset (the tokenized test split); compute_metrics supplies the F1 entry
trainer.evaluate()

### Freeze layers

In [8]:
# Reload tokenizer and model from the checkpoint so this experiment starts from fresh weights
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# List every parameter tensor by name to see how the model is laid out
for parameter_name, _parameter in model.named_parameters():
    print(parameter_name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [10]:
# Only tensors whose name starts with 'classifier' stay trainable; all others are frozen
for name, param in model.named_parameters():
    param.requires_grad = name.startswith('classifier')

In [11]:
# Sanity check of the freeze: show the requires_grad flag next to each parameter name
for name, param in model.named_parameters():
    print(f"Parameter: {name} ---- {param.requires_grad}")

Parameter: bert.embeddings.word_embeddings.weight ---- False
Parameter: bert.embeddings.position_embeddings.weight ---- False
Parameter: bert.embeddings.token_type_embeddings.weight ---- False
Parameter: bert.embeddings.LayerNorm.weight ---- False
Parameter: bert.embeddings.LayerNorm.bias ---- False
Parameter: bert.encoder.layer.0.attention.self.query.weight ---- False
Parameter: bert.encoder.layer.0.attention.self.query.bias ---- False
Parameter: bert.encoder.layer.0.attention.self.key.weight ---- False
Parameter: bert.encoder.layer.0.attention.self.key.bias ---- False
Parameter: bert.encoder.layer.0.attention.self.value.weight ---- False
Parameter: bert.encoder.layer.0.attention.self.value.bias ---- False
Parameter: bert.encoder.layer.0.attention.output.dense.weight ---- False
Parameter: bert.encoder.layer.0.attention.output.dense.bias ---- False
Parameter: bert.encoder.layer.0.attention.output.LayerNorm.weight ---- False
Parameter: bert.encoder.layer.0.attention.output.LayerNorm.bia

In [12]:
from transformers import TrainingArguments, Trainer

# New Trainer around the partially frozen model; training_args, datasets,
# collator and metric are the objects defined in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

# Train with only the unfrozen parameters receiving gradient updates
trainer.train()

  0%|          | 0/534 [00:00<?, ?it/s]

{'loss': 0.6986, 'grad_norm': 3.4565298557281494, 'learning_rate': 1.2734082397003748e-06, 'epoch': 0.94}
{'train_runtime': 13.3265, 'train_samples_per_second': 640.078, 'train_steps_per_second': 40.071, 'train_loss': 0.697593588954054, 'epoch': 1.0}


TrainOutput(global_step=534, training_loss=0.697593588954054, metrics={'train_runtime': 13.3265, 'train_samples_per_second': 640.078, 'train_steps_per_second': 40.071, 'total_flos': 213940121334480.0, 'train_loss': 0.697593588954054, 'epoch': 1.0})

### Freeze the embeddings and encoder blocks 0-9 (the first 10 blocks)

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

model_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

# In the parameter listing printed earlier there are 5 embedding tensors followed
# by 16 tensors per encoder block, so `bert.encoder.layer.10` begins at
# 5 + 10 * 16 = 165. Everything before that position is frozen.
first_trainable_index = 165
for name, param in list(model.named_parameters())[:first_trainable_index]:
    param.requires_grad = False

# Train with the remaining parameters left trainable
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/534 [00:00<?, ?it/s]

{'loss': 0.4517, 'grad_norm': 2.768611431121826, 'learning_rate': 1.2734082397003748e-06, 'epoch': 0.94}
{'train_runtime': 18.5949, 'train_samples_per_second': 458.727, 'train_steps_per_second': 28.718, 'train_loss': 0.4488348192936472, 'epoch': 1.0}


TrainOutput(global_step=534, training_loss=0.4488348192936472, metrics={'train_runtime': 18.5949, 'train_samples_per_second': 458.727, 'train_steps_per_second': 28.718, 'total_flos': 213940121334480.0, 'train_loss': 0.4488348192936472, 'epoch': 1.0})

### MLM (Masked Language Model)

In [14]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer and masked-language-modeling (MLM) model from the cased BERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-cased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [15]:
def preprocess_function(examples):
    """Tokenize the ``text`` column of a batch with ``truncation=True``."""
    return tokenizer(examples["text"], truncation=True)


# Tokenize each split, then drop the `label` column from the result
tokenized_train = (
    train_data
    .map(preprocess_function, batched=True)
    .remove_columns(["label"])
)
tokenized_test = (
    test_data
    .map(preprocess_function, batched=True)
    .remove_columns(["label"])
)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [16]:
from transformers import DataCollatorForLanguageModeling

# Collator for the masked-language-modeling run: built from the cased tokenizer
# with mlm=True and mlm_probability=0.15. It replaces the padding collator
# previously bound to `data_collator`.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [17]:
# Training arguments for parameter tuning
training_args = TrainingArguments(
    output_dir="model",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    report_to="none",
)

# Trainer for the MLM run; no compute_metrics is passed here
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [18]:
# Write the tokenizer files to the "mlm" directory before training starts
tokenizer.save_pretrained("mlm")

# Continue masked-language-model training with the Trainer configured above
trainer.train()

# Write the trained model to the same "mlm" directory, next to the tokenizer
model.save_pretrained("mlm")

  0%|          | 0/5340 [00:00<?, ?it/s]

{'loss': 2.6029, 'grad_norm': 13.746612548828125, 'learning_rate': 1.812734082397004e-05, 'epoch': 0.94}
{'loss': 2.3769, 'grad_norm': 16.20442771911621, 'learning_rate': 1.6254681647940076e-05, 'epoch': 1.87}
{'loss': 2.3035, 'grad_norm': 23.52048683166504, 'learning_rate': 1.4382022471910113e-05, 'epoch': 2.81}
{'loss': 2.1871, 'grad_norm': 13.83089542388916, 'learning_rate': 1.250936329588015e-05, 'epoch': 3.75}
{'loss': 2.148, 'grad_norm': 13.555903434753418, 'learning_rate': 1.0636704119850187e-05, 'epoch': 4.68}
{'loss': 2.0923, 'grad_norm': 23.861364364624023, 'learning_rate': 8.764044943820226e-06, 'epoch': 5.62}
{'loss': 2.0568, 'grad_norm': 15.416306495666504, 'learning_rate': 6.891385767790263e-06, 'epoch': 6.55}
{'loss': 1.9921, 'grad_norm': 14.122122764587402, 'learning_rate': 5.0187265917603005e-06, 'epoch': 7.49}
{'loss': 1.9847, 'grad_norm': 19.902162551879883, 'learning_rate': 3.146067415730337e-06, 'epoch': 8.43}
{'loss': 1.9646, 'grad_norm': 15.873626708984375, 'lear

In [19]:
from transformers import pipeline

# Baseline: fill-mask pipeline on the original "bert-base-cased" checkpoint,
# asked to complete the masked position in the prompt
mask_filler = pipeline("fill-mask", model="bert-base-cased")
preds = mask_filler("What a horrible [MASK]!")

# Print the completed sentence of each returned prediction
for pred in preds:
    print(f">>> {pred['sequence']}")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


>>> What a horrible idea!
>>> What a horrible dream!
>>> What a horrible thing!
>>> What a horrible day!
>>> What a horrible thought!


In [20]:
# Same prompt as the baseline cell, now answered by the model saved under "mlm"
mask_filter = pipeline("fill-mask", model="mlm")
preds = mask_filter("What a horrible [MASK]!")

# Print the completed sentence of each returned prediction
for pred in preds:
    print(f">>> {pred['sequence']}")

>>> What a horrible movie!
>>> What a horrible film!
>>> What a horrible mess!
>>> What a horrible story!
>>> What a horrible comedy!


## Named Entity Recognition (NER)

In [21]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import numpy as np

In [22]:
# The CoNLL-2003 dataset for NER.
# NOTE(review): trust_remote_code=True is passed and the cell output shows a
# "builder script" being downloaded — presumably that script is executed
# locally, so only keep this flag for dataset sources you trust.
dataset = load_dataset("conll2003", trust_remote_code=True)

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [23]:
# Pick training record 848 as the running example and display it
example = dataset["train"][848]
example

{'id': '848',
 'tokens': ['Dean',
  'Palmer',
  'hit',
  'his',
  '30th',
  'homer',
  'for',
  'the',
  'Rangers',
  '.'],
 'pos_tags': [22, 22, 38, 29, 16, 21, 15, 12, 23, 7],
 'chunk_tags': [11, 12, 21, 11, 12, 12, 13, 11, 12, 0],
 'ner_tags': [1, 2, 0, 0, 0, 0, 0, 0, 3, 0]}

In [24]:
# NER tag names in id order: the position in this list is the integer id
label_names = [
    'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC',
]

# Forward (name -> id) and reverse (id -> name) lookups
label2id = {label: index for index, label in enumerate(label_names)}
id2label = dict(enumerate(label_names))

In [25]:
from transformers import AutoModelForAudioClassification

# Token-classification model with one output label per NER tag; both label
# mappings are passed to from_pretrained alongside the label count.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label2id),
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Encode the example's pre-split words, then turn the ids back into token strings
example_encoding = tokenizer(example["tokens"], is_split_into_words=True)
token_ids = example_encoding["input_ids"]
sub_tokens = tokenizer.convert_ids_to_tokens(token_ids)
sub_tokens

['[CLS]',
 'Dean',
 'Palmer',
 'hit',
 'his',
 '30th',
 'home',
 '##r',
 'for',
 'the',
 'Rangers',
 '.',
 '[SEP]']

In [28]:
def align_labels(examples):
    """Tokenize pre-split words and expand word-level NER tags to token level.

    Args:
        examples: A batch with a ``"tokens"`` entry (one list of words per
            example) and a ``"ner_tags"`` entry (one list of integer tags per
            example, one tag per word).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one label list per example, aligned with the produced tokens:

        * tokens that map to no word (``word_ids`` gives ``None``) get ``-100``;
        * the first token of a word keeps that word's tag;
        * every further token of the same word gets the word's tag, with odd
          tag ids bumped by one (in ``label2id`` that turns B-XXX into I-XXX).
    """
    token_ids = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = examples["ner_tags"]

    updated_labels = []
    for index, label in enumerate(labels):
        # Map tokens to their respective words
        word_ids = token_ids.word_ids(batch_index=index)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Compare against None explicitly: word index 0 is falsy, and a
                # truthiness test would wrongly mask continuation tokens of the
                # first word in every sentence.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a new word keeps the word's own tag
                label_ids.append(label[word_idx])
            else:
                # Continuation token: odd tag ids (B-XXX) become the next even id (I-XXX)
                updated_label = label[word_idx]
                if updated_label % 2 == 1:
                    updated_label += 1
                label_ids.append(updated_label)
            previous_word_idx = word_idx
        updated_labels.append(label_ids)
    token_ids["labels"] = updated_labels
    return token_ids

# Run align_labels over the dataset in batched mode; the result carries the aligned "labels" entries
tokenized = dataset.map(align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [29]:
# Compare the word-level tags of the running example with its aligned labels.
# The updated list is longer: it has one entry per produced token, including
# the -100 entries at both ends for [CLS] and [SEP].
print(f"Original: {example['ner_tags']}")
print(f"Updated: {tokenized['train'][848]['labels']}")

Original: [1, 2, 0, 0, 0, 0, 0, 0, 3, 0]
Updated: [-100, 1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, -100]


In [31]:
from transformers import DataCollatorForTokenClassification

# Token-classification data collator built from the cased tokenizer; it
# replaces the MLM collator previously bound to `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [32]:
# Training arguments for parameter tuning
training_args = TrainingArguments(
    "model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to="none"
)


def compute_ner_metrics(eval_pred):
    """Token-level macro F1 for the token-classification model.

    The `compute_metrics` defined for the sentiment classifier takes the
    argmax over axis 1 and computes a binary F1. That does not fit this task,
    where there is one prediction per token, nine classes, and -100 entries
    in the labels, so a dedicated metric function is used here.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The class axis of ``logits``
            is assumed to be the last one; ``labels`` uses -100 for positions
            that must not be scored (as produced by ``align_labels``).

    Returns:
        dict: ``{"f1": <macro-averaged F1 over the scored tokens>}``.
        NOTE(review): this is a per-token score, not an entity-level one.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Score only positions that carry a real tag
    scored = labels != -100

    f1_metric = evaluate.load("f1")
    result = f1_metric.compute(
        predictions=predictions[scored],
        references=labels[scored],
        average="macro",
    )
    return {"f1": result["f1"]}


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_ner_metrics,
)
trainer.train()

  0%|          | 0/878 [00:00<?, ?it/s]

{'loss': 0.2158, 'grad_norm': 1.0152201652526855, 'learning_rate': 8.610478359908885e-06, 'epoch': 0.57}
{'train_runtime': 77.3608, 'train_samples_per_second': 181.5, 'train_steps_per_second': 11.349, 'train_loss': 0.1555034576624564, 'epoch': 1.0}


TrainOutput(global_step=878, training_loss=0.1555034576624564, metrics={'train_runtime': 77.3608, 'train_samples_per_second': 181.5, 'train_steps_per_second': 11.349, 'total_flos': 351240792638148.0, 'train_loss': 0.1555034576624564, 'epoch': 1.0})

In [None]:
# Evaluate the fine-tuned NER model on the Trainer's eval_dataset (tokenized["test"])
trainer.evaluate()

In [35]:
from transformers import pipeline

# Write the fine-tuned model to the "ner_model" directory
trainer.save_model("ner_model")

# Build a token-classification pipeline from that directory and tag a sample sentence
token_classifier = pipeline("token-classification", model="ner_model")
token_classifier("Nhan is a student.")

[{'entity': 'B-PER',
  'score': 0.99016213,
  'index': 1,
  'word': 'N',
  'start': 0,
  'end': 1},
 {'entity': 'I-PER',
  'score': 0.9897619,
  'index': 2,
  'word': '##han',
  'start': 1,
  'end': 4}]