# BERT Fine Tuning with QQP

## 1. Import packages

In [1]:
import os

# Set CUDA_LAUNCH_BLOCKING before torch is imported in the next cell.
# NOTE(review): presumably enabled as a debugging aid for CUDA errors; confirm it is still wanted.
os.environ.update({'CUDA_LAUNCH_BLOCKING': '1'})

In [2]:
import numpy as np
import torch
from datasets import load_dataset, ClassLabel, Value, load_metric
from transformers import BertTokenizer, BertModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,BertForSequenceClassification
from transformers.trainer_utils import get_last_checkpoint


## 2. Load and split the first half of QQP for fine-tuning

In [4]:
# Fine-tuning uses the first half of QQP: the first 35% of the rows for training
# and the next 15% for validation, i.e. a 7/3 split of that half.
finetune_splits = {'train': 'train[:35%]', 'validation': 'train[35%:50%]'}
train = load_dataset("quora", split=finetune_splits['train'])
validation = load_dataset("quora", split=finetune_splits['validation'])

Using custom data configuration default
Reusing dataset quora (C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)
Using custom data configuration default
Reusing dataset quora (C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)


In [5]:
def cast_is_duplicate_to_int(dataset):
    """Return `dataset` with its 'is_duplicate' column cast to int32.

    The dataset's feature mapping is copied, the 'is_duplicate' entry is
    replaced by an int32 `Value`, and the dataset is cast to that schema.
    """
    int_features = dataset.features.copy()
    int_features["is_duplicate"] = Value('int32')
    return dataset.cast(int_features)


# Apply the same label conversion to both fine-tuning splits.
train = cast_is_duplicate_to_int(train)
validation = cast_is_duplicate_to_int(validation)

Loading cached processed dataset at C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04\cache-0474a84a76505170.arrow
Loading cached processed dataset at C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04\cache-387810a1332fde55.arrow


In [6]:
# Inspect one raw example: a pair of question texts (with their ids) plus the is_duplicate label.
train[0]

{'questions': {'id': [1, 2],
  'text': ['What is the step by step guide to invest in share market in india?',
   'What is the step by step guide to invest in share market?']},
 'is_duplicate': 0}

## 3. Encoding + Preprocessing

In [7]:
# Load the tokenizer that belongs to the pretrained 'bert-base-cased' checkpoint.
pretrained_checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)

In [8]:
# Encode every training example as a sentence pair (question 1, question 2).
# Inputs are padded/truncated to 64 tokens as a trade-off between run time and
# how much of each question is kept.
# `map` is called without batched=True, so the function receives one example at a time.
encoded_train = train.map(
    lambda example: tokenizer(
        example['questions']['text'][0],
        example['questions']['text'][1],
        padding='max_length', truncation=True, max_length=64,
    )
)
# Rename the target column to "labels". The in-place `rename_column_` is deprecated
# (it emits a warning), so use `rename_column`, which returns a new dataset.
encoded_train = encoded_train.rename_column("is_duplicate", "labels")

Loading cached processed dataset at C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04\cache-c46265a14b9424db.arrow
  encoded_train.rename_column_("is_duplicate", "labels")


In [9]:
# Encode every validation example as a sentence pair, using the same
# padding/truncation settings (64 tokens) as the training split.
# `map` is called without batched=True, so the function receives one example at a time.
encoded_validation = validation.map(
    lambda example: tokenizer(
        example['questions']['text'][0],
        example['questions']['text'][1],
        padding='max_length', truncation=True, max_length=64,
    )
)
# Rename the target column to "labels". The in-place `rename_column_` is deprecated,
# so use `rename_column`, which returns a new dataset.
encoded_validation = encoded_validation.rename_column("is_duplicate", "labels")

  0%|          | 0/60643 [00:00<?, ?ex/s]

In [9]:
# Have both encoded splits return PyTorch tensors for the columns listed here.
model_columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]
for encoded_split in (encoded_train, encoded_validation):
    encoded_split.set_format("torch", columns=list(model_columns))

In [10]:
# Sanity check: print the tensor shape of every column for the first two
# validation examples, to confirm the format and that the sizes agree.
for sample_index in (0, 1):
    sample = encoded_validation[sample_index]
    print({column: tensor.shape for column, tensor in sample.items()})

{'labels': torch.Size([]), 'input_ids': torch.Size([64]), 'token_type_ids': torch.Size([64]), 'attention_mask': torch.Size([64])}
{'labels': torch.Size([]), 'input_ids': torch.Size([64]), 'token_type_ids': torch.Size([64]), 'attention_mask': torch.Size([64])}


In [11]:
# Inspect one encoded training example: the label plus the tokenizer outputs as tensors.
encoded_train[1]

{'labels': tensor(0),
 'input_ids': tensor([  101,  1327,  1110,  1103,  1642,  1104, 19892, 21918,  1766,   113,
         19892,  1324,   118,   178,   118,  1302,  1766,   114,  8549,   136,
           102,  1327,  1156,  3333,  1191,  1103,  1890,  1433, 10566,  1103,
         19892, 21918,  1766,   113, 19892,  1324,   118,   178,   118,  1302,
          1766,   114,  9883,  1171,   136,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 

## 4. Fine-tune

In [12]:
# Start from the pretrained bert-base-cased weights and attach a sequence
# classification head with two labels.
classifier_options = {"num_labels": 2}
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", **classifier_options)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# Optional custom training configuration.
# NOTE(review): the Trainer built in the training cell of this notebook is created
# without `args=`, so these values only take effect if `args=training_args` is
# passed to it.
training_options = dict(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=2,   # batch size per device during training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,               # log every 500 steps
)
training_args = TrainingArguments(**training_options)

In [13]:
# Free memory before training starts.
import gc

# Run Python garbage collection first ...
gc.collect()

# ... then clear PyTorch's GPU cache.
torch.cuda.empty_cache()

In [14]:
# Fine-tune the classifier. No TrainingArguments are passed, so the Trainer
# uses its defaults, including its default output directory for checkpoints.
trainer = Trainer(model=model, train_dataset=encoded_train, eval_dataset=encoded_validation)

# Resume from the newest checkpoint when one exists, to save time on re-runs;
# otherwise train from scratch. Passing resume_from_checkpoint=True
# unconditionally raises on a fresh run because no checkpoint is found.
checkpoint_dir = trainer.args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None

trainer.train(resume_from_checkpoint=last_checkpoint)

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: questions.
***** Running training *****
  Num examples = 141502
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 53064


Step,Training Loss
500,0.5427
1000,0.4833
1500,0.4848
2000,0.4647
2500,0.4533
3000,0.4524
3500,0.4378
4000,0.4462
4500,0.4321
5000,0.4344


Saving model checkpoint to tmp_trainer\checkpoint-500
Configuration saved in tmp_trainer\checkpoint-500\config.json
Model weights saved in tmp_trainer\checkpoint-500\pytorch_model.bin
Saving model checkpoint to tmp_trainer\checkpoint-1000
Configuration saved in tmp_trainer\checkpoint-1000\config.json
Model weights saved in tmp_trainer\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to tmp_trainer\checkpoint-1500
Configuration saved in tmp_trainer\checkpoint-1500\config.json
Model weights saved in tmp_trainer\checkpoint-1500\pytorch_model.bin
Saving model checkpoint to tmp_trainer\checkpoint-2000
Configuration saved in tmp_trainer\checkpoint-2000\config.json
Model weights saved in tmp_trainer\checkpoint-2000\pytorch_model.bin
Saving model checkpoint to tmp_trainer\checkpoint-2500
Configuration saved in tmp_trainer\checkpoint-2500\config.json
Model weights saved in tmp_trainer\checkpoint-2500\pytorch_model.bin
Saving model checkpoint to tmp_trainer\checkpoint-3000
Configuration

TrainOutput(global_step=53064, training_loss=0.33797427717600226, metrics={'train_runtime': 10693.7124, 'train_samples_per_second': 39.697, 'train_steps_per_second': 4.962, 'total_flos': 1.396152770833152e+16, 'train_loss': 0.33797427717600226, 'epoch': 3.0})

In [15]:
# Persist the fine-tuned model (config and weights) to the "bert_qqp" directory.
fine_tuned_dir = "bert_qqp"
model.save_pretrained(fine_tuned_dir)

Configuration saved in bert_qqp\config.json
Model weights saved in bert_qqp\pytorch_model.bin
loading configuration file bert_qqp\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.9.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file bert_qqp\pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

Al

## 5. Utilize the other half of QQP for evaluation 

In [10]:
# The second half of QQP is held out for evaluation, split with the same
# proportions as the fine-tuning half (35% + 15% of all rows).
evaluation_splits = {'train': 'train[50%:85%]', 'validation': 'train[85%:100%]'}
train_eval = load_dataset("quora", split=evaluation_splits['train'])
validation_eval = load_dataset("quora", split=evaluation_splits['validation'])

Using custom data configuration default
Reusing dataset quora (C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)
Using custom data configuration default
Reusing dataset quora (C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)


In [11]:
# Same label preprocessing as for the fine-tuning splits: cast the
# 'is_duplicate' column of both evaluation splits to int32.
train_eval_schema = train_eval.features.copy()
train_eval_schema["is_duplicate"] = Value('int32')
train_eval = train_eval.cast(train_eval_schema)

validation_eval_schema = validation_eval.features.copy()
validation_eval_schema["is_duplicate"] = Value('int32')
validation_eval = validation_eval.cast(validation_eval_schema)

Loading cached processed dataset at C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04\cache-58ce8a2d3d13bc51.arrow
Loading cached processed dataset at C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04\cache-077de9dbe62a5b12.arrow


In [12]:
# Re-create the 'bert-base-cased' tokenizer (same checkpoint as used when encoding the fine-tuning data).
# NOTE(review): presumably repeated so this section can run in a fresh kernel — confirm.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [13]:
# Encode the evaluation-half training split as sentence pairs, with the same
# padding/truncation settings (64 tokens) used for fine-tuning.
# `map` is called without batched=True, so the function receives one example at a time.
encoded_train_eval = train_eval.map(
    lambda example: tokenizer(
        example['questions']['text'][0],
        example['questions']['text'][1],
        padding='max_length', truncation=True, max_length=64,
    )
)
# The in-place `rename_column_` is deprecated; `rename_column` returns a new dataset.
encoded_train_eval = encoded_train_eval.rename_column("is_duplicate", "labels")

# Encode the evaluation-half validation split the same way.
encoded_validation_eval = validation_eval.map(
    lambda example: tokenizer(
        example['questions']['text'][0],
        example['questions']['text'][1],
        padding='max_length', truncation=True, max_length=64,
    )
)
encoded_validation_eval = encoded_validation_eval.rename_column("is_duplicate", "labels")

Loading cached processed dataset at C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04\cache-aec8ebd6b027502c.arrow


  0%|          | 0/60644 [00:00<?, ?ex/s]

## 6. Evaluation

In [24]:
# Reload the fine-tuned classifier from the "bert_qqp" directory.
saved_model_dir = "bert_qqp"
model_qqp = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)

In [25]:
# Metric object used by compute_accuracy to score predictions.
accuracy = load_metric(path="accuracy")

def compute_accuracy(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Two-element iterable of ``(logits, labels)``. The predicted
            class for each row is the index of its largest logit along the
            last axis.

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions and labels.
    """
    scores, gold = eval_pred
    predicted = np.asarray(scores).argmax(axis=-1)
    return accuracy.compute(predictions=predicted, references=gold)

In [26]:
# Metric object used by compute_f1 to score predictions.
f1 = load_metric(path="f1")

def compute_f1(eval_pred):
    """Compute the F1 score for a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Two-element iterable of ``(logits, labels)``. The predicted
            class for each row is the index of its largest logit along the
            last axis.

    Returns:
        Whatever ``f1.compute`` returns for those predictions and labels.
    """
    scores, gold = eval_pred
    predicted = np.asarray(scores).argmax(axis=-1)
    return f1.compute(predictions=predicted, references=gold)

### evaluate with accuracy

In [27]:
# Wrap the fine-tuned model in a Trainer whose metric callback is compute_accuracy,
# then evaluate on the encoded held-out validation split.
accuracy_trainer_kwargs = dict(
    model=model_qqp,                       # the fine-tuned model loaded from disk
    train_dataset=encoded_train_eval,
    eval_dataset=encoded_validation_eval,
    compute_metrics=compute_accuracy,
)
trainer_qqp = Trainer(**accuracy_trainer_kwargs)

trainer_qqp.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: questions.
***** Running Evaluation *****
  Num examples = 60644
  Batch size = 8


{'eval_loss': 0.4974234700202942,
 'eval_accuracy': 0.8747279203218785,
 'eval_runtime': 2049.5477,
 'eval_samples_per_second': 29.589,
 'eval_steps_per_second': 3.699}

### evaluate with f1 score

In [28]:
# Rebuild the Trainer with compute_f1 as the metric callback and evaluate the
# same fine-tuned model on the same held-out validation split.
f1_trainer_kwargs = dict(
    model=model_qqp,                       # the fine-tuned model loaded from disk
    train_dataset=encoded_train_eval,
    eval_dataset=encoded_validation_eval,
    compute_metrics=compute_f1,
)
trainer_qqp = Trainer(**f1_trainer_kwargs)

trainer_qqp.evaluate()

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: questions.
***** Running Evaluation *****
  Num examples = 60644
  Batch size = 8


{'eval_loss': 0.4974234700202942,
 'eval_f1': 0.829407405744055,
 'eval_runtime': 257.5134,
 'eval_samples_per_second': 235.498,
 'eval_steps_per_second': 29.439}