# BERT vs CL-BERT w/o Fine Tune

##  Import packages

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,BertForSequenceClassification
from datasets import load_dataset, ClassLabel, Value, load_metric

##  Load evaluation dataset + preprocessing

In [3]:
# sampling the dataset for evaluation
train_eval = load_dataset("quora", split = 'train[50%:85%]')
validation_eval = load_dataset("quora", split = 'train[85%:100%]')

# repeat all the preprocessing steps above
new_features = train_eval.features.copy()
new_features["is_duplicate"] = Value('int32')
train_eval = train_eval.cast(new_features)

new_features = validation_eval.features.copy()
new_features["is_duplicate"] = Value('int32')
validation_eval = validation_eval.cast(new_features)

Using custom data configuration default
Reusing dataset quora (C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)
Using custom data configuration default
Reusing dataset quora (C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)
Loading cached processed dataset at C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04\cache-58ce8a2d3d13bc51.arrow
Loading cached processed dataset at C:\Users\52673\.cache\huggingface\datasets\quora\default\0.0.0\36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04\cache-077de9dbe62a5b12.arrow


## Load the metric module to compute accuracy and f1 score

In [4]:
accuracy = load_metric("accuracy")
f1 = load_metric("f1")

def compute_accuracy(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

def compute_f1(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return f1.compute(predictions=predictions, references=labels)

# Model Comparison

## BERT

In [5]:
# load the tokenizer and model pretrained with bert
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = AutoModelForSequenceClassification.from_pretrained('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [6]:
# encode the training dataset in the form of sentences pair
# truncate at length=64 for a balance of time consuming and information coverage
encoded_train_eval = train_eval.map(lambda batch: bert_tokenizer(batch['questions']['text'][0], batch['questions']['text'][1], 
                                                  padding='max_length', truncation=True, max_length=64))
encoded_train_eval.rename_column_("is_duplicate", "labels")

# encode the validation dataset in the form of sentences pair
encoded_validation_eval = validation_eval.map(lambda batch: bert_tokenizer(batch['questions']['text'][0], batch['questions']['text'][1], 
                                                            padding='max_length', truncation=True, max_length=64))
encoded_validation_eval.rename_column_("is_duplicate", "labels")

100%|██████████| 141501/141501 [00:33<00:00, 4170.02ex/s]
  encoded_train_eval.rename_column_("is_duplicate", "labels")
100%|██████████| 60644/60644 [00:14<00:00, 4287.61ex/s]


### Accuracy for BERT

In [7]:
trainer_bert = Trainer(
    model=bert,   
    train_dataset=encoded_train_eval,
    eval_dataset=encoded_validation_eval,
    compute_metrics=compute_accuracy,
)

trainer_bert.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: questions.
***** Running Evaluation *****
  Num examples = 60644
  Batch size = 8
100%|██████████| 7581/7581 [04:28<00:00, 28.24it/s]


{'eval_loss': 0.6913312673568726,
 'eval_accuracy': 0.4584625024734516,
 'eval_runtime': 269.5438,
 'eval_samples_per_second': 224.988,
 'eval_steps_per_second': 28.125}

### F1 for BERT

In [8]:
trainer_bert = Trainer(
    model=bert,    
    train_dataset=encoded_train_eval,
    eval_dataset=encoded_validation_eval,
    compute_metrics=compute_f1,
)

trainer_bert.evaluate()

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: questions.
***** Running Evaluation *****
  Num examples = 60644
  Batch size = 8
100%|██████████| 7581/7581 [04:27<00:00, 28.36it/s]


{'eval_loss': 0.6913312673568726,
 'eval_f1': 0.5665641621243517,
 'eval_runtime': 267.3371,
 'eval_samples_per_second': 226.845,
 'eval_steps_per_second': 28.357}

## CL-BERT

In [9]:
# load the tokenizer and model pretrained with CL-bert
cl_tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
cl = BertForSequenceClassification.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")

loading configuration file https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased/resolve/main/config.json from cache at C:\Users\52673/.cache\huggingface\transformers\886dba277a27c6ab50ab3d0bfd8839d354cfeed717289623026415c62b687338.1b14bcddba43d86a607eedb4b638b87d30aa00c839358953dbd36f2cd3317c83
Model config BertConfig {
  "_name_or_path": "result/bert-base-uncased-cls_before_pooler-sym_mlp-mlp_bert-bs64-gpu8-gs1-lr5e-5-m=stsb-norm0.05-l32-contra",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 2,
  "use_cache": true,
  

In [10]:
encoded_train_eval = train_eval.map(lambda batch: cl_tokenizer(batch['questions']['text'][0], batch['questions']['text'][1], 
                                                  padding='max_length', truncation=True, max_length=64))
encoded_train_eval.rename_column_("is_duplicate", "labels")

# encode the validation dataset in the form of sentences pair
encoded_validation_eval = validation_eval.map(lambda batch: cl_tokenizer(batch['questions']['text'][0], batch['questions']['text'][1], 
                                                            padding='max_length', truncation=True, max_length=64))
encoded_validation_eval.rename_column_("is_duplicate", "labels")

100%|██████████| 141501/141501 [00:35<00:00, 3972.36ex/s]
100%|██████████| 60644/60644 [00:14<00:00, 4101.05ex/s]


### Accuracy for CL-BERT

In [11]:
trainer_cl = Trainer(
    model=cl,    
    train_dataset=encoded_train_eval,
    eval_dataset=encoded_validation_eval,
    compute_metrics=compute_accuracy,
)

trainer_cl.evaluate()

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: questions.
***** Running Evaluation *****
  Num examples = 60644
  Batch size = 8
100%|██████████| 7581/7581 [04:30<00:00, 27.98it/s]


{'eval_loss': 0.7091943025588989,
 'eval_accuracy': 0.4153584855880219,
 'eval_runtime': 271.0187,
 'eval_samples_per_second': 223.763,
 'eval_steps_per_second': 27.972}

### F1 for CL-BERT

In [13]:
trainer_cl = Trainer(
    model=cl,   
    train_dataset=encoded_train_eval,
    eval_dataset=encoded_validation_eval,
    compute_metrics=compute_f1,
)

trainer_cl.evaluate()

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: questions.
***** Running Evaluation *****
  Num examples = 60644
  Batch size = 8
100%|██████████| 7581/7581 [04:43<00:00, 26.75it/s]


{'eval_loss': 0.7091943025588989,
 'eval_f1': 0.4639968554885331,
 'eval_runtime': 283.4258,
 'eval_samples_per_second': 213.968,
 'eval_steps_per_second': 26.748}