# Using SupCS for Training GLUE Tasks

## Dependencies

Install the `torch`, `tensorflow` and `SupCL-Seq` packages using pip. The cells below also import `transformers`, `datasets`, `numpy` and `scikit-learn`.

In [1]:
#!pip install datasets numpy 
#!pip install -U scikit-learn

In [2]:
from datasets import load_dataset, load_metric
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModel,AutoModelForSequenceClassification
#----for roberta-----#
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from sklearn.metrics import classification_report
import warnings
import numpy as np

from SupCL_Seq import SupCsTrainer

warnings.filterwarnings('ignore')

## GLUE Tasks

In [3]:
# Names accepted for `task` below.
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]

# GLUE task to train on; must be one of GLUE_TASKS.
task = "cola"
# Checkpoint to start from. Alternatives: "nghuyong/ernie-2.0-large-en", "roberta-base".
model_name = "bert-base-uncased"

# Fail fast on a misspelled task instead of surfacing later as a dataset-loading
# error or a KeyError in the per-task lookups.
if task not in GLUE_TASKS:
    raise ValueError(f"Unknown GLUE task {task!r}; expected one of {GLUE_TASKS}")

# "mnli-mm" is loaded under the "mnli" name; only the validation split chosen
# later in the notebook differs.
actual_task = "mnli" if task == "mnli-mm" else task
# Fetch the GLUE data for the selected task together with its matching metric.
dataset = load_dataset(path="glue", name=actual_task)
metric = load_metric(path='glue', config_name=actual_task)

# Tokenizer and base model (AutoModel, i.e. without a classification head) for
# the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModel.from_pretrained(model_name)

# For each task: the dataset column(s) that hold the input text. The second
# entry is None when the task has a single text column.
task_to_keys = dict([
    ("cola", ("sentence", None)),
    ("mnli", ("premise", "hypothesis")),
    ("mnli-mm", ("premise", "hypothesis")),
    ("mrpc", ("sentence1", "sentence2")),
    ("qnli", ("question", "sentence")),
    ("qqp", ("question1", "question2")),
    ("rte", ("sentence1", "sentence2")),
    ("sst2", ("sentence", None)),
    ("stsb", ("sentence1", "sentence2")),
    ("wnli", ("sentence1", "sentence2")),
])


# Look up which dataset column(s) contain the text for the selected task;
# sentence2_key is None for tasks with a single text column.
sentence1_key, sentence2_key = task_to_keys[task]

def preprocess_function(examples):
    """Tokenize the text column(s) of `examples` for the selected task.

    Reads the module-level `sentence1_key` / `sentence2_key` to decide which
    columns to pass to the module-level `tokenizer`: one column when
    `sentence2_key` is None, otherwise both columns as a pair. Truncation is
    always enabled. Returns whatever the tokenizer returns.
    """
    if sentence2_key is None:
        columns = [sentence1_key]
    else:
        columns = [sentence1_key, sentence2_key]
    texts = [examples[column] for column in columns]
    return tokenizer(*texts, truncation=True)

# Apply preprocess_function across the dataset in batches (batched=True), so
# each call receives a batch of examples rather than a single one.
encoded_dataset = dataset.map(preprocess_function, batched=True)

Reusing dataset glue (/home/hooman_sedghamiz/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Loading cached processed dataset at /home/hooman_sedghamiz/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db7

## Custom Metric

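We employ a task-dependent metric: the GLUE metric for the selected task, computed on the argmax of the model outputs (or on the raw score for STS-B).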

In [4]:
def compute_metrics(eval_pred):
    """Score model outputs with the metric loaded for the selected task.

    `eval_pred` unpacks into (model outputs, reference labels). For every
    task except "stsb" the outputs are reduced with argmax over axis 1; for
    "stsb" the first column is used as-is. The result of the module-level
    `metric.compute(...)` is returned unchanged.
    """
    model_outputs, references = eval_pred
    if task == "stsb":
        predictions = model_outputs[:, 0]
    else:
        predictions = np.argmax(model_outputs, axis=1)
    return metric.compute(predictions=predictions, references=references)

In [5]:
# Pick the evaluation split: MNLI has separate matched / mismatched validation
# sets, every other task uses plain "validation".
if task == "mnli-mm":
    validation_key = "validation_mismatched"
elif task == "mnli":
    validation_key = "validation_matched"
else:
    validation_key = "validation"

train_dataset = encoded_dataset["train"]
# Note: despite its name, this is the validation split selected above.
test_dataset = encoded_dataset[validation_key]


## Training Argument From Huggingface

In [6]:
# Hyper-parameters for the first (SupCL) training stage. Evaluation is switched
# off for this stage.
CL_args = TrainingArguments(
    # Output locations and logging.
    output_dir='./results',
    logging_dir='./logs',
    report_to='tensorboard',
    logging_steps=50,
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=32,
    learning_rate=5e-05,
    warmup_steps=50,
    weight_decay=0.01,
    # No evaluation during this stage.
    evaluation_strategy='no',
)

## SupCL-Trainer

This works exactly like the `Trainer` from Hugging Face. We first run CS training and then save the model.


In [7]:
# Build the SupCL trainer. The last six keyword arguments are the same ones the
# fine-tuning Trainer further down receives; the first four are specific to
# SupCsTrainer (their exact semantics are defined by the SupCL_Seq package).
SupCL_trainer = SupCsTrainer.SupCsTrainer(
    # SupCsTrainer-specific settings.
    w_drop_out=[0.0, 0.1, 0.2],
    def_drop_out=0.1,
    temperature=0.05,
    pooling_strategy='pooler',
    # Model, arguments and data.
    model=model,
    args=CL_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Employing pooler ([CLS]) output.


In [8]:
# Run the first-stage training, then write the resulting model to
# ./cs_baseline, the directory the fine-tuning stage loads from.
SupCL_trainer.train()
SupCL_trainer.save_model('./cs_baseline')

The following columns in the training set  don't have a corresponding argument in `BertModel.forward` and have been ignored: idx, sentence.
***** Running training *****
  Num examples = 8551
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 201


Step,Training Loss
50,4.5533
100,4.4389
150,4.3768
200,4.2691




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./cs_baseline
Configuration saved in ./cs_baseline/config.json
Model weights saved in ./cs_baseline/pytorch_model.bin
tokenizer config file saved in ./cs_baseline/tokenizer_config.json
Special tokens file saved in ./cs_baseline/special_tokens_map.json


## Only FineTune a Linear Layer

After CS training, we add a single linear layer on top of the pretrained base model, freeze the base model's parameters, and fine-tune only the linear layer on the data using cross-entropy.

In [9]:
# Directory written by the first training stage (a checkpoint directory such
# as "./results/checkpoint-500/" can be used instead).
model_name = './cs_baseline'

# Output size of the classification layer: 1 for stsb, 3 for mnli, 2 otherwise.
if actual_task == 'stsb':
    num_labels = 1
elif actual_task == 'mnli':
    num_labels = 3
else:
    num_labels = 2

# Add the classification layer on top of the saved model.
# (RoBERTa-specific alternative: RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Freeze the base model so that only the newly added layer is trained.
for param in model.base_model.parameters():
    param.requires_grad = False

loading configuration file ./cs_baseline/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.10.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ./cs_baseline/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./cs_baseline and are newl

In [10]:
# Hyper-parameters for the second stage (fine-tuning the classification layer
# on top of the frozen base model).
#
# Evaluation runs once per epoch. The former `eval_steps=200` argument was
# removed: it only takes effect with evaluation_strategy='steps', so here it
# was dead configuration that suggested a step-based schedule that never ran.
args = TrainingArguments(
    # Output locations and logging.
    output_dir='./results',
    logging_dir='./logs',
    report_to='tensorboard',
    logging_steps=200,
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=5,
    per_device_train_batch_size=28,
    learning_rate=1e-03,
    warmup_steps=50,
    weight_decay=0.01,
    # Evaluation.
    evaluation_strategy='epoch',
    per_device_eval_batch_size=64,
)

PyTorch: setting up devices


In [11]:
# Standard Hugging Face Trainer for the fine-tuning stage; `model` here is the
# sequence-classification model whose base parameters were frozen above.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
# Run fine-tuning. As the last expression in the cell, the returned TrainOutput
# is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence.
***** Running training *****
  Num examples = 8551
  Num Epochs = 5
  Instantaneous batch size per device = 28
  Total train batch size (w. parallel, distributed & accumulation) = 112
  Gradient Accumulation steps = 1
  Total optimization steps = 385


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.547718,0.589244
2,No log,0.49107,0.616493
3,0.190200,0.489039,0.617881
4,0.190200,0.508434,0.599641
5,0.190200,0.498494,0.605467


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence.
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 256
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence.
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 256
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence.
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 256
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: idx, sentence.
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 256
The following columns in the evaluation set  don't have a corres

TrainOutput(global_step=385, training_loss=0.17174928343141233, metrics={'train_runtime': 50.2507, 'train_samples_per_second': 850.835, 'train_steps_per_second': 7.662, 'total_flos': 599215900808760.0, 'train_loss': 0.17174928343141233, 'epoch': 5.0})