## Add simple customized layer on top of pretrained models

- we start with adding a simple classification layer - let's try use twitter emo data for base layer and train a sentiment classification layer 
- follow https://jovian.ai/rajbsangani/emotion-tuned-sarcasm
- and follw : https://github.com/huggingface/transformers/blob/v4.20.1/src/transformers/models/bert/modeling_bert.py#L1510

In [1]:
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from datasets import load_metric
import torch
import torch.nn as nn
import pandas as pd
import config
import numpy as np
import os

In [2]:
## Load data 
data=load_dataset("json",data_files="/media/chengyu/Elements1/HuggingFace/Data/Sarcasm/Sarcasm_Headlines_Dataset.json")
data=data.rename_column("is_sarcastic","label")
data=data.remove_columns(['article_link'])
data

Using custom data configuration default-c4cc714ae0f5b0ee
Reusing dataset json (/home/chengyu/.cache/huggingface/datasets/json/default-c4cc714ae0f5b0ee/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 26709
    })
})

In [3]:
## Simple data processing
data.set_format('pandas')
data=data['train'][:]
data.drop_duplicates(subset=['headline'],inplace=True)
data=data.reset_index()[['headline','label']]
data=Dataset.from_pandas(data)
# 80% train, 20% test + validation
train_testvalid = data.train_test_split(test_size=0.2,seed=15)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)
# gather everyone if you want to have a single DatasetDict
data = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

data

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 21281
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2661
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2660
    })
})

- ### Load pretrained model

In [4]:
checkpoint = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512

- Tokenize Data

In [5]:
def tokenize(batch):
  return tokenizer(batch["headline"], truncation=True,max_length=512)

tokenized_dataset = data.map(tokenize, batched=True)
tokenized_dataset



  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 21281
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2661
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2660
    })
})

In [6]:
## formate for classifciation 
tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "label"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
test_data = tokenized_dataset['train'][0:2]
test_data

{'label': tensor([0, 0]),
 'input_ids': tensor([[    0,   627,    78,  1368,   947,   119,  3023, 17832, 17894,   637,
           3156,    32,  1747,   259,     2],
         [    0, 30975,  9709,  8253, 21000,     6,   151, 17029,     9,   514,
              7, 12280,  9932,  1680,     2]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [8]:
model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, 
                                                                               output_attentions=True,
                                                                               output_hidden_states=True))
model_output = model(input_ids=test_data['input_ids'],attention_mask = test_data['attention_mask'])
model_output.keys()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions'])

In [9]:
print(model_output['last_hidden_state'].size())
print(model_output['pooler_output'].size())
print('cls token size = {}'.format(model_output['last_hidden_state'][:,0,:].size()))

torch.Size([2, 15, 768])
torch.Size([2, 768])
cls token size = torch.Size([2, 768])


### Set up customized model layer

In [10]:
class CustomModel(nn.Module):
  def __init__(self,checkpoint,num_labels): 
    super(CustomModel,self).__init__() 
    self.num_labels = num_labels 

    #Load Model with given checkpoint and extract its body
    self.model = model = AutoModel.from_pretrained(checkpoint,
                                                   config=AutoConfig.from_pretrained(checkpoint, 
                                                                                     output_attentions=True,
                                                                                     output_hidden_states=True))
    self.dropout = nn.Dropout(0.1) 
    self.classifier = nn.Linear(768,num_labels) # load and initialize weights

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    ## you can either use pooled output; it is more common for classification tasks
    logits = self.classifier(outputs[1])#.view(-1,768))  1 is pooled output 
    
#     ## or use CLS token 
#     sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state
#     logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # calculate losses
    
    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    
    return TokenClassifierOutput(loss=loss, logits=logits)#, hidden_states=outputs.hidden_states,attentions=outputs.attentions)
## to play well with trainer, out put must not have a token level dimension 
## https://discuss.huggingface.co/t/passing-the-tokenizer-to-trainer-for-bucketing-does-not-work-for-evaluation-set/1687/3




In [11]:
model=CustomModel(checkpoint=checkpoint,num_labels=2)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [12]:
training_args = TrainingArguments(output_dir=os.path.join(config.data_folder,"test_trainer"),
#                                    evaluation_strategy="epoch",
                                   evaluation_strategy="steps",
                                   eval_steps=500,
                                   logging_steps =500,          ## show eval results
                                   learning_rate=5e-5,
                                   per_device_train_batch_size=8,
                                   per_device_eval_batch_size=8,
                                   num_train_epochs=5,
                                   weight_decay=0.4, ## wd regularizor, usually a very small number as additional weight penality
                                   save_steps=500,
                                   load_best_model_at_end=True, ## only save and load best model
                                   save_total_limit = 1,        ## only save one checkpoint
                                   seed=42)  
## load evaluation matric
metric = load_metric("accuracy")
## define a evaluation function : here it is a simple accuracy 
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The following columns in the training set don't have a corresponding argument in `CustomModel.forward` and have been ignored: headline. If headline are not expected by `CustomModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 21281
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6655
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mchuang16[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Step,Training Loss,Validation Loss,Accuracy
500,0.3407,0.409014,0.902292
1000,0.2879,0.20122,0.926343
1500,0.2152,0.305983,0.918452
2000,0.1659,0.219458,0.92484
2500,0.1604,0.350654,0.92484
3000,0.0996,0.405315,0.922585
3500,0.0904,0.339696,0.919579
4000,0.0717,0.333219,0.937242
4500,0.0417,0.344398,0.933484
5000,0.0332,0.37033,0.936866


The following columns in the evaluation set don't have a corresponding argument in `CustomModel.forward` and have been ignored: headline. If headline are not expected by `CustomModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2661
  Batch size = 16
Saving model checkpoint to /media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in /media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in /media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [/media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-1000] due to args.save_total_limit
Deleting older checkpoint [/media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-3500] due to args.save_total_limit
The following columns in the evaluation set don't have 

Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in /media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-4000/tokenizer_config.json
Special tokens file saved in /media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-4000/special_tokens_map.json
Deleting older checkpoint [/media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-3500] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `CustomModel.forward` and have been ignored: headline. If headline are not expected by `CustomModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2661
  Batch size = 16
Saving model checkpoint to /media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-4500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in /media/chengyu/Elements1/HuggingFace/test_trainer/checkpoint-4500/tokenize

TrainOutput(global_step=6655, training_loss=0.11749523188457016, metrics={'train_runtime': 2415.1824, 'train_samples_per_second': 44.057, 'train_steps_per_second': 2.755, 'total_flos': 0.0, 'train_loss': 0.11749523188457016, 'epoch': 5.0})