In [None]:
# Install the notebook's third-party dependencies (versions are not pinned here).
!pip install transformers
!pip install pyarrow
!pip install nlp
!pip install captum

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, AutoConfig,Trainer, TrainingArguments)
import nlp
from torch.utils.data import Dataset
from transformers import InputFeatures

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda', index=0)

In [None]:
# Hub identifier of the pretrained checkpoint used for both the tokenizer and the model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Read the train/test CSV files (replace the placeholder paths with real ones).
train = pd.read_csv('YOUR_PATH_HERE.csv')
test = pd.read_csv('YOUR_PATH_HERE.csv')

# Pull the tweet text and the binary sarcasm label out as plain Python lists.
train_tweets, train_labels = (train[column].values.tolist() for column in ('tweet', 'sarcastic'))
test_tweets, test_labels = (test[column].values.tolist() for column in ('tweet', 'sarcastic'))

In [None]:
# Fixed token length that the train/test tweets are padded or truncated to.
MAX_LEN = 256

In [None]:
# Tokenize every training tweet into fixed-length (MAX_LEN) PyTorch tensors.
train_tokenizer_options = dict(
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='pt',
)
train_encodings = tokenizer(train_tweets, **train_tokenizer_options)

In [None]:
# Tokenize every test tweet into fixed-length (MAX_LEN) PyTorch tensors.
test_tokenizer_options = dict(
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=MAX_LEN,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='pt',
)
test_encodings = tokenizer(test_tweets, **test_tokenizer_options)

In [None]:
class SarcasmDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` must be indexable by the keys 'input_ids', 'token_type_ids'
    and 'attention_mask'; ``labels`` is a sequence with one entry per example.
    """

    # Encoding fields copied into every returned InputFeatures object.
    _FEATURE_KEYS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice out example `idx` from each encoding field, then attach its label.
        fields = {key: self.encodings[key][idx] for key in self._FEATURE_KEYS}
        return InputFeatures(label=self.labels[idx], **fields)

In [None]:
# Wrap the tokenized tweets and their labels so they can be handed to the Trainer.
train_dataset = SarcasmDataset(train_encodings, train_labels)
test_dataset = SarcasmDataset(test_encodings, test_labels)

In [None]:
# Load the pretrained checkpoint with a 2-way classification head.
# ignore_mismatched_sizes=True lets loading proceed although the checkpoint's
# classifier layer has a different output size; that layer is newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(MODEL,num_labels=2,ignore_mismatched_sizes=True)

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p):
  """Compute accuracy and F1 for an evaluation pass.

  Args:
    p: a pair ``(scores, labels)`` where ``scores`` holds one row of per-class
      scores per example and ``labels`` the reference class ids.

  Returns:
    dict with the keys "accuracy" and "f1_score".
  """
  scores, labels = p
  # The predicted class is the column with the highest score.
  predicted = np.argmax(scores, axis=1)
  return {
      "accuracy": accuracy_score(y_true=labels, y_pred=predicted),
      "f1_score": f1_score(labels, predicted),
  }

def labels(x):
  """Binarize a value: 0 stays 0, anything else becomes 1."""
  return 0 if x == 0 else 1

In [None]:
# Fine-tuning configuration: evaluation runs on a step schedule and the best
# checkpoint is reloaded when training finishes (load_best_model_at_end=True).
# NOTE(review): no metric_for_best_model is given, so "best" is presumably judged by
# the library default — confirm this is the intended selection criterion.
training_args = TrainingArguments(
        output_dir='./res', 
        evaluation_strategy="steps", 
        num_train_epochs=15, 
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64, 
        warmup_steps=500, 
        weight_decay=0.01,
        logging_dir='./logs4',
        load_best_model_at_end=True,
    )

# NOTE(review): the test split is passed as eval_dataset, so checkpoint selection
# is made on the same data that trainer.evaluate() later reports on.
trainer = Trainer(
        model=model, 
        args=training_args, 
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

In [None]:
# Run fine-tuning with the Trainer configured for this notebook.
trainer.train()

***** Running training *****
  Num examples = 6934
  Num Epochs = 15
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3255
  Number of trainable parameters = 124647170


Step,Training Loss,Validation Loss,Accuracy,F1 Score
500,0.4449,0.47885,0.837143,0.415385
1000,0.1318,1.135723,0.780714,0.440801
1500,0.0443,1.192952,0.842143,0.39782
2000,0.016,2.120894,0.735714,0.395425
2500,0.0081,1.954182,0.766429,0.415027
3000,0.0043,1.819857,0.787857,0.414201


***** Running Evaluation *****
  Num examples = 1400
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-500
Configuration saved in ./res/checkpoint-500/config.json
Model weights saved in ./res/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-1000
Configuration saved in ./res/checkpoint-1000/config.json
Model weights saved in ./res/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-1500
Configuration saved in ./res/checkpoint-1500/config.json
Model weights saved in ./res/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1400
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-2000
Configuration saved in ./res/checkpoint-2000/config.json
Model weights saved in ./res/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num exampl

TrainOutput(global_step=3255, training_loss=0.09987629979803082, metrics={'train_runtime': 4466.7646, 'train_samples_per_second': 23.285, 'train_steps_per_second': 0.729, 'total_flos': 1.36830904339968e+16, 'train_loss': 0.09987629979803082, 'epoch': 15.0})

In [None]:
# Archive the step-500 checkpoint directory (paths are hard-coded under /content).
!zip -r /content/res/checkpoint-500.zip /content/res/checkpoint-500

  adding: content/res/checkpoint-500/ (stored 0%)
  adding: content/res/checkpoint-500/pytorch_model.bin (deflated 7%)
  adding: content/res/checkpoint-500/optimizer.pt (deflated 29%)
  adding: content/res/checkpoint-500/config.json (deflated 49%)
  adding: content/res/checkpoint-500/scheduler.pt (deflated 51%)
  adding: content/res/checkpoint-500/trainer_state.json (deflated 54%)
  adding: content/res/checkpoint-500/rng_state.pth (deflated 27%)
  adding: content/res/checkpoint-500/training_args.bin (deflated 48%)


In [None]:
# Copy the checkpoint archive to /content/drive/MyDrive.
# NOTE(review): assumes a drive is already mounted at /content/drive — confirm.
!cp /content/res/checkpoint-500.zip /content/drive/MyDrive

In [None]:
# Evaluate the model on the Trainer's eval_dataset (the test split) and show the metrics.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1400
  Batch size = 64


{'eval_loss': 0.36462289094924927,
 'eval_accuracy': 0.86,
 'eval_f1_score': 0.4869109947643979,
 'eval_runtime': 19.6822,
 'eval_samples_per_second': 71.13,
 'eval_steps_per_second': 1.118,
 'epoch': 1.0}

In [None]:
model.roberta.embeddings

RobertaEmbeddings(
  (word_embeddings): Embedding(50265, 768, padding_idx=1)
  (position_embeddings): Embedding(514, 768, padding_idx=1)
  (token_type_embeddings): Embedding(1, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
!nvidia-smi

Thu Oct 13 05:52:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    42W /  70W |  10986MiB / 15109MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from captum.attr import visualization as viz
from captum.attr import IntegratedGradients, LayerConductance, LayerIntegratedGradients, LayerActivation, Saliency
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer

from captum.attr import IntegratedGradients
from captum.attr import InterpretableEmbeddingBase, TokenReferenceBase
from captum.attr import visualization
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer

In [None]:
# Release cached GPU memory held by PyTorch before the attribution steps.
torch.cuda.empty_cache()

In [None]:
# Take the test row with index label 0 as the example to explain.
inp = test.loc[0]
text, label = inp.tweet, inp.sarcastic

In [None]:
# Encode the single example. Unlike the train/test encodings, no padding or
# truncation is requested here, so the sequence keeps the text's own token count.
text_encoding = tokenizer(text,
                          add_special_tokens=True,
                          return_token_type_ids=True,
                          return_attention_mask=True,
                          return_tensors='pt')

In [None]:
# Move the token ids to the compute device and record the sequence length.
input_ids  = text_encoding['input_ids'].to(device)
seq_length = text_encoding['input_ids'].size(1)
# One index per token position (0 .. seq_length - 1), created directly on `device`.
position_ids = torch.arange(seq_length, dtype=torch.long, device=device)

In [None]:
input_ids.shape

torch.Size([1, 36])

In [None]:
seq_length

36

In [None]:
position_ids  # sequential index of each token in the sentence
# This single example is encoded without padding, so the range is
# [0, seq_length - 1], not the [0, 255] range of the MAX_LEN-padded train/test encodings.

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
       device='cuda:0')

In [None]:
# Move the remaining tokenizer outputs to the compute device.
attention_mask = text_encoding['attention_mask'].to(device)
token_type_ids = text_encoding['token_type_ids'].to(device)
# NOTE(review): position_ids was already created on `device`, so this looks like a no-op.
position_ids   = position_ids.to(device)

In [None]:
# Map the example's token ids back to token strings (shown later in the visualization).
indices = input_ids[0].detach().tolist()
all_tokens = tokenizer.convert_ids_to_tokens(indices)

In [None]:
def construct_input_ref(text, ref_token_id, sep_token_id, cls_token_id):
  """Build the baseline (reference) token ids used for attribution.

  The baseline keeps the CLS and SEP tokens at both ends and fills every position
  in between with ``ref_token_id``. Its length is tied to the notebook-level
  ``seq_length`` so that it lines up with the encoded input.

  Args:
    text: raw input string whose tokenized length determines the filler count.
    ref_token_id: token id used for every non-special position.
    sep_token_id: token id appended at the end.
    cls_token_id: token id prepended at the start.

  Returns:
    Tensor of shape (1, seq_length) on ``device`` holding the reference ids.
  """
  # Pad AND truncate to exactly seq_length - 2 tokens. Without truncation a longer
  # text would yield a baseline whose shape no longer matches the input.
  text_ids = tokenizer.encode(text, add_special_tokens=False, max_length=seq_length-2,
                              padding='max_length', truncation=True)
  # Only the number of tokens matters here: each one is replaced by the reference id.
  ref_input_ids = [cls_token_id] + [ref_token_id] * len(text_ids) + [sep_token_id]
  return torch.tensor([ref_input_ids], device=device)

In [None]:
ref_token_id = tokenizer.pad_token_id # Padding token, used to fill the reference (baseline) sequence
sep_token_id = tokenizer.sep_token_id # Separator token placed at the end of the sequence
cls_token_id = tokenizer.cls_token_id # Start token placed at the beginning of the sequence
ref_input_ids = construct_input_ref(text, ref_token_id, sep_token_id, cls_token_id)
ref_input_ids.shape, input_ids.shape

(torch.Size([1, 36]), torch.Size([1, 36]))

In [None]:
def predict(input_ids, attention_mask=None):
  """Return the class-0 probability for every sequence in the batch.

  This is the forward function handed to Captum, which evaluates many
  interpolated inputs in one batched call and needs one output value per example.

  Args:
    input_ids: token-id tensor; assumed to be (batch, seq_len) as produced by
      the tokenizer.
    attention_mask: optional mask matching ``input_ids``.

  Returns:
    1-D tensor of length ``batch`` with the softmax probability of class index 0.
    For a single-example batch this is a one-element tensor.
  """
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
  # Keep the batch dimension. Indexing [0][0] would return only the first example's
  # probability, so gradients for every other row of the batch would be lost.
  # Use [:, 1] instead to attribute with respect to class index 1.
  return torch.softmax(outputs.logits, dim=1)[:, 0]

In [None]:
#y = model(input_ids = input_ids,attention_mask = attention_mask)

In [None]:
#y.logits

In [None]:
#preds = torch.softmax(y.logits, dim = 1)[0][1].unsqueeze(0)

In [None]:
# Attribute predict's output with respect to the RoBERTa embedding layer.
lig = LayerIntegratedGradients(predict,model.roberta.embeddings)

In [None]:
# Integrate gradients from the reference sequence to the real input in 50 steps.
# attention_mask is passed through to predict as an extra forward argument, and
# return_convergence_delta=True makes the call return the delta next to the attributions.
attributions, delta = lig.attribute(inputs=input_ids,
                                    baselines=ref_input_ids,
                                    additional_forward_args=(attention_mask,),
                                    #target=0,
                                    return_convergence_delta=True,
                                    n_steps=50,
                                   )

In [None]:
attributions

tensor([[[-0.0000e+00, -0.0000e+00,  0.0000e+00,  ..., -0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 2.1797e-07, -2.9970e-07, -2.2429e-07,  ...,  5.6559e-08,
           9.7332e-09,  4.6044e-07],
         [ 1.7535e-07, -1.5113e-07, -4.9487e-07,  ...,  2.4076e-07,
           1.8991e-08, -2.4612e-07],
         ...,
         [ 8.1658e-09,  5.2299e-08, -6.0060e-07,  ..., -1.2934e-07,
           8.9210e-09,  1.2955e-07],
         [ 2.4507e-07, -6.1021e-07, -7.0642e-07,  ..., -8.0882e-09,
           9.3991e-09, -3.7831e-07],
         [ 8.1607e-08, -6.0190e-08, -1.8893e-07,  ..., -2.4137e-07,
           5.3947e-07,  3.9380e-08]]], device='cuda:0', dtype=torch.float64)

In [None]:
delta

tensor([-0.0557], device='cuda:0', dtype=torch.float64)

In [None]:
def summarize_attributions(attributions):
  """Reduce per-dimension attributions to one normalized score per token.

  The last (embedding) axis is summed away, the leading axis is squeezed out,
  and the result is divided by its norm.
  """
  # Go from the 3-D attribution tensor to a 1-D vector (one value per token).
  per_token = attributions.sum(dim=-1).squeeze(0)
  return per_token / torch.norm(per_token)

In [None]:
#(attributions.sum(dim=-1).squeeze(0)).shape

In [None]:
# Collapse the raw attributions to one normalized score per token.
attributions_sum = summarize_attributions(attributions)
attributions_sum.shape

torch.Size([36])

In [None]:
# Class-0 probability for the example, as a single-element tensor.
score = predict(input_ids, attention_mask)
score.shape

torch.Size([1])

In [None]:
# Run the classifier once (no gradients needed) to get the real class probabilities.
# Taking softmax/argmax of the single-element `score` tensor would always report
# probability 1.00 and class 0, whatever the model actually predicts.
with torch.no_grad():
  class_probs = torch.softmax(
      model(input_ids=input_ids, attention_mask=attention_mask).logits, dim=1)[0]
pred_class = int(torch.argmax(class_probs))
pred_prob = float(class_probs[pred_class])

score_vis = viz.VisualizationDataRecord(attributions_sum,          # per-token attribution scores
                                        pred_prob,                 # probability of the predicted class
                                        pred_class,                # predicted class index
                                        label,                     # ground-truth label from the dataset
                                        text,                      # shown in the "Attribution Label" column
                                        attributions_sum.sum(),    # total attribution score
                                        all_tokens,                # token strings to colour
                                        delta)                     # convergence delta from attribution

print('\033[1m', 'Visualization For Score', '\033[0m')
viz.visualize_text([score_vis])

[1m Visualization For Score [0m


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,0 (1.00),"Size on the the Toulouse team, That pack is monstrous. I can't see a Welsh region ever winning this, Money talks as they say .",0.28,"#s Size Ġon Ġthe Ġthe ĠT oul ouse Ġteam , ĠThat Ġpack Ġis Ġmonstrous . Ġ Ċ Ċ I Ġcan 't Ġsee Ġa ĠWelsh Ġregion Ġever Ġwinning Ġthis , ĠMoney Ġtalks Ġas Ġthey Ġsay Ġ. #/s"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,0 (1.00),"Size on the the Toulouse team, That pack is monstrous. I can't see a Welsh region ever winning this, Money talks as they say .",0.28,"#s Size Ġon Ġthe Ġthe ĠT oul ouse Ġteam , ĠThat Ġpack Ġis Ġmonstrous . Ġ Ċ Ċ I Ġcan 't Ġsee Ġa ĠWelsh Ġregion Ġever Ġwinning Ġthis , ĠMoney Ġtalks Ġas Ġthey Ġsay Ġ. #/s"
,,,,


In [None]:
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')

In [None]:
# input_ids is a (1, seq_len) batch; decode expects a single sequence of ids,
# so pass the first (only) row.
tokenizer.decode(input_ids[0])

TypeError: ignored