In [1]:
%run library_utils.ipynb

import os 
import numpy as np 
import matplotlib.pyplot as plt
import torch 
import transformers
assert transformers.__version__ >= '4.17.'
from transformers import (
    AutoTokenizer, 
    AutoModel)
from tqdm import tqdm
from torch import Tensor
import torch.nn as nn

tqdm.pandas()

module = 'go-emotion-gru'
args = load_args(module)
init_seed(args.seed)

device: cuda


### Tokenizer & Encoder

In [2]:
tokenizer = AutoTokenizer.from_pretrained(args.encoder_name)
encoder = AutoModel.from_pretrained(args.encoder_name)

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
X_tk = tokenize_bert_inputs(
  ['hello my name is jeongwon', 'what are you?'], 
  tokenizer, 
  maxlen=args.sentence_max_len
)
len(X_tk['input_ids'][0])

62

In [4]:
%%time
dataset_path = os.path.join(args.output_dir, args.train_dataset_path)
assert os.path.exists(dataset_path)

train_dataset, val_dataset = generate_bert_dataset(
    dataset_path,     
    tokenizer, 
    emotions = args.emotions, 
    sentence_max_len = args.sentence_max_len, 
    split = args.validation_split_from_train
)

Wall time: 5.99 s


In [5]:
class GoEmotionGRUClassifer(TransformerEncoderBase):
  def __init__(self, 
        encoder, 
        criterion, 
        seq_len=82,
        rnn_hidden = 50,  
        rnn_num_layers = 1,
        bidirectional=True, 
        hiddens = None,
        dropout_p=0.1, 
        n_cls:int = 28
    ):
    config = {
        'seq_len': seq_len, 
        'rnn_hidden': rnn_hidden, 
        'rnn_num_layers': rnn_num_layers, 
        'bidirectional': bidirectional, 
        'hiddens':  [50] if hiddens is None else hiddens, 
        'dropout_p': dropout_p, 
        'n_cls': n_cls
    }
    super().__init__(encoder, criterion, config)

    # layers
    self.encoder = encoder
    self.gru = nn.GRU(
        input_size= self.encoder_dim, 
        hidden_size = rnn_hidden, 
        batch_first = True, 
        bidirectional = bidirectional
    )
    self.dropout = nn.Dropout(dropout_p)

    # full connected
    fcs = []
    in_feature = (int(bidirectional) + 1) * rnn_hidden
    for h in self.hiddens:
      fcs.append(nn.Linear(in_feature, h))
      fcs.append(nn.ReLU())
      in_feature = h    
    
    # final layer 
    fcs.append(nn.Linear(in_feature, n_cls))
    self.fcs = nn.Sequential(*fcs)

  def forward(self, input_ids, attention_mask, y_true=None):
    encoder_output = super().forward(input_ids, attention_mask)
    contextual_emb = encoder_output['last_hidden_state']

    output, _ = self.gru(contextual_emb)
    output = output[:, -1, :]
    z = self.dropout(output) 
    logits = self.fcs(z)

    if not (y_true is None):
      loss = self.criterion(logits, y_true)
      return (loss, logits)

    return logits

In [6]:
def create_go_emotion_gru(args, encoder):
    return GoEmotionPoolClassifer(
        encoder, 
        seq_len=args.sentence_max_len,
        rnn_hidden=args.hidden, 
        rnn_num_layer = args.rnn_num_layers, 
        bidirectional=args.bidirectional, 
        hiddens = args.fc_hiddens, 
        dropout_p = args.dropout_p, 
        n_cls = len(args.emotions)
    )

model = create_go_emotion_gru(args, encoder)
model

GoEmotionPoolClassifer(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise

In [7]:
def proba_on_examples(texts, model):
    X_tk = tokenize_bert_inputs(
        texts, tokenizer, maxlen = args.sentence_max_len
    )
    X_tk['input_ids'] = Tensor(X_tk['input_ids']).type(torch.int32)
    X_tk['attention_mask'] = Tensor(X_tk['attention_mask']).type(torch.int32)
    proba = predict_proba_examples(X_tk, model)
    return proba

In [8]:
proba = proba_on_examples(['hello my name is jeongwon', 'nice to meet you!'], model)
proba_to_emotion(
    proba,
    args.classification_threshold, 
    args.emotions, 
)[0][:3]

[('admiration', 0.41870236), ('amusement', 0.45818296), ('anger', 0.49008417)]

In [9]:
proba = predict_proba_examples(val_dataset[:5], model)
y_true = val_dataset[:5]['y_true'].numpy()
compute_classification_metrics(y_true, proba, 0.1)

{'accuracy': 0.044444444444444446,
 'macro_precision': 0.044444444444444446,
 'macro_recall': 0.14814814814814814,
 'macro_f1': 0.0670194003527337,
 'micro_precision': 0.044444444444444446,
 'micro_recall': 1.0,
 'micro_f1': 0.0851063829787234,
 'weighted_precision': 0.3333333333333333,
 'weighted_recall': 1.0,
 'weighted_f1': 0.49206349206349215}

In [10]:
evaluate(
    model, 
    val_dataset, 
    batch_size = args.eval_batch_size, 
    threshold = args.classification_threshold, 
    device = args.device
)

                                                                

{'loss': 0.7004973216184939,
 'trigger_rate': 1.0,
 'accuracy': 0.04695688599307007,
 'macro_precision': 0.04695688599307007,
 'macro_recall': 1.0,
 'macro_f1': 0.0882997470479051,
 'micro_precision': 0.04695688599307007,
 'micro_recall': 1.0,
 'micro_f1': 0.0897016613029486,
 'weighted_precision': 0.0643399480933103,
 'weighted_recall': 1.0,
 'weighted_f1': 0.11955701106465137}

In [11]:
model = create_go_emotion_gru(args, encoder)
_ = save_checkpoint(
    model, 
    args.output_dir, 
    model_name='testmodel',
    metadata=args
)

model_copy, _ = load_from_checkpoint(
    args.output_dir,
    model_name='testmodel',
    checkpoint_id='null-model', 
    model_cls=GoEmotionPoolClassifer
)

check_model_same(model, model_copy)

True

In [12]:
# clear_archive(args.output_dir, args.model_name)

False

In [13]:
model = create_go_emotion_gru(args, encoder)
model, metadata = train(
    model, 
    train_dataset, 
    val_dataset, 
    dict(args), 
    tokenizer, 
    epochs=args.train_epochs, 
    train_batch_size = args.train_batch_size, 
    val_batch_size = args.eval_batch_size, 
    save_steps = args.save_steps, 
    validation_steps= args.validation_steps, 
    archive_dir = args.output_dir, 
    model_name = args.model_name, 
    classification_threshold = args.classification_threshold, 
    learning_rate = args.learning_rate, 
    grad_clip_max = args.grad_clip_max, 
    weight_decay = args.weight_decay, 
    warmup_ratio = args.warmup_ratio, 
    logging_metrics= ['macro_f1', 'macro_precision', 'macro_recall'],
    continue_training=False,
    device = args.device
)



hello
training epoch 0


Training:  49%|████▉     | 3005/6101 [00:48<12:36,  4.09it/s]

evaluating at step 3000
{'macro_precision': 0.03916243040566626, 'macro_recall': 0.49321678434501776, 'macro_f1': 0.047193015026207964}
saving at step 3000


Training:  98%|█████████▊| 6003/6101 [01:37<00:23,  4.12it/s]

evaluating at step 6000
{'macro_precision': 0.002095977877644066, 'macro_recall': 0.037037037037037035, 'macro_f1': 0.00396743314832052}
saving at step 6000


Training: 100%|██████████| 6101/6101 [01:38<00:00, 61.73it/s]


training epoch 1


Training:  48%|████▊     | 2900/6101 [00:46<12:51,  4.15it/s]

evaluating at step 9000
{'macro_precision': 0.0, 'macro_recall': 0.0, 'macro_f1': 0.0}
saving at step 9000


Training:  97%|█████████▋| 5904/6101 [01:34<00:47,  4.16it/s]

evaluating at step 12000
{'macro_precision': 0.0, 'macro_recall': 0.0, 'macro_f1': 0.0}
saving at step 12000


Training: 100%|██████████| 6101/6101 [01:37<00:00, 62.31it/s]


training epoch 2


Training:  46%|████▌     | 2791/6101 [00:39<00:46, 71.63it/s]

evaluating at step 15000
{'macro_precision': 0.0, 'macro_recall': 0.0, 'macro_f1': 0.0}
saving at step 15000


Training:  95%|█████████▌| 5801/6101 [01:34<01:15,  3.98it/s]

evaluating at step 18000
{'macro_precision': 0.0, 'macro_recall': 0.0, 'macro_f1': 0.0}
saving at step 18000


Training: 100%|██████████| 6101/6101 [01:39<00:00, 61.36it/s]


training epoch 3


Training:  44%|████▍     | 2703/6101 [00:45<14:59,  3.78it/s]

evaluating at step 21000
{'macro_precision': 0.054246165357276464, 'macro_recall': 0.002457270003201142, 'macro_f1': 0.004597415972211029}
saving at step 21000


Training:  93%|█████████▎| 5699/6101 [01:36<01:58,  3.38it/s]

evaluating at step 24000
{'macro_precision': 0.07957233689596498, 'macro_recall': 0.06681908705666403, 'macro_f1': 0.07249770679586148}
saving at step 24000


Training: 100%|██████████| 6101/6101 [01:42<00:00, 59.50it/s]


training epoch 4


Training:  43%|████▎     | 2602/6101 [00:43<14:45,  3.95it/s]

evaluating at step 27000
{'macro_precision': 0.11856450611110156, 'macro_recall': 0.08664715537240035, 'macro_f1': 0.09150671090514927}
saving at step 27000


Training:  92%|█████████▏| 5596/6101 [01:34<02:29,  3.39it/s]

evaluating at step 30000
{'macro_precision': 0.14216222918612698, 'macro_recall': 0.11133764910919053, 'macro_f1': 0.11966022935498678}
saving at step 30000


Training: 100%|██████████| 6101/6101 [01:42<00:00, 59.45it/s]


training epoch 5


Training:  41%|████      | 2494/6101 [00:35<00:51, 70.46it/s]

evaluating at step 33000
{'macro_precision': 0.1740575934838749, 'macro_recall': 0.11948390728426023, 'macro_f1': 0.12470979685692794}
saving at step 33000


Training:  90%|█████████ | 5495/6101 [01:32<02:31,  4.00it/s]

evaluating at step 36000
{'macro_precision': 0.2174040216642051, 'macro_recall': 0.12740190374117585, 'macro_f1': 0.14349548291920508}
saving at step 36000


Training: 100%|██████████| 6101/6101 [01:41<00:00, 60.22it/s]


training epoch 6


Training:  39%|███▉      | 2399/6101 [00:40<14:50,  4.16it/s]

evaluating at step 39000
{'macro_precision': 0.24208676827305095, 'macro_recall': 0.1444662653997408, 'macro_f1': 0.1634172999274588}
saving at step 39000


Training:  88%|████████▊ | 5396/6101 [01:28<02:50,  4.13it/s]

evaluating at step 42000
{'macro_precision': 0.2427148526809242, 'macro_recall': 0.14810749372551787, 'macro_f1': 0.16890591181697268}
saving at step 42000


Training: 100%|██████████| 6101/6101 [01:38<00:00, 61.69it/s]


training epoch 7


Training:  38%|███▊      | 2300/6101 [00:38<11:21,  5.57it/s]

evaluating at step 45000
{'macro_precision': 0.3479965672383879, 'macro_recall': 0.15639295652766677, 'macro_f1': 0.18544461725220285}
saving at step 45000


Training:  60%|█████▉    | 3633/6101 [00:58<00:36, 67.27it/s]

In [None]:
df = get_attrs_from_checkpoints_meta(args.output_dir, args.model_name, ['tr_loss', 'val_metrics'])
df

In [None]:
proba = predict_proba(
    model, 
    val_dataset, 
    batch_size = args.eval_batch_size, 
    device = args.device
)
y_true = val_dataset[:]['y_true'].numpy()

In [None]:
roc_auc_score(y_true, proba, average='macro')

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


for i, emt in enumerate(args.emotions):
    fpr, tpr, _ = roc_curve(y_true[:, i], proba[:, i])
    plt.plot(fpr, tpr)
    plt.title(emt)
    plt.show()


In [None]:
proba_to_emotion(
    proba_on_examples([
        'fuck cloud', 
        'yeet'
    ], model), 
    0.4, 
    args.emotions)