In [33]:
import os 
import re
import numpy as np 
import random
import pandas as pd
import preprocessor
import torch 
import shutil
from transformers import (
    AutoTokenizer, 
    AutoModel , 
    AdamW, get_linear_schedule_with_warmup)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from torch import Tensor 
from torch.utils.data import TensorDataset, Dataset, DataLoader
from attrdict import AttrDict


assert torch.cuda.is_available()
tqdm.pandas()

input_dir = '../inputs'
output_dir = '../outputs'

In [34]:
args = AttrDict({
    'model_name': 'go-emotion-tiny',
    'encoder_name': 'prajjwal1/bert-tiny',
    'encoder_dim': 128, 
    'fc_hiddens': [50, 50], 
    'dropout_p': 0.1, 
    'sentence_max_len': 50,
    'train_epochs': 5, 
    'train_batch_size': 32, 
    'eval_batch_size': 32, 
    'learning_rate': 0.0001, 
    'grad_clip_max': 1.0,
    'weight_decay': 1e-4, 
    'warmup_ratio': 1e-1, 
    'classification_threshold': 0.5,
    'validation_split_from_train': 0.2, 
    'test_split': 0.2,
    'validation_steps': 3e4, 
    'save_steps': 3e4, 
    'dataset_source_path': os.path.join(input_dir, 'go-emotions-google-emotions-dataset', 'go_emotions_dataset.csv'), 
    'checkpoint_dir': os.path.join(output_dir, 'checkpoints'),
    'train_dataset_path': os.path.join(output_dir, 'train_set_compact.csv'), 
    'test_dataset_path': os.path.join(output_dir, 'test_set_compact.csv'),
    'seed': 0, 
    'device': 'cuda' if torch.cuda.is_available() else 'cpu', 
    'drop_insignifiant': True, 
    'emotions': ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 
                 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 
                 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 
                 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 
                 'pride', 'realization', 'relief', 'remorse', 'sadness', 
                 'surprise'], # 'neutral' 
})

In [35]:
def init_seed(args):
  torch.manual_seed(args.seed)
  random.seed(args.seed)
  np.random.seed(args.seed)
  return

init_seed(args)

In [36]:
tokenizer = AutoTokenizer.from_pretrained(args.encoder_name)
encoder = AutoModel.from_pretrained(args.encoder_name)

def tokenizing_input(texts, tokenizer, maxlen=50):
    result = tokenizer.batch_encode_plus(
            texts, 
          return_attention_mask=True, 
        return_token_type_ids=False,
        padding='longest', 
        max_length=maxlen)
    return result['input_ids'], result['attention_mask']

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
class ZipDataset(Dataset):
  def __init__(self, datasets):
    super(ZipDataset, self).__init__()
    self.keys = list(datasets.keys())
    self.values = list(datasets.values()) 
    self.datasets = datasets
    assert all([len(self.values[0]) == len(v) for v in self.values])

  def __len__(self):
    return len(self.values[0])
  
  def __getitem__(self, idx):
    item = {}
    for k, v in self.datasets.items():
      item[k] = v[idx]
    return item

def generate_dataset(args, tokenizer):
  Dtr = pd.read_csv(args.train_dataset_path)
  X = Dtr['text'].to_numpy()
  y = Dtr[list(args.emotions)].to_numpy()

  Xtr, Xval, ytr, yval = train_test_split(X, y, 
                                          test_size=args.validation_split_from_train, 
                                          shuffle=False)

  Xtr_tk, Xtr_mask= tokenizing_input(Xtr.tolist(), tokenizer, 
                                     maxlen=args.sentence_max_len)
  Xval_tk, Xval_mask = tokenizing_input(Xval.tolist(), tokenizer, 
                                        maxlen=args.sentence_max_len)

  train_dataset = ZipDataset({
      'input_ids': Tensor(Xtr_tk).type(torch.int32), 
      'attention_mask': Tensor(Xtr_mask).type(torch.int32), 
      'y_true': Tensor(ytr).type(torch.float32)
  })

  val_dataset = ZipDataset({
      'input_ids': Tensor(Xval_tk).type(torch.int32), 
      'attention_mask': Tensor(Xval_mask).type(torch.int32), 
      'y_true': Tensor(yval).type(torch.float32)
  })

  return train_dataset, val_dataset

In [38]:
%%time
train_dataset, val_dataset = generate_dataset(args, tokenizer)

Wall time: 8.57 s


In [39]:
def compute_classification_metrics(y_true, proba, threshold):
  assert len(y_true) == len(proba), 'y_true and y_pred length mismatch {} {}'.format(len(preds), len(labels))

  results = {}
  y_true = y_true.astype(int)
  y_pred = (proba >= threshold).astype(int)

  results["accuracy"] = (y_true == y_pred).mean()
  if (np.unique(y_true) == 1).sum() == 0:
    results["auc_roc_macro"] = roc_auc_score(y_true, proba, average='macro')
    results["auc_roc_micro"] = roc_auc_score(y_true, proba, average='micro')
  results["macro_precision"], results["macro_recall"], results["macro_f1"], _ = precision_recall_fscore_support(y_true, y_pred, average="macro")
  results["micro_precision"], results["micro_recall"], results["micro_f1"], _ = precision_recall_fscore_support(y_true, y_pred, average="micro")
  results["weighted_precision"], results["weighted_recall"], results["weighted_f1"], _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")

  return results

In [40]:
class GoEmotionClassiferWithGRU(torch.nn.Module):
  def __init__(self, 
               encoder,
               encoder_dim=128, 
               seq_len=82,
               rnn_hidden = 50,  
               rnn_num_layers = 1,
               bidirectional=True, 
               hiddens = None,
               dropout_p=0.1, 
               n_cls:int = 28, 
               criterion = torch.nn.BCEWithLogitsLoss()
               ):
    super(GoEmotionClassiferWithGRU, self).__init__()

    self.encoder_dim = encoder_dim
    self.rnn_hidden = rnn_hidden
    self.rnn_num_layers = rnn_num_layers
    self.seq_len = seq_len
    self.bidirectional = bidirectional
    self.hiddens = [100] if hiddens is None else hiddens 
    self.dropout_p = dropout_p 
    self.n_cls = n_cls
    self.criterion = criterion

    # layers
    self.encoder = encoder
    self.gru = torch.nn.GRU(
        input_size= encoder_dim, 
        hidden_size = rnn_hidden, 
        batch_first = True, 
        bidirectional = bidirectional
    )
    self.dropout = torch.nn.Dropout(dropout_p)

    # full connected
    fcs = []
    in_feature = (int(bidirectional) + 1) * rnn_hidden
    for h in self.hiddens:
      fcs.append(torch.nn.Linear(in_feature, h))
      fcs.append(torch.nn.ReLU())
      in_feature = h    
    fcs.append(torch.nn.Linear(in_feature, n_cls))
    self.fcs = torch.nn.Sequential(*fcs)

  def forward(self, X_tk, X_mask, y_true=None):
    encoder_output = self.encoder(X_tk, X_mask)
    contextual_emb = encoder_output['last_hidden_state'] # contextual embedding
    output, _ = self.gru(contextual_emb)
    output = output[:, -1, :]
    z = self.dropout(output) 
    logits = self.fcs(z)

    if not (y_true is None):
      loss = self.criterion(logits, y_true)
      return (loss, logits)
    return logits

  def save_pretrained(self, path):
    encoder_path = os.path.join(path, 'encoder')
    pt_path = os.path.join(path, 'model.pt')
    encoder = self.encoder
    encoder.save_pretrained(encoder_path)
    self.encoder = None
    torch.save({
        'model': self.state_dict(), 
        'config': {
            'encoder': encoder.config.to_dict(), 
            'criterion': self.criterion.__class__.__name__, 
            'architecture': str(self), 
            'rnn_hidden': self.rnn_hidden,
            'rnn_num_layers': self.rnn_num_layers,
            'seq_len': self.seq_len,
            'bidirectional': self.bidirectional,
            'hiddens': self.hiddens, 
            'dropout_p': self.dropout_p, 
            'n_cls': self.n_cls
        }
    }, pt_path)
    self.encoder = encoder
    return

  @classmethod
  def from_pretrained(cls, path):
    pt_path = os.path.join(path, 'model.pt')
    encoder_path = os.path.join(path, 'encoder')
    saved_model = torch.load(pt_path, map_location=torch.device('cpu'))
    encoder = AutoModel.from_pretrained(encoder_path)
    model = cls(
        encoder=None,
        encoder_dim = saved_model['config']['encoder']['hidden_size'], 
        seq_len = saved_model['config']['seq_len'],
        rnn_hidden = saved_model['config']['rnn_hidden'],  
        rnn_num_layers = saved_model['config']['rnn_num_layers'],
        bidirectional= saved_model['config']['bidirectional'], 
        hiddens = saved_model['config']['hiddens'],
        dropout_p = saved_model['config']['dropout_p'],
        n_cls = saved_model['config']['n_cls'],
        criterion = getattr(torch.nn, saved_model['config']['criterion'])()
    )
    model.load_state_dict(saved_model['model'])
    model.encoder = encoder
    return model 

class GoEmotionClassifer(torch.nn.Module):
  def __init__(self, 
               encoder,
               encoder_dim=128, 
               hiddens = None,
               dropout_p=0.1, 
               n_cls:int = 28, 
               criterion = torch.nn.BCEWithLogitsLoss()
               ):
    super(GoEmotionClassifer, self).__init__()

    self.encoder_dim = encoder_dim
    self.hiddens = [100] if hiddens is None else hiddens 
    self.dropout_p = dropout_p 
    self.n_cls = n_cls
    self.criterion = criterion

    # layers
    self.encoder = encoder
    self.dropout = torch.nn.Dropout(dropout_p)

    # full connected
    fcs = []
    in_feature = encoder_dim 
    for h in self.hiddens:
      fcs.append(torch.nn.Linear(in_feature, h))
      fcs.append(torch.nn.ReLU())
      in_feature = h    
    fcs.append(torch.nn.Linear(in_feature, n_cls))
    self.fcs = torch.nn.Sequential(*fcs)

  def forward(self, X_tk, X_mask, y_true=None):
    encoder_output = self.encoder(X_tk, X_mask) # contextual embedding
    pooled_output = encoder_output['pooler_output'] 
    z = self.dropout(pooled_output) 
    logits = self.fcs(z)

    if not (y_true is None):
      loss = self.criterion(logits, y_true)
      return (loss, logits)
    return logits

  def save_pretrained(self, path):
    encoder_path = os.path.join(path, 'encoder')
    pt_path = os.path.join(path, 'model.pt')
    encoder = self.encoder
    encoder.save_pretrained(encoder_path)
    self.encoder = None
    torch.save({
        'model': self.state_dict(), 
        'config': {
            'encoder': encoder.config.to_dict(), 
            'criterion': self.criterion.__class__.__name__, 
            'architecture': str(self), 
            'hiddens': self.hiddens, 
            'dropout_p': self.dropout_p, 
            'n_cls': self.n_cls
        }
    }, pt_path)
    self.encoder = encoder
    return

  @classmethod
  def from_pretrained(cls, path):
    pt_path = os.path.join(path, 'model.pt')
    encoder_path = os.path.join(path, 'encoder')
    saved_model = torch.load(pt_path, map_location=torch.device('cpu'))
    encoder = AutoModel.from_pretrained(encoder_path)
    model = cls(
        encoder=None,
        encoder_dim = saved_model['config']['encoder']['hidden_size'], 
        hiddens = saved_model['config']['hiddens'],
        dropout_p = saved_model['config']['dropout_p'],
        n_cls = saved_model['config']['n_cls'],
        criterion = getattr(torch.nn, saved_model['config']['criterion'])()
    )
    model.load_state_dict(saved_model['model'])
    model.encoder = encoder
    return model 


In [41]:
def proba_on_examples(X, model, tokenizer=None):
  if tokenizer is not None:
    X_tk, X_mask = tokenizing_input(X, tokenizer)
    X_tk = torch.Tensor(X_tk).type(torch.int32)
    X_mask = torch.Tensor(X_mask).type(torch.int32)
  else:
    X_tk, X_mask = X['input_ids'], X['attention_mask']

  with torch.no_grad():
    logits = model(X_tk, X_mask).numpy()
    proba = 1 / (1 + np.exp(-logits))

  return proba

def proba_to_emotion(proba, threshold, emotions):
  assert proba.shape[-1] == len(emotions), 'emotions and proba mismatch {} vs {}'.format(len(emotions), proba.shape[-1])
  emotions = np.array(emotions)
  return [tuple(emotions[p >= threshold]) for p in proba]

In [42]:
def create_model_from_config(args, encoder):
  return GoEmotionClassifer(
    encoder, 
    encoder_dim = args.encoder_dim, 
    hiddens = args.fc_hiddens, 
    dropout_p = args.dropout_p, 
    n_cls = len(args.emotions)
  )

def create_rnn_model_from_config(args, encoder):
  return GoEmotionClassiferWithGRU(
      encoder, 
      encoder_dim = args.encoder_dim, 
      seq_len = args.seq_len, 
      rnn_hidden = args.rnn_hidden, 
      rnn_num_layers = args.rnn_num_layers,
      bidirectional = args.bidirectional, 
      hiddens = args.fc_hiddens, 
      dropout_p = args.dropout_p, 
      n_cls = len(args.emotions)
  )

test_args = args.copy()
test_args['seq_len'] = 82
test_args['rnn_hidden'] = 50
test_args['rnn_num_layers'] = 1
test_args['bidirectional'] = True
test_args['hiddens'] = [50, 50]
test_args = AttrDict(test_args)
# model = create_model_from_config(args, encoder)
model = create_rnn_model_from_config(test_args, encoder)
proba = proba_on_examples(['hello my name is jeongwon', 'its me jeongwon'],  model, tokenizer)
proba_to_emotion(proba, args.classification_threshold, emotions=args.emotions)

[('admiration',
  'annoyance',
  'caring',
  'confusion',
  'curiosity',
  'desire',
  'excitement',
  'gratitude',
  'nervousness',
  'optimism',
  'realization',
  'relief',
  'remorse'),
 ('admiration',
  'annoyance',
  'caring',
  'confusion',
  'curiosity',
  'desire',
  'excitement',
  'gratitude',
  'nervousness',
  'optimism',
  'pride',
  'relief',
  'remorse')]

In [43]:
proba = proba_on_examples(val_dataset[:5], model)
y_true = val_dataset[:5]['y_true'].numpy()
compute_classification_metrics(y_true, proba, 0.1)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.044444444444444446,
 'macro_precision': 0.044444444444444446,
 'macro_recall': 0.14814814814814814,
 'macro_f1': 0.0670194003527337,
 'micro_precision': 0.044444444444444446,
 'micro_recall': 1.0,
 'micro_f1': 0.0851063829787234,
 'weighted_precision': 0.3333333333333333,
 'weighted_recall': 1.0,
 'weighted_f1': 0.49206349206349215}

In [44]:
def evaluate(model, 
             dataset, 
             batch_size=16, 
             threshold=0.5,
             device='cpu', back_to_cpu=True):
  eval_dataloader = DataLoader(
      dataset, 
      batch_size=batch_size, 
  )

  n_batch = 0
  total_loss = 0.0
  y_true = []
  proba = []

  model.to(device)

  for batch in tqdm(eval_dataloader, desc='evaluation', leave=False):
    model.eval()
    batch = { k:v.to(device) for k, v in batch.items() }

    with torch.no_grad():
      loss_per_batch, logits = model(batch['input_ids'], 
                               batch['attention_mask'], 
                               batch['y_true'])
      total_loss += loss_per_batch.item()

      logits = logits.cpu().detach().numpy()

    p = 1 / (1 + np.exp(-logits))
    proba.append(p)
    y_true.append(batch['y_true'].cpu().detach().numpy())

    n_batch += 1

  if back_to_cpu:
    model.cpu()

  proba = np.vstack(proba)
  y_true = np.vstack(y_true)
  results = {
      'loss': total_loss / n_batch, 
      'trigger_rate': (proba >= threshold).mean(), 
      **compute_classification_metrics(y_true, proba, threshold)
  }

  return results

In [45]:
evaluate(
    model, 
    val_dataset, 
    batch_size = args.eval_batch_size, 
    threshold = args.classification_threshold, 
    device = args.device
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'loss': 0.6957503297069595,
 'trigger_rate': 0.4488762007099916,
 'accuracy': 0.5494345716449556,
 'macro_precision': 0.03751222110651815,
 'macro_recall': 0.4515383851132274,
 'macro_f1': 0.05788837281625718,
 'micro_precision': 0.05042332192753486,
 'micro_recall': 0.4820129933094153,
 'micro_f1': 0.09129616308789543,
 'weighted_precision': 0.053205492346381784,
 'weighted_recall': 0.4820129933094153,
 'weighted_f1': 0.08328313484048709}

In [46]:
def save_checkpoint(
    model, 
    archive_dir, 
    model_name, 
    checkpoint_id="?", 
    metadata=None,
    tokenizer=None, 
    optimizer=None, 
    scheduler=None,
):
  # create archive folder
  archive_path = os.path.join(archive_dir, model_name)
  if not os.path.exists(archive_path):
    os.makedirs(archive_path, exist_ok=True)

  # create checkpoint folder
  checkpoint_dir = os.path.join(archive_path, 'checkpoint-%s' % str(checkpoint_id))
  os.makedirs(checkpoint_dir, exist_ok=True)

  # save model in checkpoint
  model_to_save = (model.module if hasattr(model, "module") else model)
  model_to_save.save_pretrained(checkpoint_dir)
  if tokenizer is not None:
    tokenizer.save_pretrained(checkpoint_dir)
  if metadata:
    torch.save(metadata, os.path.join(checkpoint_dir, "meta.bin"))
  if scheduler is not None:
    torch.save(scheduler.state_dict(), os.path.join(checkpoint_dir, 'scheduler.pt'))
  if optimizer is not None:
    torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, 'optimizer.pt'))

  return archive_path

def load_from_checkpoint(
    archive_dir, 
    model_name, 
    checkpoint_id="?", 
    load_tokenizer=False, 
    load_metadata=True, 
    load_optimizer=False, 
    cls = GoEmotionClassifer
):
  archive_path = os.path.join(archive_dir, model_name)
  checkpoint_dir = os.path.join(archive_path, 'checkpoint-%s' % str(checkpoint_id))

  assert os.path.exists(archive_path), archive_path
  assert os.path.exists(checkpoint_dir), checkpoint_dir

  model = getattr(cls, 'from_pretrained')(
      checkpoint_dir
  )

  output = (model, )
  if load_tokenizer:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    output += (tokenizer, )

  if load_metadata or load_optimizer:
    metadata = torch.load(os.path.join(checkpoint_dir, 'meta.bin'))
    if load_metadata:
      output += (metadata, )

  if load_optimizer:
      grouped_parameters = [{'params': [param for name, param in model.named_parameters() \
                                          if not any(nd in name for nd in ('bias', 'LayerNorm.weight'))]}, 
                            {'params': [param for name, param in model.named_parameters() \
                                        if any(nd in name for nd in ('bias', 'LayerNorm.weight'))]}]

      optimizer = AdamW(grouped_parameters, 
                    lr=metadata['learning_rate'], 
                    weight_decay=metadata['weight_decay']) 
      
      scheduler = get_linear_schedule_with_warmup(
          optimizer,
          num_warmup_steps=int(metadata['train_max_step'] * metadata['warmup_ratio']),
          num_training_steps=metadata['train_max_step']
      )
      optimizer.load_state_dict(torch.load(os.path.join(checkpoint_dir, 'optimizer.pt')))
      scheduler.load_state_dict(torch.load(os.path.join(checkpoint_dir, 'scheduler.pt')))

      output += (optimizer, scheduler)
  return output


In [47]:
args = args.copy()
args['model_name'] = 'go-emotion-tiny-gru'
args['seq_len'] = 82
args['rnn_hidden'] = 50
args['rnn_num_layers'] = 1
args['bidirectional'] = True
args['hiddens'] = [25],
args['validation_steps'] = 1.5e3
args['save_steps'] = 1.5e3
args['train_batch_size'] = 32
args['eval_batch_size'] = 32
args['weight_decay'] = 0
args = AttrDict(args)

In [55]:
tokenizer

PreTrainedTokenizerFast(name_or_path='prajjwal1/bert-tiny', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [48]:
model, metadata = load_from_checkpoint(
    '', 
    'go-emotion-tiny-gru', 
    checkpoint_id= '6000', 
    cls=GoEmotionClassiferWithGRU    
)

In [49]:
metadata['tr_loss']

0.4861164281045397

In [None]:
proba_to_emotion

In [65]:
proba_to_emotion(proba_on_examples(['hello I am very confused right now?'], model, tokenizer), 0.15, args.emotions)

[('admiration', 'approval', 'love')]

In [53]:
evaluate(
    model, 
    val_dataset, 
    batch_size = args.eval_batch_size, 
    threshold = args.classification_threshold, 
    device = args.device
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'loss': 0.20021999056108034,
 'trigger_rate': 0.0,
 'accuracy': 0.95304311400693,
 'macro_precision': 0.0,
 'macro_recall': 0.0,
 'macro_f1': 0.0,
 'micro_precision': 0.0,
 'micro_recall': 0.0,
 'micro_f1': 0.0,
 'weighted_precision': 0.0,
 'weighted_recall': 0.0,
 'weighted_f1': 0.0}

In [51]:
# model, metadata = load_from_checkpoint(
#     output_dir, 
#     'go-emotion-tiny', 
#     checkpoint_id='450000', 
# )
# proba = proba_on_examples(['what the fuck is happening?', 
#                            'I am not sure', 
#                            'good job ! cool'], 
#                   model, tokenizer = tokenizer)

# proba_to_emotion(proba, 0.5, args.emotions)