In [1]:
%%capture
# Install the Hugging Face stack into the runtime; %%capture hides pip's output.
# NOTE(review): versions are unpinned. The model-config dump printed further
# down reports transformers 4.17.0 — pin it if the run must be reproduced exactly.
! pip install transformers
# presumably required by a tokenizer used outside the cells shown here — TODO confirm
! pip install sentencepiece

In [2]:
import numpy as np
import string
from nltk.tokenize import WordPunctTokenizer
from string import digits, ascii_lowercase, punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel, DistilBertModel
import torch
from gensim.models import KeyedVectors
import pickle
import nltk
import json

from collections import Counter

import pandas as pd

from tqdm import tqdm
import transformers
from transformers import BertModel, AutoTokenizer, BertTokenizer, PreTrainedTokenizerFast, AdamW, get_linear_schedule_with_warmup, DistilBertTokenizer, DistilBertModel
import torch.nn.functional as F

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset, DataLoader

## Getting data

In [3]:
%%capture
# Download the data files from the project's GitHub repository into the
# current working directory (wget's default target); %%capture hides the logs.
# NOTE: train_extended_2.csv is fetched but not read by any cell shown in this notebook.
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/data/train_extended.csv
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/data/train_extended_2.csv
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/data/messages_test_features_ready_for_WS_2022.tsv

# Dev messages, train messages and the dev gold-standard labels.
# NOTE: messages_train.tsv is likewise not read by any cell shown in this notebook.
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/messages_dev.tsv
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/messages_train.tsv
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/goldstandard_dev_2022.tsv

In [37]:
# Load the training and validation splits for the emotion track.
# Every file is read relative to the working directory because that is where
# the `! wget` cell saved it; a hard-coded '/content/...' path only works when
# the notebook's working directory happens to be /content.
all_data_train = pd.read_csv('train_extended.csv')
# `names=` supplies the column labels, so the 12 columns are called '0'..'11'.
goldstandard = pd.read_csv('goldstandard_dev_2022.tsv', sep="\t", names=[str(i) for i in range(12)])

all_data_val = pd.read_csv('messages_dev.tsv', sep="\t")
x_val = all_data_val['essay']
# Column '2' of the gold standard is the emotion label; expose it under the
# same column name the training frame uses (rename returns a new frame, so the
# gold-standard frame itself is left untouched).
y_val = goldstandard[['2']].rename(columns={'2': 'emotion'})

x_train = all_data_train['essay']      # essay text: the only column used as model input
y_train = all_data_train[['emotion']]  # only emotion label column

In [38]:
# Essays of the test split. Read relative to the working directory (where the
# `! wget` cell saved the file) rather than from a hard-coded '/content/...'
# path, and select the column in the same expression so `test_df` is bound
# exactly once and always refers to the essay Series.
test_df = pd.read_csv('messages_test_features_ready_for_WS_2022.tsv', sep="\t")['essay']

In [39]:
# Emotion label -> integer class id. The id of a label is its position in the
# tuple below.
em2id = {
    label: idx
    for idx, label in enumerate(
        ('neutral', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'joy')
    )
}

# Inverse lookup: integer class id -> emotion label.
id2em = dict((idx, label) for label, idx in em2id.items())

# Encode the string labels as integer class ids.
# NOTE: this rebinds y_val / y_train from DataFrames to arrays, so the cell can
# only be run once per data load.
y_val = np.array(y_val['emotion'].map(em2id))
y_train = np.array(y_train['emotion'].map(em2id))
# `Series.map` silently produces NaN for any label that is not a key of
# `em2id`, which would leave float labels containing NaN for the classifier.
# Fail here, next to the cause, instead.
assert not (pd.isna(y_val).any() or pd.isna(y_train).any()), "found emotion labels that are missing from em2id"

## BERT-base

In [40]:
# Checkpoint name passed to `from_pretrained` by the tokenizer and model cells below.
# NOTE: this global is reassigned in the DistilRoberta section further down, so
# anything constructed after that point picks up the other checkpoint.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

In [41]:
# Run configuration: sequence-length constant, RNG seed and compute device.
# NOTE(review): MAX_LEN is not referenced by any cell shown in this notebook.
MAX_LEN = 160
RANDOM_SEED = 1

# Seed PyTorch and NumPy from the same value.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# The custom classifier's forward() takes only input_ids / attention_mask /
# labels, so ask the tokenizer not to emit `token_type_ids` in the first place
# instead of deleting the key from the encodings afterwards.
# NOTE(review): no max_length is passed, so truncation uses the tokenizer's own
# limit; the MAX_LEN constant is not applied here.
train_encodings_bert = tokenizer(x_train.tolist(), truncation=True, padding=True, return_token_type_ids=False)
val_encodings_bert = tokenizer(x_val.tolist(), truncation=True, padding=True, return_token_type_ids=False)

In [43]:
class GPReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    `encodings` is a mapping from field name to an indexable collection of
    per-example values; `labels` is an indexable collection with one entry per
    example. Both are stored as given and converted to tensors on access.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label collection defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the key 'labels'.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pair the BERT encodings of each split with its integer class ids.
train_dataset_bert = GPReviewDataset(train_encodings_bert, y_train)
val_dataset_bert = GPReviewDataset(val_encodings_bert, y_val)


In [44]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        pred: object exposing `label_ids` (gold class ids) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same 0.0 that scikit-learn already substitutes
    # when a class has no predicted or no true samples, but without raising an
    # UndefinedMetricWarning on every call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [45]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for fine-tuning the BERT classifier.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
    # State the reporting integrations explicitly. "all" is what the library
    # falls back to when the argument is omitted, so behaviour is unchanged,
    # but the "--report_to will change in v5" notice goes away and a future
    # change of the default cannot silently switch logging off.
    report_to="all",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [46]:
class BertConcatCLS(nn.Module):
  """Classifier over the concatenated first-token vectors of the last four hidden states.

  Args:
    n_classes: number of output classes.
    model_name: checkpoint passed to `AutoModel.from_pretrained`. When omitted,
      the notebook-level `PRE_TRAINED_MODEL_NAME` is read at construction time
      (the original behaviour); pass it explicitly to stay independent of later
      reassignments of that global.
  """

  def __init__(self, n_classes, model_name=None):
    super().__init__()
    self.bert = AutoModel.from_pretrained(model_name or PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.n_classes = n_classes
    # Four hidden states are concatenated, hence 4 * hidden_size input features.
    self.out = nn.Linear(4*self.bert.config.hidden_size, self.n_classes)

  def forward(self, input_ids, attention_mask, labels=None):
    """Return `(loss, logits)`.

    Args:
      input_ids: token ids, passed through to the encoder.
      attention_mask: attention mask, passed through to the encoder.
      labels: optional class ids. When given, the cross-entropy loss against
        the logits is returned; when omitted, `loss` is None so the model can
        be called for inference without placeholder labels.

    Returns:
      Tuple `(loss, logits)`; logits have one row per example and `n_classes` columns.
    """
    # With return_dict=False the encoder returns a tuple; its third item is the
    # tuple of hidden states requested via output_hidden_states=True.
    _, _, hidden_states = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False,
      output_hidden_states=True)

    # First-token ([CLS]) vector of each of the last 4 hidden states,
    # concatenated along the feature axis -> batch x (4 * hidden_size).
    pooled_output = torch.cat([layer[:, 0, :] for layer in hidden_states[-4:]], dim=-1)

    logits = self.out(self.drop(pooled_output))  # batch x n_classes

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.n_classes), labels.view(-1))

    return loss, logits
            

In [47]:
# One output unit per emotion in `em2id`, so the head size cannot drift from
# the label mapping (previously a literal 7).
model = BertConcatCLS(len(em2id))
model = model.to(device)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/re

In [49]:
# Fine-tune the BERT classifier with the Hugging Face Trainer.
trainer_bert = Trainer(
    model=model,                      # custom nn.Module whose forward returns (loss, logits)
    args=training_args,                  
    train_dataset=train_dataset_bert,        
    eval_dataset=val_dataset_bert,           # default dataset for evaluate()
    compute_metrics = compute_metrics    
)

# Last expression of the cell: the returned TrainOutput is displayed.
trainer_bert.train()

***** Running training *****
  Num examples = 6972
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1308


Step,Training Loss
10,2.051
20,2.0167
30,1.9129
40,1.7991
50,1.7895
60,1.7299
70,1.6188
80,1.5167
90,1.4363
100,1.6171


Saving model checkpoint to ./results/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Saving model checkpoint to ./results/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1308, training_loss=0.9469266876532761, metrics={'train_runtime': 1405.8947, 'train_samples_per_second': 14.877, 'train_steps_per_second': 0.93, 'total_flos': 0.0, 'train_loss': 0.9469266876532761, 'epoch': 3.0})

In [50]:
# Score the fine-tuned BERT model on the validation set. One call is enough:
# a bare `trainer_bert.evaluate()` scores this same dataset (it is the
# trainer's eval_dataset) and its result would be discarded here.
# NOTE: the "test" prefix is kept so the reported keys stay the same, but these
# are validation-set metrics.
trainer_bert.evaluate(eval_dataset=val_dataset_bert, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 270
  Batch size = 32


***** Running Evaluation *****
  Num examples = 270
  Batch size = 32


{'epoch': 3.0,
 'test_accuracy': 0.6481481481481481,
 'test_f1': 0.5741362651737936,
 'test_loss': 1.2157377004623413,
 'test_precision': 0.6250024295432458,
 'test_recall': 0.5719601175420446,
 'test_runtime': 6.3705,
 'test_samples_per_second': 42.383,
 'test_steps_per_second': 1.413}

## DistilRoberta

In [51]:
# Switch the checkpoint name for the DistilRoberta section.
# NOTE: this overwrites the value set for the BERT section; any cell that reads
# PRE_TRAINED_MODEL_NAME from here on gets this checkpoint.
PRE_TRAINED_MODEL_NAME = 'j-hartmann/emotion-english-distilroberta-base'

In [None]:
# Tokenizer for the checkpoint currently named by PRE_TRAINED_MODEL_NAME.
tokenizer_distroberta = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Tokenise both splits with padding and truncation enabled (no explicit
# max_length is passed).
train_encodings_distroberta = tokenizer_distroberta(x_train.tolist(), truncation=True, padding=True)
val_encodings_distroberta = tokenizer_distroberta(x_val.tolist(), truncation=True, padding=True)

# Pair the encodings of each split with the same integer labels used for BERT.
train_dataset_distroberta = GPReviewDataset(train_encodings_distroberta, y_train)
val_dataset_distroberta = GPReviewDataset(val_encodings_distroberta, y_val)

In [53]:
# Hyper-parameters for fine-tuning the DistilRoberta classifier.
# NOTE: this rebinds `training_args`; a Trainer built earlier keeps the object
# it was constructed with.
training_args = TrainingArguments(
    output_dir='./results_roberta',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
    # State the reporting integrations explicitly. "all" is what the library
    # falls back to when the argument is omitted, so behaviour is unchanged,
    # but the "--report_to will change in v5" notice goes away and a future
    # change of the default cannot silently switch logging off.
    report_to="all",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [57]:
class DistilRobertaConcatCLS(nn.Module):
  """Classifier over the concatenated first-token vectors of the last four hidden states.

  Args:
    n_classes: number of output classes.
    model_name: checkpoint passed to `AutoModel.from_pretrained`. When omitted,
      the notebook-level `PRE_TRAINED_MODEL_NAME` is read at construction time
      (the original behaviour); pass it explicitly to stay independent of later
      reassignments of that global.
  """

  def __init__(self, n_classes, model_name=None):
    super().__init__()
    self.distilroberta = AutoModel.from_pretrained(model_name or PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.n_classes = n_classes
    # Four hidden states are concatenated, hence 4 * hidden_size input features.
    self.out = nn.Linear(4*self.distilroberta.config.hidden_size, self.n_classes)

  def forward(self, input_ids, attention_mask, labels=None):
    """Return `(loss, logits)`.

    Args:
      input_ids: token ids, passed through to the encoder.
      attention_mask: attention mask, passed through to the encoder.
      labels: optional class ids. When given, the cross-entropy loss against
        the logits is returned; when omitted, `loss` is None so the model can
        be called for inference without placeholder labels.

    Returns:
      Tuple `(loss, logits)`; logits have one row per example and `n_classes` columns.
    """
    # With return_dict=False the encoder returns a tuple; its third item is the
    # tuple of hidden states requested via output_hidden_states=True.
    _, _, hidden_states = self.distilroberta(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False,
      output_hidden_states=True)

    # First-token vector of each of the last 4 hidden states, concatenated
    # along the feature axis -> batch x (4 * hidden_size).
    pooled_output = torch.cat([layer[:, 0, :] for layer in hidden_states[-4:]], dim=-1)

    logits = self.out(self.drop(pooled_output))  # batch x n_classes

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.n_classes), labels.view(-1))

    return loss, logits
            

In [None]:
# One output unit per emotion in `em2id`, so the head size cannot drift from
# the label mapping (previously a literal 7).
# NOTE: this rebinds `model`; the BERT trainer keeps its own reference to the
# model it was built with.
model = DistilRobertaConcatCLS(len(em2id))
model = model.to(device)

In [59]:
# Fine-tune the DistilRoberta classifier with the Hugging Face Trainer.
trainer_distroberta = Trainer(
    model=model,                      # custom nn.Module whose forward returns (loss, logits)
    args=training_args,                  
    train_dataset=train_dataset_distroberta,        
    eval_dataset=val_dataset_distroberta,           # default dataset for evaluate()
    compute_metrics = compute_metrics    
)

# Last expression of the cell: the returned TrainOutput is displayed.
trainer_distroberta.train()

***** Running training *****
  Num examples = 6972
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 872


Step,Training Loss
10,2.0049
20,2.001
30,2.0006
40,1.9127
50,1.7707
60,1.7453
70,1.6642
80,1.5327
90,1.4126
100,1.5478


Saving model checkpoint to ./results_roberta/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=872, training_loss=1.166189538229496, metrics={'train_runtime': 531.2392, 'train_samples_per_second': 26.248, 'train_steps_per_second': 1.641, 'total_flos': 0.0, 'train_loss': 1.166189538229496, 'epoch': 2.0})

In [60]:
# Score the fine-tuned DistilRoberta model on the validation set. One call is
# enough: a bare `trainer_distroberta.evaluate()` scores this same dataset (it
# is the trainer's eval_dataset) and its result would be discarded here.
# NOTE: the "test" prefix is kept so the reported keys stay the same, but these
# are validation-set metrics.
trainer_distroberta.evaluate(eval_dataset=val_dataset_distroberta, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 270
  Batch size = 32


***** Running Evaluation *****
  Num examples = 270
  Batch size = 32


{'epoch': 2.0,
 'test_accuracy': 0.6222222222222222,
 'test_f1': 0.5221066249176544,
 'test_loss': 0.9772651791572571,
 'test_precision': 0.5664523675769271,
 'test_recall': 0.525744413700479,
 'test_runtime': 2.9546,
 'test_samples_per_second': 91.384,
 'test_steps_per_second': 3.046}

## Test

In [61]:
# Tokenise the test essays once per model, each with that model's own tokenizer
# (`tokenizer` is the BERT tokenizer created in the bert-base section).
x_test_bert = tokenizer(test_df.tolist(), truncation=True, padding=True)
x_test_roberta = tokenizer_distroberta(test_df.tolist(), truncation=True, padding=True)

In [63]:
# The classifier's forward() does not accept `token_type_ids`, so drop that
# field from the BERT test encodings. The membership check makes the cell safe
# to re-run: a second unconditional `del` would raise KeyError.
if 'token_type_ids' in x_test_bert:
    del x_test_bert['token_type_ids']

In [64]:
# GPReviewDataset indexes a label for every item, so the test encodings are
# paired with placeholder labels (all 0). Their number is derived from the
# test split itself instead of a hard-coded 525, so the datasets stay aligned
# with `test_df` if the file changes.
text_roberta_dataset = GPReviewDataset(x_test_roberta, [0] * len(test_df))
text_bert_dataset = GPReviewDataset(x_test_bert, [0] * len(test_df))

In [65]:
# Run the DistilRoberta model over the test split. The dataset carries
# placeholder labels, so any metrics attached to the result are meaningless;
# only `.predictions` is used later.
result_distilroberta = trainer_distroberta.predict(text_roberta_dataset)

***** Running Prediction *****
  Num examples = 525
  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


In [66]:
# Run the BERT model over the test split. The dataset carries placeholder
# labels, so any metrics attached to the result are meaningless; only
# `.predictions` is used later.
result_bert = trainer_bert.predict(text_bert_dataset)

***** Running Prediction *****
  Num examples = 525
  Batch size = 32


  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
# Third ensemble member: predictions loaded from a headerless TSV file.
# NOTE(review): this file is neither downloaded nor written by any cell shown in
# this notebook; judging by the variable name it presumably comes from a
# separate T5 run — confirm its source and that its row order matches test_df.
preds_t5 = pd.read_csv('/content/predictions_EMO.tsv', sep="\t", header=None)


In [68]:
# Translate the values in the file's first column (column 0) into class ids via em2id.
# NOTE: this rebinds preds_t5 from a DataFrame to an array, so the cell cannot be
# re-run without re-running the read_csv cell first. Values that are not keys of
# em2id become NaN.
preds_t5 = np.array(preds_t5[0].map(em2id))

In [69]:
# Per-class scores produced by each trainer's predict() call.
predictions_bert = result_bert.predictions
predictions_roberta = result_distilroberta.predictions

# Predicted class id = index of the highest score along the last axis.
preds_bert = predictions_bert.argmax(axis=-1)
preds_roberta = predictions_roberta.argmax(axis=-1)

In [70]:
def majority_voting(pred1, pred2, pred3):
    """Combine three aligned prediction sequences by majority vote.

    At each position the label shared by at least two predictors wins. When all
    three disagree there is no majority and the label from `pred1` is used.

    Args:
        pred1: sequence/array of predicted labels; also the tie-breaker.
        pred2: sequence/array of predicted labels, aligned with `pred1`.
        pred3: sequence/array of predicted labels, aligned with `pred1`.

    Returns:
        np.ndarray holding one voted label per position.

    Raises:
        ValueError: if the three inputs do not have the same shape. Misaligned
            inputs would otherwise be voted position-by-position on whatever
            prefix happens to overlap.
    """
    pred1, pred2, pred3 = (np.asarray(p) for p in (pred1, pred2, pred3))
    if not (pred1.shape == pred2.shape == pred3.shape):
        raise ValueError(
            "majority_voting expects equally shaped inputs, got "
            f"{pred1.shape}, {pred2.shape} and {pred3.shape}"
        )

    # pred1 is overruled only where pred2 and pred3 agree with each other
    # against it; in every other case (pred1 matches one of them, or all three
    # differ) pred1's label stands.
    overruled = (pred1 != pred2) & (pred2 == pred3)
    return np.where(overruled, pred2, pred1)

        

In [72]:
# Ensemble the three models. The first argument (the predictions loaded from
# file) is the one majority_voting falls back to when no two models agree.
test_rez = majority_voting(preds_t5, preds_roberta, preds_bert)

In [73]:
# Single-column frame ('rez') holding the voted class ids translated back to
# emotion names through id2em.
test_rez_df = pd.DataFrame({'rez': pd.Series(test_rez).map(id2em)})


In [74]:
# Display the final predictions as the cell output.
test_rez_df

Unnamed: 0,rez
0,sadness
1,neutral
2,sadness
3,sadness
4,neutral
...,...
520,joy
521,anger
522,sadness
523,sadness


In [75]:
# Write the predictions to a tab-separated file with neither index column nor header row.
test_rez_df.to_csv('final_preds.tsv', index=False, header=False, sep='\t')