# GIST-Embedding-v0 Model
## 6 Emotions + Neutral

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported above.
warnings.filterwarnings('ignore')

# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Import Datasets

In [2]:
# Read the train and test splits from CSV files in the working directory.
# Later cells read a 'text' column and treat every other column as a target.
df_train = pd.read_csv('train6.csv')
df_test = pd.read_csv('test6.csv')

### Model Parameters

In [3]:
# Token length every example is truncated / padded to by the dataset class.
MAX_LEN = 200
# Batch sizes handed to the train and test DataLoaders respectively.
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
# Number of passes over the training set.
EPOCHS = 20
# Learning rate passed to the optimizer.
LEARNING_RATE = 2e-5
# Hugging Face Hub identifier used for both the tokenizer and the encoder.
model_id = 'avsolatorio/GIST-Embedding-v0'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Every column of the training frame except 'text' is treated as a target.
target_cols = [col for col in df_train.columns if col not in ['text']]

Downloading tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load Datasets and Model

In [4]:
class MyDataset(Dataset):
    """Map-style dataset yielding tokenized text with multi-label targets.

    Args:
        df: DataFrame with a ``text`` column plus one column per label.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            returns a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` for a single string.
        max_len: Length every example is truncated / padded to.
        label_cols: Optional sequence of label column names. When omitted,
            the module-level ``target_cols`` list is used.
    """

    def __init__(self, df, tokenizer, max_len, label_cols=None):
        if label_cols is None:
            # Backward-compatible default: the notebook-level column list.
            label_cols = target_cols
        self.df = df
        self.max_len = max_len
        self.text = df.text
        self.tokenizer = tokenizer
        self.targets = df[list(label_cols)].values

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors
            of length ``max_len``) and ``targets`` (float tensor with one
            entry per label column).
        """
        # DataLoader samplers emit positions in range(len(self)); ``.iloc``
        # keeps the lookup positional so frames with a non-default index
        # (filtered, shuffled, concatenated) still resolve the right row.
        text = self.text.iloc[index]
        # Calling the tokenizer directly replaces the deprecated
        # ``encode_plus`` entry point; the arguments are unchanged.
        inputs = self.tokenizer(
            str(text),
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
        }

In [5]:
# Wrap both frames in the dataset class, sharing one tokenizer and one
# maximum sequence length.
train_dataset = MyDataset(df_train, tokenizer, MAX_LEN)
test_dataset = MyDataset(df_test, tokenizer, MAX_LEN)

In [6]:
# Training batches are reshuffled on every pass; test batches keep the frame
# order. Both loaders use 4 worker processes and pinned host memory.
train_loader = DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE,
                          num_workers = 4, shuffle = True, pin_memory = True)
test_loader = DataLoader(test_dataset, batch_size = VALID_BATCH_SIZE,
                          num_workers = 4, shuffle = False, pin_memory = True)

In [7]:
class ModelClass(torch.nn.Module):
    """Pretrained encoder followed by a linear multi-label head.

    Args:
        num_labels: Number of output logits, one per target column.
            Defaults to 7, the value previously hard-coded.
    """

    def __init__(self, num_labels=7):
        super().__init__()
        # The attribute keeps its historical name ``roberta`` so state_dict
        # keys stay stable, although the checkpoint loads as a BertModel.
        self.roberta = AutoModel.from_pretrained(model_id)
        # Size the head from the encoder config instead of a literal 768 so
        # the class keeps working if ``model_id`` points at another width.
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits, one row per input sequence.

        Args:
            ids: Token id tensor.
            mask: Attention mask tensor matching ``ids``.
            token_type_ids: Segment id tensor matching ``ids``.
        """
        # With return_dict=False the encoder returns a tuple; the second
        # element is the pooled representation fed to the head.
        _, features = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.fc(features)
    
# Instantiate the classifier and move its parameters to the selected device.
# The trailing expression is also what the notebook cell displays.
model = ModelClass()
model.to(device)

Downloading config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

ModelClass(
  (roberta): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [8]:
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between logits and targets.

    The sigmoid is applied inside the loss, so ``outputs`` must be raw
    logits and ``targets`` a float tensor of the same shape.
    """
    bce_with_logits = torch.nn.functional.binary_cross_entropy_with_logits
    return bce_with_logits(outputs, targets)

# Use PyTorch's AdamW: the transformers implementation is deprecated in favour
# of it. eps=1e-6 keeps the epsilon the transformers class used by default.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=1e-6,
)

### Model Training

In [9]:
def train(epoch, log_every=700):
    """Run one optimisation pass over ``train_loader``.

    Args:
        epoch: Epoch number, used only in the progress message.
        log_every: Print the batch loss every this many steps (step 0
            included). A falsy value (``0`` or ``None``) disables logging.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        logits = model(ids, mask, token_type_ids)
        loss = loss_fn(logits, targets)

        # Guard the modulo so log_every=0 means "silent" rather than raising.
        if log_every and step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()
        # Clear gradients right after the update so the next batch starts
        # from a clean slate.
        optimizer.zero_grad()

In [10]:
# Train for EPOCHS full passes; train() prints the loss of selected batches.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.6983000636100769
Epoch: 1, Loss:  0.3168036937713623
Epoch: 2, Loss:  0.35593801736831665
Epoch: 3, Loss:  0.2633029818534851
Epoch: 4, Loss:  0.3136291205883026
Epoch: 5, Loss:  0.20862999558448792
Epoch: 6, Loss:  0.1560714840888977
Epoch: 7, Loss:  0.13960710167884827
Epoch: 8, Loss:  0.08609598129987717
Epoch: 9, Loss:  0.09010930359363556
Epoch: 10, Loss:  0.08847220987081528
Epoch: 11, Loss:  0.040185511112213135
Epoch: 12, Loss:  0.06265750527381897
Epoch: 13, Loss:  0.04331382364034653
Epoch: 14, Loss:  0.02921956405043602
Epoch: 15, Loss:  0.028555866330862045
Epoch: 16, Loss:  0.03306639939546585
Epoch: 17, Loss:  0.030710652470588684
Epoch: 18, Loss:  0.04199114069342613
Epoch: 19, Loss:  0.009549155831336975


### Model Testing

In [11]:
def validation():
    """Score every batch of ``test_loader`` with the current model.

    Returns:
        A ``(fin_outputs, fin_targets)`` pair of nested lists with one row
        per example: sigmoid probabilities per label, and the float targets.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for batch in test_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            logits = model(ids, mask, token_type_ids)

            # Targets are only collected, never fed to the model, so they
            # stay on the host instead of making a device round trip.
            fin_targets.extend(batch['targets'].to(dtype=torch.float).tolist())
            # No autograd graph exists under no_grad, so no detach is needed.
            fin_outputs.extend(torch.sigmoid(logits).cpu().tolist())
    return fin_outputs, fin_targets

In [12]:
# Collect probabilities and targets, then binarise at the 0.5 threshold.
outputs, targets = validation()
outputs = np.array(outputs) >= 0.5

# accuracy_score on multi-label input counts a row as correct only when every
# label matches, which is why it sits well below the per-label metrics.
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro, f1_score_macro = (
    metrics.f1_score(targets, outputs, average=avg) for avg in ('micro', 'macro')
)
precision_micro, precision_macro = (
    metrics.precision_score(targets, outputs, average=avg) for avg in ('micro', 'macro')
)
recall_micro, recall_macro = (
    metrics.recall_score(targets, outputs, average=avg) for avg in ('micro', 'macro')
)

# One report line per metric, in the same order as they were computed.
print(
    f"Accuracy = {accuracy}",
    f"F1 Score (Micro) = {f1_score_micro}",
    f"F1 Score (Macro) = {f1_score_macro}",
    f"Precision (Micro) = {precision_micro}",
    f"Precision (Macro) = {precision_macro}",
    f"Recall (Micro) = {recall_micro}",
    f"Recall (Macro) = {recall_macro}",
    sep="\n",
)

Accuracy = 0.280239039246107
F1 Score (Micro) = 0.6927572587532024
F1 Score (Macro) = 0.6082000081097119
Precision (Micro) = 0.6742597402597402
Precision (Macro) = 0.5925841562271256
Recall (Micro) = 0.7122983207112282
Recall (Macro) = 0.6274014297370183
