In [1]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader

import evaluate

from transformers import AutoTokenizer
from transformers import BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

from transformers import AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MAX_LENGTH  = 512
TRAIN_RATIO = 0.7
VAL_RATIO   = 0.2
TEST_RATIO  = 0.1
BATCH_SIZE  = 16 

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'{device=}')

device=device(type='cuda', index=0)


In [4]:
df = pd.read_csv('../../go_emotions_dataset.csv')

In [5]:
for i, col in enumerate(list(df.columns)):
    print(i, col)

0 id
1 text
2 example_very_unclear
3 admiration
4 amusement
5 anger
6 annoyance
7 approval
8 caring
9 confusion
10 curiosity
11 desire
12 disappointment
13 disapproval
14 disgust
15 embarrassment
16 excitement
17 fear
18 gratitude
19 grief
20 joy
21 love
22 nervousness
23 optimism
24 pride
25 realization
26 relief
27 remorse
28 sadness
29 surprise
30 neutral


In [6]:
classes = {
    '0': ['excitement', 'joy', 'gratitude'],
    '1': ['admiration', 'approval', 'optimism'],
    '2': ['amusement', 'caring', 'desire', 'love'],
    '3': ['curiosity', 'realization', 'surprise'],
    '4': ['anger', 'nervousness'],
    '5': ['annoyance', 'confusion', 'remorse'],
    '6': ['disappointment', 'disapproval', 'disgust', 'embarrassment'],
    '7': ['sadness', 'grief'],
    '8': ['fear']
}


In [7]:
class_to_id = {v: k for k, value in classes.items() for v in value}

In [8]:
columns = class_to_id.keys()

In [9]:
print(df.shape)
df[df[columns].sum(axis=1) > 1].shape

(211225, 31)


(35199, 31)

Existem 35199 amostras com mais de uma class, mas 211225 amostras totais, o que resulta em 176.026 amostras com clase única.

Podemos não dropar, mas sim duplicar as amostras com mais de uma label (isso seria interessante para o caso em que elementos estão na mesma classe)

In [10]:
ids = df[df[columns].sum(axis=1) > 1].index

In [11]:
df['label'] = df.reset_index()['index'].map(df[columns].idxmax(1).to_dict())

In [12]:
df['class'] = df['label'].map(class_to_id)

In [13]:
df.head()

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,label,class
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,sadness,7
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,excitement,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,excitement,0
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,love,2
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,excitement,0


## Pré-processamento

In [14]:
df_labeled = df[['text', 'class']]

In [15]:
def pre_process_text(text):
    preprocessed_text = re.sub(r'\n+', '\n', text)
    preprocessed_text = re.sub(r'http\S+', '', text) # removendo links
    preprocessed_text = preprocessed_text.replace('"', '')    # removendo aspas
    preprocessed_text = re.sub(r"<\S*\ ?\/?>", '', preprocessed_text)
    # preprocessed_text = re.sub("[-*!,$><:.+?=]", '', preprocessed_text) # remove outras pontuações

    # preprocessed_text = re.sub(r'[.]\s+', '', preprocessed_text)  # removendo reticências 
    preprocessed_text = re.sub(r'  ', ' ', preprocessed_text) # removendo espaços extras
    preprocessed_text = re.sub(r'\'', "''", preprocessed_text)
    return preprocessed_text.lower()

In [16]:
df_labeled['preprocessed_text'] = df_labeled['text'].apply(pre_process_text, 1)
df_labeled.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labeled['preprocessed_text'] = df_labeled['text'].apply(pre_process_text, 1)


Unnamed: 0,text,class,preprocessed_text
0,That game hurt.,7,that game hurt.
1,>sexuality shouldn’t be a grouping category I...,0,>sexuality shouldn’t be a grouping category i...
2,"You do right, if you don't care then fuck 'em!",0,"you do right, if you don''t care then fuck ''em!"
3,Man I love reddit.,2,man i love reddit.
4,"[NAME] was nowhere near them, he was by the Fa...",0,"[name] was nowhere near them, he was by the fa..."


## Carregando modelo e tokenizador

In [17]:
base = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(base)

In [18]:
df_tokenized = tokenizer(df_labeled['preprocessed_text'].to_list(), return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH)

In [19]:
print(df_tokenized['input_ids'].shape, df_tokenized['attention_mask'].shape)

torch.Size([211225, 316]) torch.Size([211225, 316])


In [20]:
def compute_metrics(eval_pred):
    load_accuracy = evaluate.load('accuracy')

    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    return {"accuracy": accuracy}

## Dataloader

In [21]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.len = len(y)
    
    def __len__(self):
        return self.len
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.X.items()} 
        item['class'] = self.y[idx]
        return item

In [22]:
df_labeled['class'].to_numpy()

array(['7', '0', '0', ..., '1', '4', '0'], dtype=object)

In [23]:
dataset = TextDataset(df_tokenized, torch.tensor(df_labeled['class'].astype('int').to_numpy()))

In [24]:
n_train_instances = int(np.round(dataset.len * TRAIN_RATIO))
n_val_instances = int(np.round(dataset.len * VAL_RATIO))
n_test_instances = int(np.round(dataset.len * TEST_RATIO))
print(f'Treino: {n_train_instances}, Val.: {n_val_instances}, Teste: {n_test_instances}')

Treino: 147858, Val.: 42245, Teste: 21122


In [25]:
train_split, val_split, test_split = torch.utils.data.random_split(dataset, [n_train_instances, n_val_instances, n_test_instances])

## Treinando com training loop

In [26]:
model_name = 'distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(classes.keys()))
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.we

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [27]:
train_loader = torch.utils.data.DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_split, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_split, batch_size=BATCH_SIZE, shuffle=True)

In [28]:
epochs = 5
steps_per_epoch = 200
epoch_validation_samples = 50
learning_rate = 2e-5

optim = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [29]:
def model_step(model, batch_data, cur_step, compute_evaluation=False, optimizer=None):
    if cur_step == 'train':
        model.train()
    elif cur_step == 'val':
        model.eval()
    
    input_ids = batch_data['input_ids'].to(device)
    attention_mask = batch_data['attention_mask'].to(device)
    labels = batch_data['class'].to(device)

    output = model(input_ids, attention_mask=attention_mask, labels=labels.long())
    
    loss = output.loss
    logits = output.logits

    if cur_step == 'train':
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    evaluation = None
    if compute_evaluation:
        softmax_predictions = torch.nn.functional.softmax(logits, dim=1)
        evaluation = compute_metrics([softmax_predictions.detach().cpu(), labels])

    return loss.item() * labels.shape[0], evaluation

In [30]:
epoch_data = {}

for i in range(epochs):
    epoch_data[i] = {'train': [], 'validation': []}
    num_train_examples = 0
    num_val_examples = 0

    train_hits = 0
    val_hits = 0

    train_bar = tqdm(total=len(train_loader), desc=f"Train", unit="steps", position=0, leave=False)
    val_bar   = tqdm(total=epoch_validation_samples, desc=f"Validation", unit="samples", position=0, leave=False)

    train_running_loss = 0
    for batch_id, batch_data in enumerate(train_loader):
        if (batch_id + 1) % 500 == 0:
            loss, evaluation = model_step(model, batch_data, 'train', True, optim)
            epoch_data[i]['train'].append(evaluation)
        else:
            loss, _ = model_step(model, batch_data, 'train', False, optim)

        train_running_loss += loss

        train_bar.update(1)

    val_running_loss = 0
    for batch_id, batch_data in enumerate(val_loader):
        loss, evaluation = model_step(model, batch_data, 'val', True)
        
        val_running_loss += loss

        epoch_data[i]['validation'].append(evaluation)
        
        val_bar.update(1)

        if (batch_id + 1) % epoch_validation_samples == 0:
            break
    
    train_acc = np.mean([eval['accuracy'] for eval in epoch_data[i]['train']])
    val_acc = np.mean([eval['accuracy'] for eval in epoch_data[i]['validation']])
    
    train_loss = train_running_loss / len(train_loader.sampler)
    valid_loss = val_running_loss / len(val_loader.sampler)

    print(f"Epoch summary [{i+1}/{epochs}]\t Train loss: {train_loss}\t Train acc: {train_acc}\t Val loss: {valid_loss}\t Val acc: {val_acc}")

Validation: 100%|██████████| 50/50 [53:07<00:00,  1.14samples/s]    

Epoch summary [1/5]	 Train loss: 1.383182343720832	 Train acc: 0.4722222222222222	 Val loss: 0.024501521850650954	 Val acc: 0.5225


Validation: 100%|██████████| 50/50 [52:21<00:00,  1.13samples/s]    

Epoch summary [2/5]	 Train loss: 1.2709584075186922	 Train acc: 0.4895833333333333	 Val loss: 0.02590853598205588	 Val acc: 0.4925


Validation: 100%|██████████| 50/50 [52:17<00:00,  1.17samples/s]    

Epoch summary [3/5]	 Train loss: 1.1902091727762256	 Train acc: 0.5347222222222222	 Val loss: 0.02478266149487689	 Val acc: 0.5025


Validation: 100%|██████████| 50/50 [52:14<00:00,  1.18samples/s]    

Epoch summary [4/5]	 Train loss: 1.1116256486668268	 Train acc: 0.4895833333333333	 Val loss: 0.026169311874702086	 Val acc: 0.50375


Validation: 100%|██████████| 50/50 [52:15<00:00,  1.17samples/s]    

Epoch summary [5/5]	 Train loss: 1.038097089735516	 Train acc: 0.5694444444444444	 Val loss: 0.02846104448851022	 Val acc: 0.47875


In [31]:
torch.save(model.state_dict(), './model_weights_loop.pth')