In [1]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader

import evaluate

from transformers import AutoTokenizer
from transformers import BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

ModuleNotFoundError: No module named 'numpy'

In [2]:
# Maximum number of tokens per example, passed to the tokenizer as `max_length`.
MAX_LENGTH  = 512
# Fractions of the dataset assigned to the train / validation / test splits.
TRAIN_RATIO = 0.7
VAL_RATIO   = 0.2
TEST_RATIO  = 0.1
# Batch size used by the DataLoaders of the manual training loop.
BATCH_SIZE  = 16 

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'{device=}')

In [None]:
# Load the dataset; the path is relative, so the CSV must be in the working directory.
df = pd.read_csv('go_emotions_dataset.csv')

In [None]:
# Print every column name next to its positional index.
for i, col in enumerate(df.columns):
    print(i, col)

In [None]:
# Grouping of the emotion columns into coarse sentiment classes.
# NOTE: 'relief' is a positive-sentiment emotion in the GoEmotions taxonomy,
# so it is grouped with the positive columns.
positive_columns = [
    'admiration', 'amusement', 'approval', 'caring', 'curiosity', 'desire',
    'excitement', 'gratitude', 'joy', 'love', 'optimism', 'pride',
    'realization', 'relief', 'surprise',
]
negative_columns = [
    'anger', 'annoyance', 'confusion', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'fear', 'grief', 'nervousness', 'remorse',
    'sadness',
]
neutral_columns  = ['neutral']

In [None]:
# Number of rows flagged with 1 for each positive emotion.
positive_emotions_count = {
    emotion: df[emotion].value_counts().to_dict()[1]
    for emotion in positive_columns
}

In [None]:
# Bar chart with the number of samples per positive emotion.
plt.figure(figsize=(10,8))
plt.bar(x=list(positive_emotions_count.keys()), height=list(positive_emotions_count.values()))
plt.xticks(rotation=-45)
# `plt.show` must be called; the bare name only echoes the function object.
plt.show()

In [None]:
# Number of rows flagged with 1 for each negative emotion.
negative_emotions_count = {
    emotion: df[emotion].value_counts().to_dict()[1]
    for emotion in negative_columns
}

In [None]:
# Bar chart with the number of samples per negative emotion.
plt.figure(figsize=(10, 8))
plt.bar(
    x=list(negative_emotions_count.keys()),
    height=list(negative_emotions_count.values()),
)
plt.xticks(rotation=-45)
plt.show()

In [None]:
# Total number of positive and negative emotion annotations.
count_emotions = {
    'positive': np.sum(list(positive_emotions_count.values())),
    'negative': np.sum(list(negative_emotions_count.values())),
}

In [None]:
# Display the aggregated positive / negative totals.
count_emotions

In [None]:
# Bar chart comparing the positive and negative totals.
plt.figure(figsize=(10, 5))
plt.bar(
    x=list(count_emotions.keys()),
    height=list(count_emotions.values()),
)
plt.show()

In [None]:
# Surplus of positive over negative annotations.
count_emotions['positive'] - count_emotions['negative']

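Como os dados não estão balanceados, iremos pensar em eliminar algumas labels/emoções da classe positiva. Como primeira tentativa, limitamos cada emoção positiva a no máximo 5000 amostras escolhidas aleatoriamente.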

In [None]:
# Cap every positive emotion at `threshold` randomly sampled rows and collect
# the ids of the rows that are kept.
threshold = 5000
ids = []
for emotion in positive_columns:
    emotion_rows = df[df[emotion] == 1]
    if positive_emotions_count[emotion] > threshold:
        emotion_rows = emotion_rows.sample(threshold)
    ids.extend(emotion_rows['id'].to_list())

In [None]:
# `ids` holds the rows picked by the sampling step, i.e. the rows that are kept,
# so the message reports them as selected rather than removed.
print(f'Número de amostras selecionadas: {len(ids)}')

In [None]:
# Keep only the rows whose id was collected by the sampling step.
df_balanced = df[df['id'].isin(ids)]

In [None]:
# Recount the annotations per emotion on the subsampled frame.
# A boolean mask yields 0 for an emotion without any flagged row, whereas
# `value_counts().to_dict()[1]` raises KeyError in that situation.
positive_emotions_count = {
    emotion: int((df_balanced[emotion] == 1).sum())
    for emotion in positive_columns
}

negative_emotions_count = {
    emotion: int((df_balanced[emotion] == 1).sum())
    for emotion in negative_columns
}

count_emotions = {
    'positive': np.sum(list(positive_emotions_count.values())),
    'negative': np.sum(list(negative_emotions_count.values())),
}

In [None]:
# Display the per-emotion counts after subsampling.
positive_emotions_count

In [None]:
# Bar chart comparing the positive and negative totals after subsampling.
plt.figure(figsize=(10, 5))
plt.bar(
    x=list(count_emotions.keys()),
    height=list(count_emotions.values()),
)
plt.show()

Observamos que existem muitas amostras com labels positivas e negativas simultaneamente. Assim, cortar dessa forma não é uma boa opção.

### **Contabilizando o número de labels positivas e negativas para um mesmo texto**

In [None]:
# Number of positive / negative emotion labels attached to each text.
# Vectorized row-wise sum instead of a Python-level `apply` per row.
df['num_labels_positives'] = df[positive_columns].sum(axis=1)
df['num_labels_negatives'] = df[negative_columns].sum(axis=1)

In [None]:
# Rows with as many positive as negative labels cannot be assigned to a class.
# The filter is evaluated once and reused by both messages.
n_ambiguous = int((df['num_labels_positives'] == df['num_labels_negatives']).sum())
print(f"Número de amostras com classificação duvidosa: {n_ambiguous}")
print(f"Tamanho do dataset retirando essas amostras: {df.shape[0] - n_ambiguous}")

Vamos inicialmente remover amostras que possuem um mesmo número de labels positivas e negativas: na matemática da vida, elas se cortam.

In [None]:
# Drop, by index label, the rows whose positive and negative label counts are equal.
balanced_df = df.drop(df[df['num_labels_positives'] == df['num_labels_negatives']].index)

In [None]:
def set_label(row):
    """Return 1 when the row has more positive than negative labels, else 0."""
    has_positive_majority = row['num_labels_positives'] > row['num_labels_negatives']
    if has_positive_majority:
        return 1
    return 0

In [None]:
# Binary target computed row by row with `set_label`: 1 = mostly positive, 0 otherwise.
balanced_df['imdb_label'] = balanced_df.apply(set_label, 1)

## **Adaptando os dados para o nosso modelo**


In [None]:
# Keep only the raw text and the binary label; copy so later column
# assignments do not touch `balanced_df`.
df_labeled = balanced_df[['text', 'imdb_label']].copy()
df_labeled.head()

#### **Pré-processamento**

In [None]:
def preprocess_with_emojis(text):
    """Clean `text` while leaving emojis in place.

    No emoji filtering is performed, so this is the same cleaning as
    `pre_process_text`; the name is kept for callers that use it.

    Args:
        text: raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    return pre_process_text(text)


def pre_process_text(text):
    """Normalise a raw text before tokenization.

    Steps, in order: remove URLs, remove double quotes, remove the punctuation
    characters ``- * ! , $ > < : . + ? =``, collapse runs of spaces into a
    single space and lower-case the result.

    Args:
        text: raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    preprocessed_text = re.sub(r'http\S+', '', text)  # remove links
    preprocessed_text = preprocessed_text.replace('"', '')  # remove double quotes
    # Remove other punctuation. Full stops are removed here too, so ellipses
    # need no dedicated step afterwards.
    preprocessed_text = re.sub("[-*!,$><:.+?=]", '', preprocessed_text)
    # Collapse every run of two or more spaces; replacing only the pair '  '
    # would leave a double space behind for runs of three or more.
    preprocessed_text = re.sub(r' {2,}', ' ', preprocessed_text)

    return preprocessed_text.lower()

In [None]:
# Clean every text. `Series.apply` is element-wise and has no axis argument,
# so only the function is passed (the stray positional `1` is dropped).
df_labeled['preprocessed_text'] = df_labeled['text'].apply(pre_process_text)
df_labeled.head()

## **Carregando tokenizer e modelo**

In [None]:
# Tokenizer of the pretrained checkpoint named in `base`.
base = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(base)

In [None]:
# Tokenize all preprocessed texts in one call, padding the batch and
# truncating each text to at most MAX_LENGTH tokens.
df_tokenized = tokenizer(
    df_labeled['preprocessed_text'].to_list(),
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
)

In [None]:
# Names of the fields produced by the tokenizer.
df_tokenized.keys()

In [None]:
# Shapes of the token id and attention mask tensors.
print(df_tokenized['input_ids'].shape, df_tokenized['attention_mask'].shape)

In [None]:
# Cache of already loaded `evaluate` metrics, keyed by metric name.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy and F1 from model scores and reference labels.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)
    # The metric objects are reused across calls instead of being reloaded
    # every time this function runs (it is called once per evaluated batch
    # in the manual training loop).
    accuracy = _get_metric('accuracy').compute(predictions=predictions, references=labels)["accuracy"]
    f1 = _get_metric('f1').compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

### **Dataloader**

In [None]:
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer fields with their labels."""

    def __init__(self, X, y):
        # X: mapping from field name to an indexable of per-example values.
        # y: indexable of labels, one per example.
        self.X, self.y = X, y
        self.len = len(y)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        # One example: every field of X at position idx, plus its label.
        item = {}
        for key, val in self.X.items():
            item[key] = val[idx]
        item['labels'] = self.y[idx]
        return item

In [None]:
# Wrap the tokenizer output and the binary labels (as a tensor) in a dataset.
dataset = TextDataset(df_tokenized, torch.tensor(df_labeled['imdb_label'].to_numpy()))

In [None]:
# Split sizes. The test split takes whatever remains after train and
# validation, so the three sizes always add up to the dataset length.
# Rounding each share independently can be off by one (e.g. 18 examples give
# 13 + 4 + 2 = 19), which makes `random_split` raise.
n_train_instances = int(np.round(dataset.len * TRAIN_RATIO))
n_val_instances = int(np.round(dataset.len * VAL_RATIO))
n_test_instances = dataset.len - n_train_instances - n_val_instances
print(f'Treino: {n_train_instances}, Val.: {n_val_instances}, Teste: {n_test_instances}')

In [None]:
# Seed the split so the train / validation / test partition is the same on
# every run of the notebook.
train_split, val_split, test_split = torch.utils.data.random_split(
    dataset,
    [n_train_instances, n_val_instances, n_test_instances],
    generator=torch.Generator().manual_seed(42),
)

## **Treinando com framework trainer**

In [None]:
# Pretrained BERT with a 2-label classification head, moved to the selected device.
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2, output_attentions=False, output_hidden_states=False)
model.to(device)

In [None]:
# Directory where the Trainer writes its outputs.
repo_name = "./finetuning-sentiment-go-emotions"

# Fine-tuning hyper-parameters. The batch sizes reuse the notebook-level
# BATCH_SIZE constant instead of repeating the literal value.
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=BATCH_SIZE,
   num_train_epochs=3,
   evaluation_strategy='epoch',
   save_total_limit=5
)

# Trainer that fine-tunes on the train split and evaluates on the validation
# split with `compute_metrics`.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=train_split,
   eval_dataset=val_split,
   tokenizer=tokenizer,
   data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
   compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning.
trainer.train()

In [None]:
# Evaluate on the validation split given to the Trainer as `eval_dataset`.
trainer.evaluate()

In [None]:
# Persist the weights of the Trainer-fine-tuned model.
torch.save(model.state_dict(), './model_weights_trainer.pth')

## **Treinando com loop de treinamento**

In [None]:
# Only the training data is shuffled. Validation and test batches keep a fixed
# order so that metrics computed on them (including on a truncated pass) are
# comparable between epochs and runs.
train_loader = torch.utils.data.DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_split, batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_split, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Number of passes over the training loader.
epochs = 5
# NOTE(review): not referenced by any cell visible in this notebook.
steps_per_epoch = 200
# The validation loop stops after this many batches in each epoch.
epoch_validation_samples = 50

# Start again from the pretrained checkpoint, so the manual loop does not
# continue from the weights fine-tuned with the Trainer.
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2, output_attentions=False, output_hidden_states=False)
model.to(device)

# Adam over all model parameters.
optim = torch.optim.Adam(model.parameters(), lr=2e-5)

In [None]:
def model_step(model, batch_data, cur_step, compute_evaluation=False, optimizer=None):
    """Run one forward pass on a batch and, when training, one optimisation step.

    Args:
        model: model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        batch_data: mapping with the tensors ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        cur_step: ``'train'`` puts the model in train mode and updates the
            weights; ``'val'`` puts it in eval mode.
        compute_evaluation: when True, also compute the metrics of the batch.
        optimizer: optimizer used for the update; required for ``'train'``.

    Returns:
        Tuple ``(loss * batch_size, evaluation)`` where ``evaluation`` is the
        dict returned by ``compute_metrics`` or None.

    Raises:
        ValueError: if ``cur_step == 'train'`` and no optimizer is given.
    """
    is_training = cur_step == 'train'
    if is_training:
        if optimizer is None:
            raise ValueError("an optimizer is required when cur_step == 'train'")
        model.train()
    elif cur_step == 'val':
        model.eval()

    input_ids = batch_data['input_ids'].to(device)
    attention_mask = batch_data['attention_mask'].to(device)
    labels = batch_data['labels'].to(device)

    # Gradients are tracked only when the weights are going to be updated;
    # evaluation batches would otherwise build an autograd graph for nothing.
    with torch.set_grad_enabled(is_training):
        output = model(input_ids, attention_mask=attention_mask, labels=labels.long())

    loss = output.loss
    logits = output.logits

    if is_training:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    evaluation = None
    if compute_evaluation:
        # The argmax of the logits equals the argmax of their softmax, so the
        # raw logits are enough. Both arrays are handed over as host-side
        # numpy arrays.
        evaluation = compute_metrics([
            logits.detach().cpu().numpy(),
            labels.detach().cpu().numpy(),
        ])

    # Scale the batch-mean loss by the batch size so callers can average per example.
    return loss.item() * labels.shape[0], evaluation

In [None]:
# Metrics on the training data are computed once every this many batches.
train_eval_every = 500

# Per-epoch lists of metric dicts: epoch_data[epoch]['train' | 'validation'].
epoch_data = {}

for i in range(epochs):
    epoch_data[i] = {'train': [], 'validation': []}

    # ---- training pass over the whole train loader ----
    train_running_loss = 0
    with tqdm(total=len(train_loader), desc=f"Train", unit="steps", position=0, leave=False) as train_bar:
        for batch_id, batch_data in enumerate(train_loader):
            compute_evaluation = (batch_id + 1) % train_eval_every == 0
            loss, evaluation = model_step(model, batch_data, 'train', compute_evaluation, optim)
            if compute_evaluation:
                epoch_data[i]['train'].append(evaluation)

            train_running_loss += loss
            train_bar.update(1)

    # ---- validation pass, limited to `epoch_validation_samples` batches ----
    val_running_loss = 0
    num_val_examples = 0
    with tqdm(total=epoch_validation_samples, desc=f"Validation", unit="samples", position=0, leave=False) as val_bar:
        for batch_id, batch_data in enumerate(val_loader):
            loss, evaluation = model_step(model, batch_data, 'val', True)

            val_running_loss += loss
            num_val_examples += batch_data['labels'].shape[0]

            epoch_data[i]['validation'].append(evaluation)

            val_bar.update(1)

            if (batch_id + 1) % epoch_validation_samples == 0:
                break

    # Mean accuracy over the recorded batches; NaN when nothing was recorded
    # (e.g. fewer than `train_eval_every` training batches).
    train_metrics = epoch_data[i]['train']
    val_metrics = epoch_data[i]['validation']
    train_acc = np.mean([m['accuracy'] for m in train_metrics]) if train_metrics else float('nan')
    val_acc = np.mean([m['accuracy'] for m in val_metrics]) if val_metrics else float('nan')

    train_loss = train_running_loss / len(train_loader.sampler)
    # The validation pass stops early, so the loss is averaged over the
    # examples that were actually evaluated, not over the whole split.
    valid_loss = val_running_loss / num_val_examples if num_val_examples else float('nan')

    print(f"Epoch summary [{i+1}/{epochs}]\t Train loss: {train_loss}\t Train acc: {train_acc}\t Val loss: {valid_loss}\t Val acc: {val_acc}")

In [None]:
# Persist the weights of the model trained with the manual loop.
torch.save(model.state_dict(), './model_weights_loop.pth')

## **Avaliando modelo no IMDB**

In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader

import evaluate

from transformers import AutoTokenizer
from transformers import BertForSequenceClassification
# from transformers import TrainingArguments, Trainer
# from transformers import DataCollatorWithPadding

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'{device=}')

In [None]:
# Environment check: installed torch version.
torch.__version__

In [None]:
# Environment check: installed torch version (same check as the previous cell).
import torch
torch.__version__

In [None]:
# Environment check: installed torchaudio version.
# NOTE(review): torchaudio is not used by any other cell visible in this notebook.
import torchaudio
torchaudio.__version__

In [None]:
# Environment check: installed torchvision version.
# NOTE(review): torchvision is not used by any other cell visible in this notebook.
import torchvision
torchvision.__version__

In [None]:
# Load the weights saved by the manual training loop. `map_location` remaps
# the tensors to the device in use, so weights saved from a GPU run can also
# be loaded on a CPU-only machine.
state_dict = torch.load('model_weights_loop.pth', map_location=device)

In [None]:
# Drop the 'bert.embeddings.position_ids' entry before loading the weights.
# NOTE(review): presumably this key is not expected by the installed
# transformers version — confirm.
state_dict = dict(state_dict)
state_dict.pop('bert.embeddings.position_ids', None)

In [None]:
# Rebuild the 2-label BERT classifier, load the saved weights into it and
# move it to the selected device.
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2, output_attentions=False, output_hidden_states=False)
model.load_state_dict(state_dict)
model.to(device)

In [None]:
# Load the IMDB reviews; the path is relative, so the CSV must be in the working directory.
df_imdb = pd.read_csv('IMDB_Dataset.csv')

In [None]:
# Preview the first rows.
df_imdb.head()

In [None]:
# Distinct values of the sentiment column.
df_imdb['sentiment'].unique()

In [None]:
# Binary target: 1 for 'positive' reviews, 0 for everything else.
# Vectorized comparison instead of a Python-level `apply` per row.
df_imdb['label'] = (df_imdb['sentiment'] == 'positive').astype('int64')

In [None]:
def _clean_text(text, strip_html):
    """Shared cleaning pipeline behind the two public preprocessing functions.

    Steps, in order: remove URLs, remove double quotes, optionally replace
    HTML tags by a space, remove the punctuation characters
    ``- * ! , $ > < : . + ? =``, collapse runs of spaces and lower-case.
    """
    cleaned = re.sub(r'http\S+', '', text)  # remove links
    cleaned = cleaned.replace('"', '')  # remove double quotes
    if strip_html:
        # Each tag is matched on its own (no '<' or '>' inside the match), so
        # text between an opening and a closing tag is kept, and the tag is
        # replaced by a space so the words around it are not glued together.
        cleaned = re.sub(r'</?[a-zA-Z][^<>]*>', ' ', cleaned)
    # Remove other punctuation. Full stops are removed here too, so ellipses
    # need no dedicated step afterwards.
    cleaned = re.sub("[-*!,$><:.+?=]", '', cleaned)
    # Collapse every run of two or more spaces into a single one.
    cleaned = re.sub(r' {2,}', ' ', cleaned)
    return cleaned.lower()


def preprocess_with_emojis(text):
    """Clean `text` without HTML handling, leaving emojis in place.

    Args:
        text: raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    return _clean_text(text, strip_html=False)


def pre_process_text(text):
    """Clean a review before tokenization, including HTML tag removal.

    Args:
        text: raw input string, possibly containing HTML tags such as
            ``<br />``.

    Returns:
        The cleaned, lower-cased string.
    """
    return _clean_text(text, strip_html=True)

In [None]:
# Clean every review with `pre_process_text`.
df_imdb['preprocessed_text'] = df_imdb['review'].apply(pre_process_text)

In [None]:
# Preview the frame with the new columns.
df_imdb.head()

In [None]:
# Tokenizer of the pretrained checkpoint named in `base`.
base = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(base)

In [None]:
# Tokenize the whole batch by calling the tokenizer itself. The result is a
# mapping with one tensor per field, which is what the `.keys()` / `.items()`
# calls further down require; `tokenizer.encode` does not return that mapping.
imdb_tokenized = tokenizer(
    df_imdb['preprocessed_text'].to_list(),
    return_tensors='pt',
    padding=True,
    truncation=True,
)

In [None]:
# Display the tokenizer output.
# NOTE(review): this echoes the tokenization of the whole dataset; consider showing only a summary.
imdb_tokenized

In [None]:
class TextDataset(Dataset):
    """Map-style dataset yielding one dict of tokenizer fields plus 'labels' per example."""

    def __init__(self, X, y):
        # X: mapping from field name to an indexable of per-example values.
        # y: indexable of labels, one per example.
        self.X = X
        self.y = y
        self.len = len(self.y)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        # Slice every field at position idx, then attach the label.
        item = dict((key, val[idx]) for key, val in self.X.items())
        item.update(labels=self.y[idx])
        return item

In [None]:
# Wrap the tokenized reviews and their labels (as a tensor) in a dataset.
dataset = TextDataset(imdb_tokenized, torch.tensor(df_imdb['label'].to_numpy()))

In [None]:
# Names of the fields produced by the tokenizer.
imdb_tokenized.keys()

In [None]:
# Batches of 16 examples, drawn in shuffled order.
imdb_dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
# Number of batches in the loader.
len(imdb_dataloader)

In [None]:
# Inspect one batch.
next(iter(imdb_dataloader))

In [None]:
# Accuracy of the fine-tuned model over the whole IMDB loader.
metric = evaluate.load("accuracy")

model.eval()
# The context manager closes the progress bar when the loop ends or fails.
with tqdm(total=len(imdb_dataloader), desc=f"Evaluation on imdb-dataset", unit="steps", position=0, leave=False) as bar:
    for batch in imdb_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
        bar.update(1)
metric.compute()

In [None]:
# Model output of the last evaluated batch.
outputs

## TODO: 
- Mudar de classes binárias (positivo/negativo) para múltiplas classes: alegria, tristeza, raiva,...
- Incrementar e pesquisar mais sobre o pré-processamento para modelos de linguagem natural + BERT
- Treinar o modelo
- Comparar resultados de alguma forma com o IMDB_dataset