In [1]:
!pip install transformers
!pip install torch



In [2]:
import pandas as pd

In [3]:
tripadvisor_data = pd.read_csv('https://raw.githubusercontent.com/isalbineli/testes/main/BASE_TREINO_LISTA.csv',sep = ',')

df = tripadvisor_data.sample(frac=.4).reset_index(drop=True)

df.head()

Unnamed: 0,review_text,ambiente,comida,geral,preco,servico,seguranca
0,"Local bonito, bem frequentado, preços médios, ...",1,1,2,1,0,2
1,"Ótimo restaurante, a comida é deliciosa, o pre...",1,1,1,1,1,2
2,É um restaurante premiado com preços justos pe...,2,2,1,1,2,2
3,"Variedades de pratos e sobremesas, espaço acon...",1,1,2,1,2,2
4,Frequento o rodízio de massas ao menos um domi...,1,1,2,1,2,2


In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [5]:
#label_mapping = {'negativo': 0, 'positivo': 1, 'neutro': 2}


In [6]:
#df.replace(label_mapping, inplace=True)

#df.head()

In [7]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
# Tokenizar os textos
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [9]:
def tokenize_data(data, tokenizer, max_len=128):
    input_ids = []
    attention_masks = []

    for text in data['review_text']:
        encoded_text = tokenizer.encode_plus(
            text,
            max_length=max_len,
            add_special_tokens=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    labels = torch.tensor(data[['ambiente', 'servico', 'comida', 'geral', 'preco', 'seguranca']].values)

    return TensorDataset(input_ids, attention_masks, labels)

In [10]:
train_dataset = tokenize_data(train_data, tokenizer)
test_dataset = tokenize_data(test_data, tokenizer)

In [11]:
# Criar DataLoader
batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [12]:
# Modelo BERT para classificação de sequência
model = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=3)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from torch import nn
model.classifier = nn.Linear(model.config.hidden_size, 6)

In [14]:
# Função de perda
criterion = torch.nn.CrossEntropyLoss()



In [15]:
# Treinamento do modelo
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, torch.argmax(labels, dim=1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')



Epoch 1/5, Loss: 0.7395
Epoch 2/5, Loss: 0.3127
Epoch 3/5, Loss: 0.2310
Epoch 4/5, Loss: 0.1671
Epoch 5/5, Loss: 0.1303


In [16]:
# Avaliação do modelo
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1).tolist()
        true_labels = torch.argmax(labels, dim=1).tolist()

        all_preds.extend(preds)
        all_labels.extend(true_labels)

accuracy = accuracy_score(all_labels, all_preds)
classification_rep = classification_report(all_labels, all_preds)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', classification_rep)

Accuracy: 0.8848
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.88      0.92       199
           1       0.85      0.97      0.91       114
           2       0.75      0.60      0.67        10
           3       0.64      0.92      0.75        25
           4       0.00      0.00      0.00         7
           5       0.00      0.00      0.00         1

    accuracy                           0.88       356
   macro avg       0.53      0.56      0.54       356
weighted avg       0.88      0.88      0.88       356



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

# Localizar a pasta com os arquivos
import os
os.chdir('/content/drive/MyDrive/MODELOS')

# New Section

In [18]:
# Salvar o modelo
caminho_do_arquivo = '/content/drive/MyDrive/MODELOS/modelo_multirotulomulticlasse_vf3.pth'
torch.save({
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, caminho_do_arquivo)

In [None]:
import matplotlib.pyplot as plt

# Plotando a acurácia de treinamento e validação
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

# Configurações do gráfico
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1])

# Exibindo o gráfico
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

def show_confusion_matrix(confusion_matrix):
  hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
  hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment');

# Cálculo da matriz de confusão
y_pred = torch.argmax(model(**test_encodings).logits, dim=1)
cm = confusion_matrix(val_labels, y_pred)

# Exibição da matriz de confusão
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
df_dadosparaanalise = pd.read_csv('https://raw.githubusercontent.com/zeneto11/paper-multilabel_text_classifier_restaurant_reviews/main/estudo_de_caso/dataset-coco_bambu.csv',sep = ',')

In [None]:
#Criação do dataframe
df2 = df_dadosparaanalise.sample(frac=0.0001).reset_index(drop=True)

# Remover sentenças nulas
df2 = df2.dropna().reset_index(drop=True)

df2.head()

In [None]:
# Verificar número de comentários
total_comentarios = df2['comentario'].nunique()
print("Total de comentários:", total_comentarios)

In [None]:
texto = df2['comentario'].tolist()

In [None]:


def analyze_sentiment(text, model, tokenizer):
    # Tokeniza o texto
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Obtém as previsões do modelo
    with torch.no_grad():
        outputs = model(**inputs)

    # Obtém as probabilidades para cada classe
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1).tolist()[0]

    # Mapeia as probabilidades para rótulos
    class_mapping = {0: 'ambiente', 1: 'comida', 2: 'geral', 3: 'preco', 4: 'servico', 5: 'seguranca'}
    sentiment_labels = {class_mapping[i]: prob for i, prob in enumerate(probabilities)}

    return sentiment_labels

# Exemplo de uso
texto_a_analisar = "Eu realmente gostei do serviço, mas a comida estava apenas ok."
sentimentos = analyze_sentiment(texto_a_analisar, model, tokenizer)

print(f'Sentimentos Previstos: {sentimentos}')


In [None]:
def analyze_sentiment(text, model, tokenizer):
    # Tokeniza o texto
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Obtém as previsões do modelo
    with torch.no_grad():
        outputs = model(**inputs)

    # Obtém as probabilidades para cada classe
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1).tolist()[0]

    # Mapeia as probabilidades para rótulos e converte para a escala 0-2
    class_mapping = {0: 'ambiente', 1: 'comida', 2: 'geral', 3: 'preco', 4: 'servico', 5: 'seguranca'}
    sentiment_labels = {class_name: int(prob * 2) for class_name, prob in zip(class_mapping.values(), probabilities)}

    return sentiment_labels

# Exemplo de uso
texto_a_analisar = "lugar ruim"
sentimentos = analyze_sentiment(texto_a_analisar, model, tokenizer)

print(f'Sentimentos Previstos: {sentimentos}')