<a href="https://colab.research.google.com/github/juliendymendes/review-classifier/blob/main/trabalho2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Construção do dataset

In [None]:
!pip install datasets

In [42]:
import numpy as np
from datasets import load_dataset
import torch
import torch.nn as nn

from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel  # or BertModel, for BERT without pretraining heads
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertForSequenceClassification

import pandas as pd
import os

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [44]:
os.chdir('/content/drive/MyDrive/datasets/amazon')

In [45]:
data = pd.read_csv('reviews.csv')

In [None]:
data['label'].value_counts()

In [47]:
ytrain_global = np.array(data['label'].tolist())[:1000]
xtrain_global = np.array(data['text'])[:1000]

In [None]:
np.unique(ytrain_global,return_counts=True)

## Dataloader

In [49]:
import sklearn.model_selection as model_selection

In [50]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [51]:
xtrain, xval, ytrain, yval = model_selection.train_test_split(xtrain_global, ytrain_global, test_size=0.30, random_state=42,shuffle=True)

In [52]:
train_encodings = tokenizer(xtrain.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
val_encodings = tokenizer(xval.tolist(), truncation=True, padding=True,max_length=512, return_tensors='pt')

In [53]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        label = torch.tensor(self.labels[idx])
        return (item,label)

    def __len__(self):
        return len(self.labels)

In [54]:
ds_train = MyDataset(train_encodings,ytrain)
ds_val   = MyDataset(val_encodings,yval)

In [55]:
from torch.utils.data import DataLoader

In [56]:
dl_train = DataLoader(ds_train,batch_size=8)
dl_eval  = DataLoader(ds_val,batch_size=8)

In [57]:
x,y = next(iter(dl_train))

In [58]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [59]:
model = BertModel.from_pretrained("bert-base-multilingual-cased")

In [None]:
model.to(device)

In [61]:
batch = {k: v.to(device) for k, v in x.items()}

In [62]:
out = model(**batch)

In [None]:
out.pooler_output

In [None]:
model.eval()

In [65]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5)

In [66]:
from transformers import get_scheduler

In [67]:
num_epochs = 10

In [68]:
loss_fct = nn.CrossEntropyLoss()

In [None]:
for epoch in range(num_epochs):
    lepochs = []
    for batch,y in dl_train:
        batch = {k: v.to(device) for k, v in batch.items()}
        y     = y.to(device)
        outputs = model(**batch)
        loss = loss_fct(outputs.pooler_output,y)
        loss.backward()
        lepochs.append(loss.cpu().item())
        optimizer.step()
        #lr_scheduler.step()
        optimizer.zero_grad()
    print(np.mean(lepochs))

In [70]:
ytrue = []
ypred = []
for batch,y in dl_eval:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    predictions = torch.argmax(outputs.pooler_output, dim=-1)
    ytrue += y.tolist()
    ypred += predictions.cpu().tolist()

## Avaliação do modelo

In [71]:
from sklearn import metrics

In [72]:
print(metrics.classification_report(ytrue,ypred))

              precision    recall  f1-score   support

           0       0.88      0.41      0.56        17
           1       0.55      0.92      0.69        13

    accuracy                           0.63        30
   macro avg       0.71      0.67      0.62        30
weighted avg       0.73      0.63      0.61        30



## Classification report - bert-base-multilingual-cased

   

```
    precision    recall  f1-score   support

           0       0.88      0.41      0.56        17
           1       0.55      0.92      0.69        13

    accuracy                           0.63        30
   macro avg       0.71      0.67      0.62        30
weighted avg       0.73      0.63      0.61        30
```



## Classification report - distilbert-base-uncased



```
    precision    recall  f1-score   support

           0       0.65      0.65      0.65        17
           1       0.54      0.54      0.54        13

    accuracy                           0.60        30
   macro avg       0.59      0.59      0.59        30
weighted avg       0.60      0.60      0.60        30
```



## Bot do Telegram


In [None]:
!pip install python-telegram-bot --upgrade

In [74]:
from telegram import ForceReply, Update
from telegram.ext import Application, CommandHandler, ContextTypes, MessageHandler, filters

In [75]:
async def echo(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    review = update.message.text
    model.to(device)
    text = review
    inputs = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    tensor_output = outputs.pooler_output
    softmax_result = torch.nn.functional.softmax(tensor_output, dim=-1)
    pred = torch.argmax(softmax_result, dim=-1).item()
    msg = ''
    if pred == 0:
        msg = "Esse é um comentário negativo!"
    elif pred == 1:
        msg = "Esse é um comentário positivo!"
    else:
        msg = 'Não, entendi. \n'+ text
    await update.message.reply_text(msg)

In [76]:
async def start(update: Update, context):
    await update.message.reply_text("Olá! Envie uma avaliação de produto e direi se é um comentário positivo ou negativo!")

In [77]:
async def main2() -> None:
    """Start the bot."""
    # Create the Application and pass it your bot's token.
    application = Application.builder().token("7970045384:AAGSxyiaCX77efWwZFmKXUf5bPC1YkHpvBo").build()
    await application.initialize()

    # on non command i.e message - echo the message on Telegram
    application.add_handler(CommandHandler("start", start))
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, echo))

    # Run the bot until the user
    await application.start()

    await application.updater.start_polling()
    print("Bot iniciado. Envie mensagens no Telegram.")

In [78]:
# application = Application.builder().token("7970045384:AAGSxyiaCX77efWwZFmKXUf5bPC1YkHpvBo").build()
# application.stop()

In [79]:
import nest_asyncio
import asyncio

nest_asyncio.apply()

asyncio.run(main2())


Bot iniciado. Envie mensagens no Telegram.
