In [1]:
!pip install datasets transformers huggingface_hub scikit-learn



In [2]:
from datasets import concatenate_datasets, load_dataset
import pandas as pd
from datasets import Dataset

In [3]:
data_dir = '../data/'
data_files = {"train": data_dir+"train_sa.csv", "dev": data_dir+"dev_sa.csv", "test": data_dir+"test_sa.csv"}
dataset_splits = {}
for split, filepath in data_files.items():
    df = pd.read_csv(data_dir+filepath)
    df = pd.DataFrame(df)
    dataset_splits[split] = Dataset.from_pandas(df, split=split)

In [4]:
dataset_splits

{'train': Dataset({
     features: ['review', 'len_review_text', 'review_title', 'review_text', 'overall_rating', 'rating'],
     num_rows: 21708
 }),
 'dev': Dataset({
     features: ['review', 'len_review_text', 'review_title', 'review_text', 'overall_rating', 'rating'],
     num_rows: 9038
 }),
 'test': Dataset({
     features: ['submission_date', 'reviewer_id', 'product_id', 'product_name', 'site_category_lv1', 'review_title', 'overall_rating', 'recommend_to_a_friend', 'review_text', 'review', 'len_review_text', 'rating'],
     num_rows: 22559
 })}

In [5]:
dataset_splits['train'][0]

{'review': 'NÃO RECOMENDO A LOJA ÓTICA SHOP COMO PARCEIRO DA LOJAS AMERICANAS\nIndependente do "parceiro" estou muito decepcionada com a má-fé da lojas americanas.... não ser trata de equívoco, enviar um modelo próximo  ao q foi comprado é equívoco, já enviar um modelo q não tem absolutamente nada a ver com o q foi comprado (pedi dourado veio prata, tamanho da caixa, largura pulseira, tudo muito diferente) é má fé!!!!!!  Ainda mais porque a nota fiscal veio certa, o produto q não.   É um absurdooooo!!!!!!!!!!!!!!',
 'len_review_text': 439,
 'review_title': 'NÃO RECOMENDO A LOJA ÓTICA SHOP COMO PARCEIRO DA LOJAS AMERICANAS',
 'review_text': 'Independente do "parceiro" estou muito decepcionada com a má-fé da lojas americanas.... não ser trata de equívoco, enviar um modelo próximo  ao q foi comprado é equívoco, já enviar um modelo q não tem absolutamente nada a ver com o q foi comprado (pedi dourado veio prata, tamanho da caixa, largura pulseira, tudo muito diferente) é má fé!!!!!!  Ainda

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased", max_seq_length=512)

In [7]:
def preprocess_function(examples):
    sentences_out = tokenizer(examples["review"], truncation=True, max_length=512)
    sentences_out['labels'] = []
    for rating in examples['rating']:
        sentences_out['labels'].append(0 if rating == -1 else 1)
    return sentences_out
 
tokenized_train = dataset_splits["train"].shuffle().map(preprocess_function, batched=True)
tokenized_dev = dataset_splits["dev"].shuffle().map(preprocess_function, batched=True)
tokenized_test = dataset_splits["test"].shuffle().map(preprocess_function, batched=True)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

In [8]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("neuralmind/bert-base-portuguese-cased", num_labels=2)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [10]:
import numpy as np
from datasets import load_metric
 
def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    #load_roc_auc = load_metric("roc_auc")
    load_precision = load_metric("precision")
    load_recall = load_metric("recall")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    print('labels', labels, predictions)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    #roc_auc = load_roc_auc.compute(prediction_scores=logits, references=labels)["roc_auc"]
    precision = load_precision.compute(predictions=predictions, references=labels)["precision"]
    recall = load_recall.compute(predictions=predictions, references=labels)["recall"]
    return {
            "accuracy": accuracy,
            "f1": f1,
            #"roc_auc": roc_auc,
            "precision": precision,
            "recall": recall,
    }

In [14]:
from transformers import TrainingArguments, Trainer
 
repo_name = "finetuning-sentiment-model-v0"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=False,
    evaluation_strategy="steps",
    eval_steps=20,
    logging_steps=100,
    label_names=["labels"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    #est_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [15]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: rating, overall_rating, review, review_text, review_title, len_review_text. If rating, overall_rating, review, review_text, review_title, len_review_text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 21708
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 2714
  Number of trainable parameters = 108924674


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
20,No log,0.184843,0.956074,0.967997,0.992561,0.944619


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: rating, overall_rating, review, review_text, review_title, len_review_text. If rating, overall_rating, review, review_text, review_title, len_review_text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9038
  Batch size = 8


labels [1 1 1 ... 1 1 0] [1 1 1 ... 1 1 0]


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 8.00 GiB total capacity; 6.82 GiB already allocated; 0 bytes free; 7.24 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [16]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: review_text, review, len_review_text, rating, review_title, overall_rating. If review_text, review, len_review_text, rating, review_title, overall_rating are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9038
  Batch size = 8


labels [1 1 1 ... 1 1 1] [1 1 1 ... 1 1 0]


{'eval_loss': 0.10896646231412888,
 'eval_accuracy': 0.9731135206904182,
 'eval_f1': 0.9806173725771715,
 'eval_precision': 0.9944992719624656,
 'eval_recall': 0.9671176840780366,
 'eval_runtime': 24.0212,
 'eval_samples_per_second': 376.251,
 'eval_steps_per_second': 47.042,
 'epoch': 2.0}

In [17]:
trainer.evaluate(tokenized_test)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: review_text, review, len_review_text, product_id, reviewer_id, site_category_lv1, submission_date, rating, product_name, recommend_to_a_friend, review_title, overall_rating. If review_text, review, len_review_text, product_id, reviewer_id, site_category_lv1, submission_date, rating, product_name, recommend_to_a_friend, review_title, overall_rating are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 22559
  Batch size = 8


labels [0 1 1 ... 1 1 1] [0 1 1 ... 1 1 1]


{'eval_loss': 0.10881324857473373,
 'eval_accuracy': 0.973757702025799,
 'eval_f1': 0.9811007534159111,
 'eval_precision': 0.9912269384595536,
 'eval_recall': 0.9711793704967766,
 'eval_runtime': 56.8966,
 'eval_samples_per_second': 396.491,
 'eval_steps_per_second': 49.564,
 'epoch': 2.0}

In [18]:
trainer.predict(tokenized_test)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: review_text, review, len_review_text, product_id, reviewer_id, site_category_lv1, submission_date, rating, product_name, recommend_to_a_friend, review_title, overall_rating. If review_text, review, len_review_text, product_id, reviewer_id, site_category_lv1, submission_date, rating, product_name, recommend_to_a_friend, review_title, overall_rating are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 22559
  Batch size = 8


labels [0 1 1 ... 1 1 1] [0 1 1 ... 1 1 1]


PredictionOutput(predictions=array([[ 3.526227 , -3.4901795],
       [-3.490226 ,  3.7005675],
       [-3.02391  ,  3.1761894],
       ...,
       [-3.4590077,  3.66023  ],
       [-3.4008791,  3.5834813],
       [-3.4836538,  3.748986 ]], dtype=float32), label_ids=array([0, 1, 1, ..., 1, 1, 1]), metrics={'test_loss': 0.10881324857473373, 'test_accuracy': 0.973757702025799, 'test_f1': 0.9811007534159111, 'test_precision': 0.9912269384595536, 'test_recall': 0.9711793704967766, 'test_runtime': 59.0316, 'test_samples_per_second': 382.151, 'test_steps_per_second': 47.771})

In [2]:
from transformers import pipeline
pipe = pipeline('text-classification', model='finetuning-sentiment-model-v0/checkpoint-2714/')

In [5]:
pipe(["Celular muito bom Super recomendo E sim chegou com 12 dias de antecedência Muito bom mesmo", "Olá! Não, só possuímos o modelo com esse tamanho de tela. Permanecemos a disposição!"])

[{'label': 'LABEL_1', 'score': 0.9990617632865906},
 {'label': 'LABEL_0', 'score': 0.9875191450119019}]

In [6]:
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from typing import Optional
import pandas as pd
import os
from tqdm.notebook import tqdm

class CustomBERT(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return pooled_output, logits

tokenizer = AutoTokenizer.from_pretrained("finetuning-sentiment-model-v0/checkpoint-2714/")
model = CustomBERT.from_pretrained("finetuning-sentiment-model-v0/checkpoint-2714/")


data_dir = '../data/'
data_files = ["train_rec", "test"]
for filepath in data_files:
    df = pd.read_csv(data_dir+filepath+'.csv')
    embeddings = []
    scores = []
    for text in tqdm(df['review']):
        input_text = text
        # tokenizer-> token_id
        input_ids = tokenizer.encode(input_text, add_special_tokens=True, max_length=512)
        # input_ids: [101, 2182, 2003, 2070, 3793, 2000, 4372, 16044, 102]
        input_ids = torch.tensor([input_ids])
        pooled_output, logits = model(input_ids)

        embedding = pooled_output[0].cpu().detach().numpy()
        score = torch.nn.functional.softmax(logits[0]).cpu().detach().numpy()[1]
        
        embeddings.append(embedding)
        scores.append(score)
    df['embedding'] = embeddings
    df['scores'] = scores
    df.to_parquet(data_dir+filepath+'.parquet')

  0%|          | 0/81318 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  score = torch.nn.functional.softmax(logits[0]).cpu().detach().numpy()[1]


  0%|          | 0/25757 [00:00<?, ?it/s]

  score = torch.nn.functional.softmax(logits[0]).cpu().detach().numpy()[1]


In [4]:
df.to_parquet('../data/t.parquet')

In [40]:
model(input_ids).cpu().detach().numpy()

AttributeError: 'tuple' object has no attribute 'cpu'

In [45]:
torch.nn.functional.softmax(model(input_ids)[1][0]).cpu().detach().numpy()

  torch.nn.functional.softmax(model(input_ids)[1][0]).cpu().detach().numpy()


array([0.8667743, 0.1332257], dtype=float32)