# Classificação usando o BERT

In [1]:
!pip install -q datasets transformers[torch]
!pip install -q accelerate -U

## Carregando o dataset de treinamento

In [1]:
from datasets import load_dataset

In [2]:
# Fetch only the 'train' split of the Yelp full-review dataset.
train = load_dataset(
    'yelp_review_full',
    split='train',
)

In [3]:
# Display the Dataset summary (feature names and number of rows).
train

Dataset({
    features: ['label', 'text'],
    num_rows: 650000
})

In [4]:
# Inspect one raw example (index 1) to see the label/text record layout.
train[1]

{'label': 1,
 'text': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars."}

In [5]:
# Collect the distinct label values present in the training split.
set(train['label'])

{0, 1, 2, 3, 4}

In [6]:
import pandas as pd

In [7]:
# Preview the first five rows as a DataFrame. Slicing the Dataset first
# converts only those rows, instead of materializing every training example
# in pandas just to call .head() on the result.
pd.DataFrame(train[:5])

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


## Carregando o dataset de teste

In [7]:
import pandas as pd
from datasets import Dataset

In [8]:
# Read the test reviews from the local CSV file and preview the first rows.
df_test = pd.read_csv(filepath_or_buffer='test.csv')
df_test.head()

Unnamed: 0,ID,text
0,30978,Stopped in for a drink before a show at the Ca...
1,18993,Why am I reviewing this place? Because upon a...
2,1311,I came to Souper Salad right after getting rej...
3,49832,We went to this Bistro based on a recommendati...
4,43775,"Took my first class today, the class was aweso..."


In [9]:
# Wrap the test DataFrame as a `datasets.Dataset` so it can sit next to the
# training/validation splits.
sample_test = Dataset.from_pandas(df=df_test)

## Amostrando o conjunto de treinamento, validação e teste

In [10]:
from datasets import DatasetDict

In [11]:
# Shuffle the training split with a fixed seed so the subsets selected from
# it afterwards are reproducible.
# NOTE(review): this rebinds `train`, so re-running only this cell shuffles
# the already-shuffled data — confirm whether a separate name would be safer.
train = train.shuffle(seed=42)

In [12]:
# Two disjoint 500-example slices of the shuffled data:
# rows 0-499 for training, rows 500-999 for validation.
sample_train = train.select(range(500))
sample_valid = train.select(range(500, 1000))

In [13]:
# Bundle the three splits into a single DatasetDict keyed by split name.
dataset = DatasetDict({
    'train': sample_train,
    'valid': sample_valid,
    'test': sample_test,
})

In [14]:
# Display the per-split summary (features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 500
    })
    valid: Dataset({
        features: ['label', 'text'],
        num_rows: 500
    })
    test: Dataset({
        features: ['ID', 'text'],
        num_rows: 10000
    })
})

In [15]:
# pandas views of the sampled splits, used by the feature-extraction step.
df_train = pd.DataFrame(dataset['train'])
df_valid = pd.DataFrame(dataset['valid'])

In [16]:
# Preview the first rows of the sampled training frame.
df_train.head()

Unnamed: 0,label,text
0,4,I stalk this truck. I've been to industrial p...
1,2,"who really knows if this is good pho or not, i..."
2,4,I LOVE Bloom Salon... all of their stylist are...
3,0,"We were excited to eat here, it is difficult t..."
4,2,"So this is a place, with food. That much canno..."


# BERT

In [17]:
import torch
from transformers import AutoModel
from transformers import AutoTokenizer

In [18]:
# Checkpoint name reused by both the feature extractor and the fine-tuning run.
model_name = 'distilbert-base-uncased'

# Load the pretrained encoder and its matching tokenizer (independent loads).
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 1. Extrator de Características

In [19]:
import numpy as np

In [20]:
def hidden_states(data, batch_size=32):
    """Return the first-token ([CLS]) hidden state for every text in ``data``.

    Texts are encoded in mini-batches instead of one forward pass per row,
    which removes most of the per-call overhead on large frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'text'`` column containing the strings to embed.
    batch_size : int, default 32
        Number of texts tokenized and passed through the model at once.

    Returns
    -------
    list of numpy.ndarray
        One 1-D vector per row of ``data``, in row order, taken from
        ``last_hidden_state`` at sequence position 0.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.

    Notes
    -----
    Uses the notebook-level ``tokenizer`` and ``model``. The notebook later
    rebinds ``model`` to a sequence-classification model, so this function is
    meant to run while ``model`` is still the plain encoder.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = data['text'].tolist()

    hidden = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # padding=True pads each batch to its longest text; the attention
        # mask returned by the tokenizer is forwarded to the model with **inputs.
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)

        # CLS token: position 0 of the last hidden layer, no gradients needed.
        with torch.no_grad():
            outputs = model(**inputs).last_hidden_state[:, 0]

        # Iterating the (batch, hidden) array yields one 1-D vector per text.
        hidden.extend(outputs.numpy())

    return hidden

## Conjuntos de treinamento e teste com o estado oculto

In [21]:
# Embed the training sample and the full test set. Despite the `df_` prefix,
# both results are lists of vectors as returned by hidden_states().
df_train_hidden, df_test_hidden = hidden_states(df_train), hidden_states(df_test)

In [22]:
# Stack the per-row embeddings into feature matrices and pull the training
# labels straight from the sampled Dataset split.
X_train = np.array(df_train_hidden)
y_train = np.array(dataset['train']['label'])
X_test = np.array(df_test_hidden)

## Regressão logística

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Fit a logistic-regression classifier on the extracted embeddings, allowing
# the solver up to 3000 iterations.
clf = LogisticRegression(max_iter=3000)
clf.fit(X=X_train, y=y_train)

In [25]:
# Predict a label for every test embedding with the fitted classifier.
y_pred = clf.predict(X_test)

In [26]:
# One 'label' column of logistic-regression predictions, indexed by the
# test file's ID column.
pred = pd.DataFrame(data={'label': y_pred}, index=df_test['ID'])

In [27]:
# Write the logistic-regression predictions (ID index + label) to disk.
pred.to_csv('logr-pred.csv')

# 2. Fine-Tune

In [28]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

2024-04-23 18:14:40.784654: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [29]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook tokenizer.

    ``batch`` is indexed with the key ``"text"``; the texts are padded and
    truncated by the tokenizer and its output is returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [30]:
def encode(dataset):
    """Run ``tokenize`` over ``dataset`` through its ``map`` method.

    The call uses ``batched=True`` with ``batch_size=None`` (per the
    ``datasets`` documentation, the whole split is passed as one batch) and
    returns whatever ``map`` returns.
    """
    encoded = dataset.map(tokenize, batched=True, batch_size=None)
    return encoded

In [31]:
# Tokenize every split of the DatasetDict (train, valid and test).
dataset_encode = encode(dataset)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [32]:
# Same checkpoint, now with a sequence-classification head sized for the
# five label values (0-4) found in the training data.
# NOTE(review): this rebinds `model`, replacing the encoder loaded earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
def compute_metrics(pred):
    """Compute accuracy for an evaluation result.

    ``pred`` must expose ``predictions`` (scores; the arg-max over the last
    axis is taken as the predicted class) and ``label_ids`` (true labels).

    Returns a dict with the single key ``"accuracy"``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

In [34]:
# Trainer configuration: outputs go under "test_trainer" and evaluation is
# run on an "epoch" schedule; everything else keeps the library defaults.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

  return torch._C._cuda_getDeviceCount() > 0


In [35]:
# Wire the classification model, the tokenized train/valid splits, the
# tokenizer and the accuracy callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset_encode["train"],
    eval_dataset=dataset_encode["valid"],
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [36]:
# Fine-tune the model; the cell output reports the loss/accuracy table per
# epoch and the final TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.195143,0.446
2,No log,1.039136,0.526
3,No log,1.027927,0.564


TrainOutput(global_step=189, training_loss=1.0533996259093916, metrics={'train_runtime': 554.9138, 'train_samples_per_second': 2.703, 'train_steps_per_second': 0.341, 'total_flos': 198711728640000.0, 'train_loss': 1.0533996259093916, 'epoch': 3.0})

In [37]:
# Run the fine-tuned model over the tokenized test split, then take the
# highest-scoring class for each example.
predictions = trainer.predict(dataset_encode["test"])
y_pred = predictions.predictions.argmax(-1)

In [38]:
# One 'label' column of fine-tuned-model predictions, indexed by the test
# file's ID column.
pred = pd.DataFrame(data={'label': y_pred}, index=df_test['ID'])

In [39]:
# Write the fine-tuned model's predictions (ID index + label) to disk.
pred.to_csv('finetuning-pred.csv')