# Modelos do Hugging Face 

In [1]:
import random

import numpy as np
import pandas as pd
import torch

In [2]:
# Seed both the stdlib and NumPy generators so repeated runs start from
# the same random state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Read the training file (first CSV column is used as the index) and pull
# out the text feature and the target label as separate Series.
train = pd.read_csv('train.csv', index_col=0)
X_train = train['text']
y_train = train['target']

In [4]:
# Read the test file (first CSV column is used as the index); only the
# text column is kept for prediction.
test = pd.read_csv('test.csv', index_col=0)
X_test = test.loc[:, 'text']

## Usando Pipeline

In [5]:
!pip install -q transformers

In [6]:
from transformers import pipeline

2024-04-07 18:12:27.678167: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Name the checkpoint and revision explicitly instead of relying on the
# task default, so the notebook keeps using the same weights if the
# library's default for "sentiment-analysis" ever changes.
sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [8]:
# Run the sentiment pipeline over every test text in one call.
test_texts = X_test.to_list()
y_pred = sentiment(test_texts)

In [9]:
y_pred = [0 if i['label'] == 'NEGATIVE' else 4 for i in y_pred]

In [10]:
# One-column frame of predictions, aligned to the test set's index.
target_column = {'target': y_pred}
pred = pd.DataFrame(target_column, index=X_test.index)

In [11]:
# Persist the pipeline predictions (index included) to disk.
pred.to_csv(path_or_buf='pipeline-pred.csv')

## Usando AutoModel

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [13]:
# Hub identifier of the checkpoint loaded by the tokenizer/model cell below.
model_name = "Seethal/sentiment_analysis_generic_dataset"

In [14]:
# Load the sequence-classification model and its matching tokenizer from
# the same checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
# Classify the test texts one at a time and collect the highest-scoring
# class id for each.
pred = []
with torch.no_grad():  # inference only: no gradients need to be tracked
    for text in X_test.to_list():
        # Single-text batch; over-long inputs are truncated by the tokenizer.
        sample = tokenizer(text, padding=True, return_tensors="pt", truncation=True)
        outputs = model(**sample)
        pred.append(outputs.logits.argmax().item())

In [16]:
# Convert predicted class ids to the 0/4 target encoding used by train.csv.
# NOTE(review): the model card for this checkpoint lists three classes
# (0 = negative, 1 = neutral, 2 = positive) — confirm against
# model.config before relying on this. Only the positive id maps to 4;
# negative and neutral both fall back to 0.
POSITIVE_CLASS_ID = 2
y_pred = [4 if class_id == POSITIVE_CLASS_ID else 0 for class_id in pred]

In [17]:
# Replace the list of class ids with a one-column frame of final targets,
# indexed like the test set.
pred = pd.DataFrame(
    data={'target': y_pred},
    index=X_test.index,
)

In [18]:
# Persist the pretrained-model predictions (index included) to disk.
pred.to_csv(path_or_buf='automodel-pred.csv')

## Fazendo o Fine-tuning de um AutoModel

In [19]:
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

In [20]:
# Hold out 20% of the training data for validation (fixed random_state).
# Caution: X_train / y_train are overwritten with the reduced split, so
# running this cell a second time splits the already-split data again.
split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split

In [21]:
# Re-encode the targets as 0/1 class ids: 4 becomes 1, everything else 0.
y_train = [int(label == 4) for label in y_train.to_list()]
y_val = [int(label == 4) for label in y_val.to_list()]

In [22]:
# Assemble plain frames with the column names used downstream: "text"
# for the input and "label" for the class id (the test frame has no labels).
df_train = pd.DataFrame(dict(text=X_train.to_list(), label=y_train))
df_val = pd.DataFrame(dict(text=X_val.to_list(), label=y_val))
df_test = pd.DataFrame(dict(text=X_test.to_list()))

In [23]:
# Wrap each frame in a Hugging Face Dataset.
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val, df_test)
)

In [24]:
# Bundle the three splits under the keys used later in the notebook.
splits = {
    'train': dataset_train,
    'val': dataset_val,
    'test': dataset_test,
}
datasets = DatasetDict(splits)

In [25]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [26]:
# Load the base BERT checkpoint with a two-class classification head.
# Note: this rebinds `tokenizer` and `model` from the previous section.
checkpoint = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def tokenize_function(examples, max_length=80):
    """Tokenize the "text" entry of a batch into fixed-length sequences.

    Args:
        examples: Mapping with a "text" entry holding the raw strings to
            tokenize (the notebook passes batches via ``datasets.map``).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 80, the value previously hard-coded here.

    Returns:
        The output of the module-level ``tokenizer`` for the batch, as
        PyTorch tensors.
    """
    return tokenizer(
        examples["text"],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

In [28]:
# Tokenize every split, feeding the tokenizer batches of examples.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/56000 [00:00<?, ? examples/s]

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [29]:
# Fine-tune on a fixed 1000-example subset: shuffle with a fixed seed,
# then keep the first 1000 rows.
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(range(1000))

In [30]:
!pip install -q evaluate

In [31]:
import evaluate

In [32]:
# Accuracy metric object used by compute_metrics.
metric = evaluate.load(path="accuracy")

In [33]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation outputs with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along the last axis.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [34]:
from transformers import TrainingArguments, Trainer

In [35]:
# Training configuration: outputs are written under "test_trainer" and
# the evaluation strategy is set to "epoch"; every other option keeps
# its library default.
training_args = TrainingArguments(
    evaluation_strategy="epoch",
    output_dir="test_trainer",
)

  return torch._C._cuda_getDeviceCount() > 0


In [36]:
# Wire the model, the 1000-example training subset, the full validation
# split and the accuracy callback into a Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=tokenized_datasets["val"],
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [37]:
# Run fine-tuning. The call is left as the cell's last expression so the
# returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.526419,0.753571
2,No log,0.614517,0.764214
3,No log,0.84546,0.776214


TrainOutput(global_step=375, training_loss=0.4108604736328125, metrics={'train_runtime': 1038.4252, 'train_samples_per_second': 2.889, 'train_steps_per_second': 0.361, 'total_flos': 123333307200000.0, 'train_loss': 0.4108604736328125, 'epoch': 3.0})

In [38]:
# Predict on the tokenized test split and take the highest-scoring class
# per row.
predictions = trainer.predict(tokenized_datasets["test"])
test_scores = predictions.predictions
y_pred = test_scores.argmax(axis=1)

In [39]:
# Map the 0/1 class ids back to the 0/4 target encoding (1 -> 4, else 0).
y_pred = [0 if class_id != 1 else 4 for class_id in y_pred]

In [40]:
# One-column frame of fine-tuned predictions, indexed like the test set.
pred = pd.DataFrame(y_pred, index=X_test.index, columns=['target'])

In [41]:
# Persist the fine-tuned model's predictions (index included) to disk.
pred.to_csv(path_or_buf='finetuning-pred.csv')