In [24]:
from transformers import AutoTokenizer
from pysentimiento.preprocessing import preprocess_tweet

In [37]:
from datasets import load_dataset

ds = load_dataset("cardiffnlp/tweet_sentiment_multilingual", "spanish")

ds

Found cached dataset tweet_sentiment_multilingual (/home/darkstar/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/spanish/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1839
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 324
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 870
    })
})

In [43]:
ds["train"][:10]

{'text': ['estoy hasta el ojete de que me digáis que tengo cara de mala leche',
  '@user Por?  Tenía pensado verla después de la segunda de Daredevil',
  'Esto de estar feliz mola',
  'Ya no es tan divertido',
  '@user te recuerdo que soy una persona que tiene criterio, equivocado, pero lo tengo',
  '@user @user @user con una pequeña donación hará felices a miles de chicas que no tienen  #asociacionmariloli',
  'He probado una nueva espuma para el pelo y sí que lo deja más rizado pero se queda como efecto gomina y no me gusta.',
  '@user Ojalá pudiera darte el abrazo en vivo o al menos acompañarte hoy. Siento mucho lo de tu primo',
  '@user aquí tienes a mi bae aka una egipcia preciosa que esta aprendiendo español. Os llevareis bien  @user',
  '@user ya somos dos, que triste'],
 'label': [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]}

In [26]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "pysentimiento/robertuito-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128


In [27]:
from pysentimiento.preprocessing import preprocess_tweet
preprocessed_ds = ds.map(lambda ex: {"text": preprocess_tweet(ex["text"], lang="es")})

Loading cached processed dataset at /home/darkstar/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/spanish/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-63d17802209172be.arrow
Loading cached processed dataset at /home/darkstar/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/spanish/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-383aafcdcf8ebb9e.arrow
Loading cached processed dataset at /home/darkstar/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/spanish/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-085d67da889331b0.arrow


In [28]:
tokenized_ds = preprocessed_ds.map(
    lambda batch: tokenizer(batch["text"], padding=False, truncation=True),
    batched=True, batch_size=32
)

Loading cached processed dataset at /home/darkstar/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/spanish/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-378eb711774401ee.arrow
Loading cached processed dataset at /home/darkstar/.cache/huggingface/datasets/cardiffnlp___tweet_sentiment_multilingual/spanish/0.1.0/936afd3cde120393429606f681b3b48d526873c45114068973f71e296ce80605/cache-8f0e3e6aba12a471.arrow


Map:   0%|          | 0/870 [00:00<?, ? examples/s]

In [10]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

training_args = TrainingArguments(
    per_device_train_batch_size=32,
    output_dir="test_trainer",
    do_eval=True,
    evaluation_strategy="epoch",
    num_train_epochs=5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()

{'eval_loss': 0.6657280921936035, 'eval_f1': 0.6854166666666667, 'eval_recall': 0.6944444444444445, 'eval_runtime': 0.5241, 'eval_samples_per_second': 618.152, 'eval_steps_per_second': 78.223, 'epoch': 1.0}
{'eval_loss': 0.6887546181678772, 'eval_f1': 0.703498617088025, 'eval_recall': 0.7037037037037037, 'eval_runtime': 0.5168, 'eval_samples_per_second': 626.993, 'eval_steps_per_second': 79.342, 'epoch': 2.0}
{'eval_loss': 0.8236162066459656, 'eval_f1': 0.7164104110139684, 'eval_recall': 0.7160493827160495, 'eval_runtime': 0.5185, 'eval_samples_per_second': 624.84, 'eval_steps_per_second': 79.069, 'epoch': 3.0}
{'eval_loss': 0.929192841053009, 'eval_f1': 0.6873083072248063, 'eval_recall': 0.691358024691358, 'eval_runtime': 0.5157, 'eval_samples_per_second': 628.321, 'eval_steps_per_second': 79.51, 'epoch': 4.0}
{'eval_loss': 0.9839525818824768, 'eval_f1': 0.6982156821523935, 'eval_recall': 0.7037037037037037, 'eval_runtime': 0.5217, 'eval_samples_per_second': 621.038, 'eval_steps_per_s

TrainOutput(global_step=290, training_loss=0.33816544105266705, metrics={'train_runtime': 47.4919, 'train_samples_per_second': 193.612, 'train_steps_per_second': 6.106, 'train_loss': 0.33816544105266705, 'epoch': 5.0})

In [11]:
trainer.evaluate(tokenized_ds["test"])

{'eval_loss': 1.0039308071136475, 'eval_f1': 0.7127589431985847, 'eval_recall': 0.7160919540229885, 'eval_runtime': 1.3649, 'eval_samples_per_second': 637.413, 'eval_steps_per_second': 79.86, 'epoch': 5.0}


{'eval_loss': 1.0039308071136475,
 'eval_f1': 0.7127589431985847,
 'eval_recall': 0.7160919540229885,
 'eval_runtime': 1.3649,
 'eval_samples_per_second': 637.413,
 'eval_steps_per_second': 79.86,
 'epoch': 5.0}

In [29]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
model.cuda()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [30]:

encoded_input = tokenizer("Esta pelicula tuvo una horrible y fea trama", return_tensors='pt').input_ids
encoded_input = encoded_input.to('cuda')


In [31]:
#model = model.to(device)
encoded_input

tensor([[    0,   580, 12327,  4009,   531,  3973,   445,  5900, 15368,     2]],
       device='cuda:0')

In [33]:
logits = model(encoded_input).logits
logits

tensor([[ 0.2942,  0.0146, -0.1032]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [34]:
print(model.config.id2label)

{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}


In [35]:
print(f'logits [NEGATIVE, NEUTRAL, POSITIVE]: {logits.tolist()[0]}')

logits [NEGATIVE, NEUTRAL, POSITIVE]: [0.29424479603767395, 0.014559179544448853, -0.1031925305724144]


In [36]:
# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [NEGATIVE, NEUTRAL, POSITIVE]: {probabilities}')

probabilities [NEGATIVE, NEUTRAL, POSITIVE]: [0.4118511974811554, 0.31136828660964966, 0.27678048610687256]


In [19]:
output = output[0][0].detach()

scores = output.numpy()
scores = softmax(scores)

NameError: name 'output' is not defined