<a href="https://colab.research.google.com/github/javier-manas/tfg/blob/main/modelo-emotions-funcional-multiclase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Activate GPU and Install Dependencies

In [3]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [4]:
# Log in to your Hugging Face account 
# Get your API token here https://huggingface.co/settings/token
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
# Activate GPU for faster training by clicking on 'Runtime' > 'Change runtime type' and then selecting GPU as the Hardware accelerator
# Then check if GPU is available
import torch
torch.cuda.is_available()

True

#2. Preprocess data

In [28]:
# Load data
from datasets import load_dataset
imdb = load_dataset("SetFit/emotion")





  0%|          | 0/3 [00:00<?, ?it/s]

In [29]:
# Create a smaller training dataset for faster training times
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])
print(small_train_dataset[0])
print(small_test_dataset[0])



{'text': 'while cycling in the country', 'label': 4, 'label_text': 'fear'}
{'text': 'i was feeling really troubled and down over what my dad said', 'label': 0, 'label_text': 'sadness'}


In [30]:
# Set DistilBERT tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [31]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [32]:
# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Training the model

In [39]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=6)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

In [40]:
# Define the evaluation metrics 
import numpy as np
from datasets import load_metric
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels, average='macro')["f1"]
    
    return {"accuracy": accuracy, "f1": f1}

In [41]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

repo_name = "prueba_random_25-04"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch", 
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

/content/prueba_random_25-04 is already a clone of https://huggingface.co/mrovejaxd/prueba_random_25-04. Make sure you pull the latest changes with `repo.git_pull()`.


In [42]:
print(imdb['test'][2])
print()

print(imdb['train'][2])
print()
imdb['train'].features

{'text': 'i never make her separate from me because i don t ever want her to feel like i m ashamed with her', 'label': 0, 'label_text': 'sadness'}

{'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3, 'label_text': 'anger'}



{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'label_text': Value(dtype='string', id=None)}

In [43]:
# Train the model
trainer.train()



Step,Training Loss


TrainOutput(global_step=376, training_loss=0.9422260040932513, metrics={'train_runtime': 71.682, 'train_samples_per_second': 83.703, 'train_steps_per_second': 5.245, 'total_flos': 74085607364832.0, 'train_loss': 0.9422260040932513, 'epoch': 2.0})

In [44]:
# Compute the evaluation metrics
trainer.evaluate()

{'eval_loss': 0.5796199440956116,
 'eval_accuracy': 0.8266666666666667,
 'eval_f1': 0.6261654163666505,
 'eval_runtime': 2.4072,
 'eval_samples_per_second': 124.627,
 'eval_steps_per_second': 7.893,
 'epoch': 2.0}

# 4. Analyzing new data with the model

In [45]:
# Upload the model to the Hub
trainer.push_to_hub()

Upload file runs/Apr25_03-35-42_7461558e5fbe/events.out.tfevents.1682393745.7461558e5fbe.5452.6: 100%|########…

Upload file runs/Apr25_03-35-42_7461558e5fbe/events.out.tfevents.1682393819.7461558e5fbe.5452.8: 100%|########…

To https://huggingface.co/mrovejaxd/prueba_random_25-04
   afdf6e4..1bf6c4e  main -> main

   afdf6e4..1bf6c4e  main -> main

To https://huggingface.co/mrovejaxd/prueba_random_25-04
   1bf6c4e..3a968cd  main -> main

   1bf6c4e..3a968cd  main -> main



'https://huggingface.co/mrovejaxd/prueba_random_25-04/commit/1bf6c4efbe9403b6f212ca88e29137ea6da9d2bd'

In [48]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

sentiment_model = pipeline(model="mrovejaxd/prueba_random_25-04")

sentiment_model(["I like you. I love you"])

[{'label': 'LABEL_1', 'score': 0.5461065769195557}]