In [3]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
import torch
from datasets import load_dataset
from pandas import read_csv
from sklearn.metrics import accuracy_score, f1_score


The data fields are:

text: a string feature.
label: a classification label, with possible values including sadness (0), joy (1), love (2), anger (3), fear (4), surprise (5).

In [10]:
#Notebook-wide parameters
dataset_name = "dair-ai/emotion"  #dataset identifier passed to load_dataset
model_ckpt = "distilbert-base-uncased"  #pre-trained checkpoint used for the tokenizer and the model
model_name = model_ckpt + "-finetuned-emotion"  #name derived from the checkpoint for the fine-tuned model

In [11]:
#Load the dataset from the Hugging Face Hub.
#Note: this line does not work on Coursera; it is meant to be run in Colab or in your own environment.
dataset = load_dataset(dataset_name)
print(dataset)

#Show the schema of the training split and read the number of classes from its label feature
train_features = dataset['train'].features
print(train_features)
num_labels = train_features['label'].num_classes
print(f"Number of labels: {num_labels}")

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}
Number of labels: 6


In [12]:
#Instantiate the tokenizer that belongs to the pre-trained checkpoint named by model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
#Tokenize dataset
def tokenize_function(example):
  """Tokenize a batch of examples and attach their labels.

  No padding is applied here. Texts are only truncated; padding is left to the
  DataCollatorWithPadding that assembles each mini-batch, so every batch is padded
  to its own longest sequence instead of to the longest text of the whole split.

  Args:
    example: batch with a "text" entry (the strings to tokenize) and a "label" entry.

  Returns:
    The tokenizer output with an extra "labels" entry copied from example["label"].
  """
  #Tokenize the texts, cutting anything longer than 256 tokens
  tokenized_batch = tokenizer(example["text"], truncation=True, max_length=256)
  #Add labels
  tokenized_batch["labels"] = example["label"]
  return tokenized_batch



#Apply the tokenization function to the dataset.
#batched=True hands the function many examples at once instead of one at a time (faster preprocessing; default: False).
#batch_size=None hands the full dataset to the function as one single batch (default: 1000).
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
#Instantiate the data collator from the tokenizer; it is handed to the Trainer below to assemble (and pad) each batch
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

**Fine-tuning**

In [8]:
#Pick the device for fine-tuning: CUDA when torch reports it as available, otherwise the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [9]:
#Label mappings: label2id is written out once and id2label is derived from it
label2id = {
    "sadness": 0,
    "joy": 1,
    "love": 2,
    "anger": 3,
    "fear": 4,
    "surprise": 5,
}
#Reverse mapping, keyed by the id rendered as a string
id2label = {str(idx): name for name, idx in label2id.items()}

#Define the model configuration: the checkpoint's configuration plus the two label mappings
config = AutoConfig.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
)

In [10]:
#Instantiate the pre-trained model with a sequence classification head, then move it to the selected device
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#Define performance metrics
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: object exposing `label_ids` (the reference labels) and `predictions`
            (scores whose arg-max over the last axis is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy' and 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

In [12]:
!pip install accelerate -U



In [13]:
!pip install transformers[torch]



In [14]:
!pip install transformers[torch] --upgrade
!pip install accelerate --upgrade




In [15]:
#Print the installed versions of transformers and accelerate
import transformers
import accelerate
print("Transformers version: ", transformers.__version__)
print("Accelerate version: ", accelerate.__version__)


Transformers version:  4.35.2
Accelerate version:  0.24.1


In [18]:
from transformers import Trainer, TrainingArguments

#Instantiate TrainingArguments and Trainer
batch_size = 64
#Log every (number of training examples // batch size) update steps
logging_steps = len(tokenized_dataset['train']) // batch_size

training_args = TrainingArguments(
    #output_dir: directory where the model predictions and checkpoints are written
    output_dir="Results",
    #seed (default: 42): random seed set at the beginning of training. For reproducibility across runs,
    #use the ~Trainer.model_init function to instantiate the model if it has randomly initialized parameters
    seed=42,
    #num_train_epochs (default: 3): start with 3 and watch the validation loss; once it stops decreasing and
    #turns upwards, stop at the epoch with the lowest validation loss, since the model overfits from there on
    num_train_epochs=1,
    #optim (default: 'adamw_hf'): one of adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused,
    #adamw_anyprecision or adafactor
    optim='adamw_torch',
    #learning_rate (default: 5e-5): initial learning rate for the AdamW optimizer
    learning_rate=2e-5,
    #weight_decay (default: 0): applied to all layers except bias and LayerNorm weights in AdamW
    weight_decay=0.01,
    #evaluation_strategy (default: 'no'): "no" = never, "steps" = every eval_steps, "epoch" = end of each epoch
    evaluation_strategy='epoch',
    #per_device_*_batch_size (default: 8): batch size per GPU/TPU core/CPU for training and for evaluation
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #disable_tqdm: whether to disable the tqdm progress bars and the metrics table shown in notebooks
    disable_tqdm=False,
    #logging_strategy (default: 'steps'): "no" = never, "epoch" = end of each epoch, "steps" = every logging_steps
    logging_strategy='steps',
    #logging_steps (default: 500): number of update steps between two logs when logging_strategy="steps"
    logging_steps=logging_steps,
    #log_level (default: 'passive'): 'debug', 'info', 'warning', 'error', 'critical', or 'passive' to keep
    #the current Transformers log level
    log_level='error',
    #report_to (default: 'all'): integrations that receive results and logs; "none" disables all of them
    report_to='none',
)
#Not set here: push_to_hub (default: False) controls whether the model is pushed to the Hub every time it is saved.

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    #When the tokenizer is passed, the Trainer's default data_collator is already a DataCollatorWithPadding;
    #the one defined previously is handed over explicitly
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

In [19]:
#Run the fine-tuning
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


NameError: ignored

In [24]:
from sklearn.metrics import accuracy_score, f1_score

In [1]:
#Mount Google Drive at /content/gdrive (Colab only); the save and load cells use paths under this mount point
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [31]:
#Save only the model weights (its state_dict) to Google Drive; this file is read back later with torch.load + load_state_dict
torch.save(model.state_dict(), "/content/gdrive/MyDrive/Colab_Notebooks/7a. Deep Learning/sentiment_analysis_model.pth")

In [32]:
# Save the model through the Trainer
# NOTE(review): Trainer.save_model takes an output directory, so this path is presumably created as a folder despite its .pth suffix - confirm
trainer.save_model("/content/gdrive/MyDrive/Colab_Notebooks/7a. Deep Learning/sentiment_analysis_model2.pth")

In [25]:
#Metrics on the validation split (the printed keys carry a "test_" prefix even though the validation split is what gets scored)
preds_output = trainer.predict(tokenized_dataset["validation"])
print(preds_output.metrics)

Epoch,Training Loss,Validation Loss


{'test_loss': 0.2705562710762024, 'test_accuracy': 0.9095, 'test_f1': 0.9092680713276212, 'test_runtime': 247.2822, 'test_samples_per_second': 8.088, 'test_steps_per_second': 0.129}


# Reload the fine-tuned model and evaluate it on the test set

In [14]:
# Load your model: start by re-creating the tokenizer and the data collator used by the evaluation cells

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

#Tokenize dataset
def tokenize_function(example):
  """Tokenize a batch of examples and attach their labels.

  No padding is applied here. Texts are only truncated; padding is left to the
  DataCollatorWithPadding that assembles each mini-batch, so every batch is padded
  to its own longest sequence rather than to the longest text of a whole map() batch.

  Args:
    example: batch with a "text" entry (the strings to tokenize) and a "label" entry.

  Returns:
    The tokenizer output with an extra "labels" entry copied from example["label"].
  """
  #Tokenize the texts, cutting anything longer than 256 tokens
  tokenized_batch = tokenizer(example["text"], truncation=True, max_length=256)
  #Add labels
  tokenized_batch["labels"] = example["label"]
  return tokenized_batch


# Rebuild the architecture from the same checkpoint name the tokenizer uses (model_ckpt), with a 6-class head
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = 6)
# Read the fine-tuned weights saved earlier with torch.save(model.state_dict(), ...).
# map_location loads the tensors onto the CPU; weights_only=True keeps torch.load from
# unpickling arbitrary Python objects, which is all a state_dict needs.
state_dict = torch.load("/content/gdrive/MyDrive/Colab_Notebooks/7a. Deep Learning/sentiment_analysis_model.pth", map_location=torch.device('cpu'), weights_only=True)
# Last expression of the cell: displays the key-matching report returned by load_state_dict
model.load_state_dict(state_dict)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [18]:
import datasets

#Read the test set from test.csv with pandas and wrap the frame as a datasets.Dataset
#NOTE(review): tokenize_function is mapped over this dataset afterwards, so the CSV presumably has "text" and "label" columns - confirm
eval_dataset = read_csv('test.csv')
_eval_dataset = datasets.Dataset.from_pandas(eval_dataset)

In [19]:
# If you changed anything from what is proposed below, adapt how the test set is processed so that you end up with the same type of data

#Tokenize the test set in batches, then expose only the three listed columns as torch tensors
test_dataset = _eval_dataset.map(tokenize_function, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [21]:
#Run the model over the test set in mini-batches and collect predicted and reference labels
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)
model.eval()  #switch the model to evaluation mode
predictions = []
real_labels = []

for batch in test_dataloader:
    #Move every tensor of the batch to the device the model lives on. The previous plain
    #dict copy left them where the collator created them, which fails once the model is on a GPU.
    batch = {k: v.to(model.device) for k, v in batch.items()}
    # If your batches are laid out in a different format, adapt this call
    with torch.no_grad():
        outputs = model(**batch)

    #Predicted class = index of the largest logit of each row
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1)
    # DO NOT EDIT
    predictions.extend(predicted_labels.cpu().numpy())
    real_labels.extend(batch['labels'].cpu().numpy())


In [22]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Accuracy plus weighted-average precision / recall / F1 on the labels collected above
accuracy = accuracy_score(real_labels, predictions)
weighted_prf = precision_recall_fscore_support(real_labels, predictions, average='weighted')
precision, recall, f1 = weighted_prf[:3]  # the fourth returned element is not used

# Final score: unweighted mean of the four metrics
score = (accuracy + precision + recall + f1) / 4

print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"Score: {score}",
    sep="\n",
)

Accuracy: 0.9125
Precision: 0.9119078965819448
Recall: 0.9125
F1 Score: 0.9115839158231578
Score: 0.9121229531012756
