# Práctica Distilbert
El objetivo de la práctica es desarrollar un sistema para el análisis de sentimiento sobre el dataset **SST-2** (dos clases: positiva y negativa).
El sistema debe estar basado en un modelo transformer como BERT. 
Se pide en concreto que compares los dos modelos, BERT y DistilBERT, tanto en sus resultados como en el tiempo de entrenamiento.


In [None]:
!pip install datasets transformers evaluate

## Data


In [None]:
from datasets import load_dataset
# Load the "sst2" configuration of the "glue" benchmark through the datasets library.
dataset = load_dataset("glue", "sst2")
# Bare expression so the notebook displays the loaded object as the cell output.
dataset

In [None]:
import random

# Print five randomly chosen training examples (sentence and label).
# random.randrange(n) draws from [0, n). The previous random.randint(0, n)
# includes n itself, which is one past the last valid row and raises
# IndexError when it is drawn.
train_split = dataset["train"]
for i in range(5):
    index = random.randrange(train_split.num_rows)
    # Fetch the row once instead of indexing the dataset twice.
    example = train_split[index]
    print("sentence:", example["sentence"])
    print("label:", example["label"])
    print()
     

In [None]:
from transformers import AutoTokenizer
# Checkpoint identifier; the same name is reused further down to load the classification model.
model_name='distilbert-base-uncased'
# Load the tokenizer that belongs to this checkpoint (use_fast=True requests the fast implementation).
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
def tokenize(example):
    """Run the module-level tokenizer over the ``sentence`` field of ``example``.

    Args:
        example: Mapping with a ``"sentence"`` entry (a single example or a
            batch, depending on how the caller maps this function).

    Returns:
        Whatever the tokenizer returns for that text, with truncation enabled.
    """
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True)

# Apply tokenize to the dataset; batched=True asks map to pass examples in batches.
encoded_dataset = dataset.map(tokenize, batched=True)
# Bare expression so the notebook displays the tokenized dataset and its columns.
encoded_dataset

Podemos ver que el tokenizador únicamente ha creado los campos **input_ids** y **attention_mask**, pero no **token_type_ids**, porque DistilBERT no se entrena con la tarea de Next Sentence Prediction.


## Model


In [None]:
from transformers import AutoModelForSequenceClassification

num_labels = 2 # binary text classification task, so two output labels
# Load the checkpoint named by model_name with a classification head sized for num_labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [None]:
from transformers import TrainingArguments

# Hyper-parameters and bookkeeping options that are later handed to the Trainer.
args = TrainingArguments(
    output_dir='./outputs/',
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1, # originally 5; reduced to 1 for faster training. Increase to 3 or 5 for real experiments
    weight_decay=0.01,
    load_best_model_at_end=True,
    # "accuracy" is one of the keys returned by compute_metrics.
    metric_for_best_model="accuracy",   
)



In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(eval_predictions):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_predictions: Pair ``(predictions, labels)``. ``predictions`` must
            support ``.argmax(-1)``; the arg-max over its last axis is taken
            as the predicted class for each example.

    Returns:
        dict: Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    scores, gold = eval_predictions
    # Predicted class = position of the largest score along the last axis.
    predicted = scores.argmax(-1)
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, predicted, average='macro'
    )
    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=macro_f1,
        precision=macro_precision,
        recall=macro_recall,
    )


In [None]:
from transformers import Trainer

# Assemble the Trainer from the model, the training arguments, the tokenized
# splits and the metric function, then fine-tune the model.
trainer = Trainer(
    model,  # the sequence-classification model to fine-tune
    args,   # the TrainingArguments instance configured earlier
    train_dataset=encoded_dataset["train"], # tokenized training split
    eval_dataset=encoded_dataset["validation"], # tokenized validation split
    # NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
    # in favour of `processing_class=` — confirm against the installed version.
    tokenizer=tokenizer,    # the tokenizer
    compute_metrics=compute_metrics # function that computes the metrics during evaluation
)
# Start fine-tuning.
trainer.train()

In [None]:
# Run evaluation; no dataset is passed, so this relies on the eval_dataset
# (the validation split) given to the Trainer. The notebook displays the result.
trainer.evaluate()

In [None]:
# Pick one random test example and show its sentence and stored label.
# random.randrange(n) draws from [0, n). The previous random.randint(0, n)
# could return n itself, one past the last valid row, and raise IndexError.
index_random = random.randrange(encoded_dataset['test'].num_rows)
# Fetch the row once instead of indexing the dataset twice.
example = encoded_dataset['test'][index_random]
print(example['sentence'])
# NOTE(review): the GLUE test split may carry placeholder labels rather than
# gold labels — confirm before interpreting this value.
print(example['label'])

In [None]:
def get_prediction(text):
    """Return the index of the class the module-level model predicts for ``text``.

    Args:
        text: Raw sentence to classify.

    Returns:
        int: Position of the highest-probability class in the model output.
    """
    # Function-scope import: no cell in this notebook imports torch at top level.
    import torch

    # Move the inputs to whichever device the model lives on instead of
    # assuming "cuda", so the function also works on CPU-only machines.
    # No padding is requested: a single sequence has nothing to be aligned
    # with, and padding to the maximum length only adds wasted computation.
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to(model.device)
    # Inference only: switch off dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # First output holds the per-class scores; softmax turns them into probabilities.
    probs = outputs[0].softmax(1)
    # argmax() yields a tensor, so return its Python scalar.
    return probs.argmax().item()

# Predict a label for every test sentence, one get_prediction call per sentence.
y_pred=[get_prediction(text) for text in encoded_dataset["test"]["sentence"]]


In [None]:
# Show the first ten test sentences, each followed by its predicted label.
for row in range(10):
    sentence = encoded_dataset['test'][row]['sentence']
    predicted = y_pred[row]
    print(sentence)
    print('Predicted label: ', predicted)
    print()