In [None]:
!pip install transformers



In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

In [9]:
#carga de datos

df = pd.read_csv('/content/Dataset Final - Copia de dataset_1279.csv')
df.rename({'tweet': 'text', 'clase':'label'}, axis=1, inplace=True)
#df.drop('Unnamed: 0', axis=1, inplace=True)

df.head()

Unnamed: 0,date,time,text,label
0,2023-01-01,0:00:00,"Resuelto, muchísimas gracias , excelente servi...",0
1,2023-01-02,0:00:00,"Muchas gracias, espero su dm",1
2,2023-01-02,0:00:00,Muchas gracias!,0
3,2023-01-02,0:00:00,Algo similar me paso. Quería renovar mi token ...,0
4,2023-01-02,0:00:00,Yeeeei! a través de mi cuenta en acabo de cont...,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1616 entries, 0 to 1615
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    1616 non-null   object
 1   time    1616 non-null   object
 2   text    1616 non-null   object
 3   label   1616 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 50.6+ KB


In [11]:
#extracción de columnas

text = df.text.values
labels = df.label.values

In [12]:
#preprocesamiento

tokenizer = BertTokenizer.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    do_lower_case = True
    )


token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer):
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = 32,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids'])
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
batch_size = 16
val_ratio = 0.2

train_idx, val_idx = train_test_split(
            np.arange(len(labels)),
            test_size = val_ratio,
            shuffle = True,
            stratify = labels )

train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                          attention_masks[val_idx],
                          labels[val_idx])

train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [14]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_fp(preds, labels):
  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])

def b_tn(preds, labels):
  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_fn(preds, labels):
  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])

def b_metrics(preds, labels):
  '''
  Returns the following metrics:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  '''
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()
  tp = b_tp(preds, labels)
  tn = b_tn(preds, labels)
  fp = b_fp(preds, labels)
  fn = b_fn(preds, labels)
  b_accuracy = (tp + tn) / len(labels)
  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
  return b_accuracy, b_precision, b_recall, b_specificity

In [15]:
from transformers import AutoModelForMaskedLM, BertConfig

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load pre-trained BERT model
#model = AutoModelForMaskedLM.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08
                              )

# Run on GPU
model.to(device)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [16]:
train_dataloader.dataset[0]

(tensor([    4,  1111, 30984,  1207,  1039, 14774,  1009, 18433,  9364,  1985,
          1170,  1360,  7846, 17947,     3,  2279,  1431,  1431,  1431,  1054,
          1019,  1542,     3,  1096, 13513, 13513,  1359,  6186, 30957,  1109,
          1109,     5]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]),
 tensor(0))

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

epochs = 25

for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids,
                              token_type_ids = None,
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Update precision only when (tp + fp) !=0; ignore nan
        if b_precision != 'nan': val_precision.append(b_precision)
        # Update recall only when (tp + fn) !=0; ignore nan
        if b_recall != 'nan': val_recall.append(b_recall)
        # Update specificity only when (tn + fp) !=0; ignore nan
        if b_specificity != 'nan': val_specificity.append(b_specificity)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')


Epoch:   4%|▍         | 1/25 [00:12<04:56, 12.34s/it]


	 - Train loss: 0.8296
	 - Validation Accuracy: 0.5387
	 - Validation Precision: 0.7267
	 - Validation Recall: 0.5550
	 - Validation Specificity: 0.8945



Epoch:   8%|▊         | 2/25 [00:21<04:01, 10.51s/it]


	 - Train loss: 0.4300
	 - Validation Accuracy: 0.5149
	 - Validation Precision: 0.6757
	 - Validation Recall: 0.7606
	 - Validation Specificity: 0.7988



Epoch:  12%|█▏        | 3/25 [00:30<03:38,  9.94s/it]


	 - Train loss: 0.1732
	 - Validation Accuracy: 0.4881
	 - Validation Precision: 0.6608
	 - Validation Recall: 0.6839
	 - Validation Specificity: 0.8501



Epoch:  16%|█▌        | 4/25 [00:40<03:24,  9.72s/it]


	 - Train loss: 0.1336
	 - Validation Accuracy: 0.4554
	 - Validation Precision: 0.6737
	 - Validation Recall: 0.8397
	 - Validation Specificity: 0.7435



Epoch:  20%|██        | 5/25 [00:49<03:12,  9.62s/it]


	 - Train loss: 0.0912
	 - Validation Accuracy: 0.5149
	 - Validation Precision: 0.6833
	 - Validation Recall: 0.6527
	 - Validation Specificity: 0.8384



Epoch:  24%|██▍       | 6/25 [00:59<03:02,  9.60s/it]


	 - Train loss: 0.0477
	 - Validation Accuracy: 0.4970
	 - Validation Precision: 0.6885
	 - Validation Recall: 0.7301
	 - Validation Specificity: 0.8111



Epoch:  28%|██▊       | 7/25 [01:08<02:53,  9.62s/it]


	 - Train loss: 0.0246
	 - Validation Accuracy: 0.5327
	 - Validation Precision: 0.6477
	 - Validation Recall: 0.7262
	 - Validation Specificity: 0.7554



Epoch:  32%|███▏      | 8/25 [01:18<02:44,  9.70s/it]


	 - Train loss: 0.0602
	 - Validation Accuracy: 0.5417
	 - Validation Precision: 0.6931
	 - Validation Recall: 0.5669
	 - Validation Specificity: 0.8165



Epoch:  36%|███▌      | 9/25 [01:28<02:36,  9.78s/it]


	 - Train loss: 0.0610
	 - Validation Accuracy: 0.4851
	 - Validation Precision: 0.6927
	 - Validation Recall: 0.6743
	 - Validation Specificity: 0.8265



Epoch:  40%|████      | 10/25 [01:38<02:27,  9.82s/it]


	 - Train loss: 0.0456
	 - Validation Accuracy: 0.5238
	 - Validation Precision: 0.7763
	 - Validation Recall: 0.4585
	 - Validation Specificity: 0.9355



Epoch:  44%|████▍     | 11/25 [01:48<02:18,  9.87s/it]


	 - Train loss: 0.0495
	 - Validation Accuracy: 0.3661
	 - Validation Precision: 0.8889
	 - Validation Recall: 0.5298
	 - Validation Specificity: 0.9873



Epoch:  48%|████▊     | 12/25 [01:58<02:07,  9.83s/it]


	 - Train loss: 0.0828
	 - Validation Accuracy: 0.5327
	 - Validation Precision: 0.6540
	 - Validation Recall: 0.7408
	 - Validation Specificity: 0.7603



Epoch:  52%|█████▏    | 13/25 [02:08<01:57,  9.79s/it]


	 - Train loss: 0.0117
	 - Validation Accuracy: 0.5030
	 - Validation Precision: 0.7157
	 - Validation Recall: 0.6935
	 - Validation Specificity: 0.8436



Epoch:  56%|█████▌    | 14/25 [02:17<01:47,  9.81s/it]


	 - Train loss: 0.0049
	 - Validation Accuracy: 0.5000
	 - Validation Precision: 0.6790
	 - Validation Recall: 0.6606
	 - Validation Specificity: 0.8441



Epoch:  60%|██████    | 15/25 [02:27<01:37,  9.76s/it]


	 - Train loss: 0.0034
	 - Validation Accuracy: 0.4970
	 - Validation Precision: 0.6743
	 - Validation Recall: 0.6813
	 - Validation Specificity: 0.8237



Epoch:  64%|██████▍   | 16/25 [02:37<01:27,  9.74s/it]


	 - Train loss: 0.0317
	 - Validation Accuracy: 0.4702
	 - Validation Precision: 0.6611
	 - Validation Recall: 0.7643
	 - Validation Specificity: 0.8105



Epoch:  68%|██████▊   | 17/25 [02:46<01:17,  9.74s/it]


	 - Train loss: 0.0345
	 - Validation Accuracy: 0.5089
	 - Validation Precision: 0.6967
	 - Validation Recall: 0.6499
	 - Validation Specificity: 0.8316



Epoch:  72%|███████▏  | 18/25 [02:56<01:08,  9.74s/it]


	 - Train loss: 0.0446
	 - Validation Accuracy: 0.5208
	 - Validation Precision: 0.6190
	 - Validation Recall: 0.7573
	 - Validation Specificity: 0.7319



Epoch:  76%|███████▌  | 19/25 [03:06<00:58,  9.75s/it]


	 - Train loss: 0.0282
	 - Validation Accuracy: 0.5030
	 - Validation Precision: 0.6131
	 - Validation Recall: 0.7701
	 - Validation Specificity: 0.7078



Epoch:  80%|████████  | 20/25 [03:16<00:48,  9.76s/it]


	 - Train loss: 0.0359
	 - Validation Accuracy: 0.5268
	 - Validation Precision: 0.6307
	 - Validation Recall: 0.7476
	 - Validation Specificity: 0.7478



Epoch:  84%|████████▍ | 21/25 [03:26<00:39,  9.76s/it]


	 - Train loss: 0.0048
	 - Validation Accuracy: 0.5060
	 - Validation Precision: 0.6564
	 - Validation Recall: 0.7649
	 - Validation Specificity: 0.7839



Epoch:  88%|████████▊ | 22/25 [03:35<00:29,  9.76s/it]


	 - Train loss: 0.0087
	 - Validation Accuracy: 0.5179
	 - Validation Precision: 0.6696
	 - Validation Recall: 0.7965
	 - Validation Specificity: 0.7829



Epoch:  92%|█████████▏| 23/25 [03:45<00:19,  9.76s/it]


	 - Train loss: 0.0065
	 - Validation Accuracy: 0.4881
	 - Validation Precision: 0.6713
	 - Validation Recall: 0.8218
	 - Validation Specificity: 0.7858



Epoch:  96%|█████████▌| 24/25 [03:55<00:09,  9.76s/it]


	 - Train loss: 0.0749
	 - Validation Accuracy: 0.5357
	 - Validation Precision: 0.6708
	 - Validation Recall: 0.5905
	 - Validation Specificity: 0.8240



Epoch: 100%|██████████| 25/25 [04:05<00:00,  9.80s/it]


	 - Train loss: 0.0306
	 - Validation Accuracy: 0.4702
	 - Validation Precision: 0.7406
	 - Validation Recall: 0.7477
	 - Validation Specificity: 0.8691






In [18]:
new_sentence = 'amo hey banco pero estoy insatisfecha con el servicio'
# We need Token IDs and Attention Mask for inference on the new sentence
test_ids = []
test_attention_mask = []

# Apply the tokenizer
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask
test_ids.append(encoding['input_ids'])
test_attention_mask.append(encoding['attention_mask'])
test_ids = torch.cat(test_ids, dim = 0)
test_attention_mask = torch.cat(test_attention_mask, dim = 0)

# Forward pass, calculate logit predictions
with torch.no_grad():
  output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

prediction = np.argmax(output.logits.cpu().numpy()).flatten().item()

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

Input Sentence:  amo hey banco pero estoy insatisfecha con el servicio
Predicted Class:  1




In [19]:
torch.save(model.state_dict(), 'heybot_model.pth')


In [20]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-uncased',
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)

model.load_state_dict(torch.load('heybot_model.pth'))
model.eval()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [21]:
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

new_sentence = 'amo hey banco pero estoy insatisfecha con el servicio'

def preprocessing(sentence, tokenizer):
    inputs = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return inputs

encoding = preprocessing(new_sentence, tokenizer)

with torch.no_grad():
    output = model(encoding['input_ids'], token_type_ids=None, attention_mask=encoding['attention_mask'])[0]

prediction = torch.argmax(output).item()

print('Input Sentence:', new_sentence)
print('Predicted Class:', prediction)


Input Sentence: amo hey banco pero estoy insatisfecha con el servicio
Predicted Class: 1


In [22]:
import torch
from transformers import BertForSequenceClassification

class BertModelWrapper:
    def __init__(self, model_path):
        self.model = BertForSequenceClassification.from_pretrained(
            'dccuchile/bert-base-spanish-wwm-uncased',
            num_labels=3,
            output_attentions=False,
            output_hidden_states=False,
        )
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()

    def predict(self, new_sentence):
        encoding = self.preprocess(new_sentence)
        with torch.no_grad():
            output = self.model(encoding['input_ids'], token_type_ids=None, attention_mask=encoding['attention_mask'])[0]
        prediction = torch.argmax(output).item()
        return prediction

    def preprocess(self, sentence):
        tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return inputs

if __name__ == "__main__":
    model_path = 'heybot_model.pth'
    bert_model = BertModelWrapper(model_path)
    new_sentence = 'amo hey banco pero estoy insatisfecha con el servicio'
    prediction = bert_model.predict(new_sentence)
    print('Input Sentence:', new_sentence)
    print('Predicted Class:', prediction)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input Sentence: amo hey banco pero estoy insatisfecha con el servicio
Predicted Class: 1
