## In this example i will show how to fine-turn BERT model for multiclass text classification.

First we will import the library, we will use pytorch and keras, transformers for BERT and ML libraries.

In [21]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences

#!pip install transformers==4.11.3
import transformers
print(transformers.__version__)
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

from tqdm import tqdm, trange
import time
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import matthews_corrcoef

4.11.3


check if there is a gpu available

In [2]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2060


load the data, for this example we will use two datasets - one for train and validation and other for test the model

In [5]:
df = pd.read_csv('../bases/train_comentarios.txt', sep='\t', encoding='latin1')  

df_train=df.sample(frac=0.8,random_state=1977) #random state is a seed value
print(df_train.shape)

df_test=df.drop(df_train.index)
print(df_test.shape)

df.tail()

(2423, 16)
(606, 16)


Unnamed: 0,contrato,encuesta,mes,BASE_FILTRO_FLOW,nps,nps_group,factor_influencia,causa_raiz,motivo,inconveniente,incoveniente_cual,tab1,label,tab1_old,label_old,tab2
3024,10279046,19/2/2022 20:11,202202,BOX_CON_APP,0,Detractor,La Atención al Cliente,Relacionamiento,mal servicio,Sí,Se corta/hay intermitencias en la reproducción,servicio,1,servicio,7,cortes
3025,79606057,20/2/2022 19:59,202202,BOX_CON_APP,3,Detractor,Calidad y funcionamiento del servicio,Servicio,"el servicio de ""fibertel/flow/personal "" es ma...",Sí,Inconvenientes en el servicio de internet,conveniencia,4,precio,4,
3026,46986426,20/2/2022 23:23,202202,BOX_SIN_APP,5,Detractor,Calidad y funcionamiento del servicio,Servicio,no está andando bien el deco,Sí,Se corta/hay intermitencias en la reproducción...,servicio,1,servicio,7,cortes
3027,12674467,21/2/2022 13:14,202202,BOX_CON_APP,5,Detractor,Calidad y funcionamiento del servicio,Servicio,"se interrumpe mucho , se cuelga",Sí,Se corta/hay intermitencias en la reproducción...,servicio,1,servicio,7,cuelgue deco
3028,12472866,21/2/2022 14:33,202202,BOX_CON_APP,6,Detractor,Las características del producto (reiniciar pr...,Funcionalidad,"se inhabilita solo, hay que resetearlo casi di...",Sí,Arroja mensaje de error al momento del intenta...,servicio,1,servicio,7,cuelgue deco


At first we will explore the adventages of BERT. This model is pre-trained and have tokenizers. We will check the differences between Bert (original)  and Bert (Multilanguage) 

In [6]:
tokenizerBERT = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenizerMULTI = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
#tokenizerBETO = BertTokenizer.from_pretrained("finiteautomata/beto-sentiment-analysis", do_lower_case=True)

In [7]:
#send the variables to a list to iterate

labels = df_train['label'] ##etiquetas
sentences = df_train['motivo'] ##texto crudo

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]

print ("Original Text")
print (sentences[14])

tokenizedBERT_texts = [tokenizerBERT.tokenize(sent) for sent in sentences]
print ("Tokenize BERT BASE")
print (tokenizedBERT_texts[14])

tokenizerMULTI_texts = [tokenizerMULTI.tokenize(sent) for sent in sentences]
print ("Tokenize BERT MULTI")
print (tokenizerMULTI_texts[14])


Original Text
[CLS] repeticion permanente de peliculas [SEP]
Tokenize BERT BASE
['[CLS]', 'rep', '##etic', '##ion', 'permanent', '##e', 'de', 'pe', '##lic', '##ula', '##s', '[SEP]']
Tokenize BERT MULTI
['[CLS]', 'rep', '##eti', '##cion', 'permanente', 'de', 'peli', '##cula', '##s', '[SEP]']


We check that the spanish tokenizer works better. So we decide to use this model.

In [17]:
num_labels=df['label'].nunique()

#tokenizer = BertTokenizer.from_pretrained("finiteautomata/beto-sentiment-analysis", do_lower_case=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

#model = BertForSequenceClassification.from_pretrained("finiteautomata/beto-sentiment-analysis", num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=num_labels)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

if we want to add more than one layer, we can create a class and use it as a model. In this ustomized model, i will add a drop out and a dense layer on top of bert to get the final output for the model. 

In [18]:
num_labels=df['label'].nunique()

#num_labels=df['label'].nunique() #number of categories i want to predict
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

#class BERTClass(torch.nn.Module):
#    def __init__(self):
#        super(BERTClass, self).__init__()
#        self.bert_model = transformers.BertModel.from_pretrained('finiteautomata/beto-sentiment-analysis', return_dict=True)
#        self.pre_classifier = torch.nn.Linear(768, 768)
#        self.dropout = torch.nn.Dropout(0.3)
#        self.classifier = torch.nn.Linear(768, num_labels)
#
#    def forward(self, input_ids, attention_mask):
#        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
#        hidden_state = output_1[0]
#        pooler = hidden_state[:, 0]
#        pooler = self.pre_classifier(pooler)
#        pooler = torch.nn.ReLU()(pooler)
#        pooler = self.dropout(pooler)
#        output = self.classifier(pooler)
#        return output
#    
#model = BERTClass()
#device = 'cpu'
#model.to(device)



We already have our model created, we have to build the tensors to use in the model.

In [19]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 64

# Pad our input tokens
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(input_ids[0])
#print(input_ids)
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
#input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]


#input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
#print("segundo", input_ids)
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

    


[  101 10285 12229 10972 11822 10192 12820 44480 10115 10125 83246   102
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


In [22]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,random_state=2018, test_size=0.1)


train_labels=train_labels.to_numpy() #si lo calculo con cpu (en vez de cuda estas 2 lineas no van)
validation_labels=validation_labels.to_numpy() #si lo calculo con cpu (en vez de cuda estas 2 lineas no van)


# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [23]:

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=1e-05,
                       correct_bias=False)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)





We send the model to cuda

In [12]:
device = torch.device("cuda")
model.to('cuda')

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

finally, we train the model

In [13]:
random.seed(1977)

train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):


  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    #loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss[0].item())
    # Backward pass
    loss[0].backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update tracking variables
    tr_loss += loss[0].item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits.logits.detach().cpu().numpy()
    #logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))



print("Train loss: {}".format(tr_loss/nb_tr_steps))


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.13017887140141568


Epoch:  33%|███▎      | 1/3 [00:26<00:52, 26.08s/it]

Validation Accuracy: 0.8125
Train loss: 0.08981668512017404


Epoch:  67%|██████▋   | 2/3 [00:52<00:26, 26.10s/it]

Validation Accuracy: 0.80078125
Train loss: 0.04999378030222371


Epoch: 100%|██████████| 3/3 [01:18<00:00, 26.08s/it]

Validation Accuracy: 0.81640625
Train loss: 0.04999378030222371





vamos a analizar la performance con la cpu

In [25]:
#ejecuto el mismo modelo desde el cpu

device = torch.device("cpu")
model.to('cpu')

random.seed(1977)

train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):


  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    #loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss[0].item())
    # Backward pass
    loss[0].backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update tracking variables
    tr_loss += loss[0].item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits.logits.detach().cpu().numpy()
    #logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))



print("Train loss: {}".format(tr_loss/nb_tr_steps))


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 1.5925307239804949


Epoch:  33%|███▎      | 1/3 [02:19<04:38, 139.29s/it]

Validation Accuracy: 0.5877403846153846
Train loss: 1.2735571963446481


Epoch:  67%|██████▋   | 2/3 [04:29<02:14, 134.02s/it]

Validation Accuracy: 0.7668269230769231
Train loss: 0.8299980776650565


Epoch: 100%|██████████| 3/3 [06:51<00:00, 137.06s/it]

Validation Accuracy: 0.8052884615384616
Train loss: 0.8299980776650565





Dan un resultado similar, solo que la funcion de perdida es mucho mayor. ¿Que significa?

Once the model is trained. We can test it again a new dataset to see the performance. 

In [14]:
###EN ESTA PARTE PROCESAMOS LOS DATOS NUEVOS PARA DARLE EL FORMATO NECESARIO PARA PYTORCH###

# Create sentence and label lists
sentences = df_test.motivo.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = df_test.label.values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

MAX_LEN = 64
# Pad our input tokens
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
  
batch_size = 16  


prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)



###AQUI COMIENZA LA PREDICCIÓN###

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  # Move logits and labels to CPU
  logits = logits.logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)



In [16]:
matthews_set = []

for i in range(len(true_labels)):
  matthews = matthews_corrcoef(true_labels[i],
                 np.argmax(predictions[i], axis=1).flatten())
  matthews_set.append(matthews)
    
  
# Flatten the predictions and true values for aggregate Matthew's evaluation on the whole dataset
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

print('matthews corrcoef: ',matthews_corrcoef(flat_true_labels, flat_predictions))



d = {'Labels':flat_true_labels,'Predictions':flat_predictions}
resumen = pd.DataFrame(d, columns=['Labels','Predictions'])


#print(pd.crosstab(resumen['Labels'], resumen['Predictions'] , dropna=False))

target_names = ['no lo uso', 'servicio', 'funcionalidad', 'contenido', 'conveniencia', 'relacionamiento', 'otros']
print(classification_report(flat_true_labels, flat_predictions, target_names=target_names))


matthews corrcoef:  0.7445032235619898
                 precision    recall  f1-score   support

      no lo uso       0.86      0.60      0.71        10
       servicio       0.89      0.89      0.89       305
  funcionalidad       0.66      0.74      0.70        77
      contenido       0.83      0.69      0.75        75
   conveniencia       0.89      0.90      0.89        88
relacionamiento       0.59      0.72      0.65        32
          otros       0.62      0.53      0.57        19

       accuracy                           0.82       606
      macro avg       0.76      0.72      0.74       606
   weighted avg       0.83      0.82      0.82       606



Lest check the performance of the model

we obtained a accuracy of 0.82. Lets explore the values:

In [17]:
#we can save the predictions
df_test['predictions']=flat_predictions

df_test.to_csv("../bases/testeo.csv", encoding='latin1', index=False, sep='\t')

In [18]:
# Saving the files for re-use

output_model_file = '../models/pytorch_textclasif.bin'
output_vocab_file = '../models/vocab_textclasif.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')


All files saved


In [29]:
import time

import torch
import pandas as pd
import numpy as np

from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


device='cpu'
model.to('cpu')

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

###  Once we have trained the model, we load it to apply to new data

In [20]:
output_model_file = '../models/pytorch_textclasif.bin'
output_vocab_file = '../models/vocab_textclasif.bin'

# Model class must be defined somewhere
model = torch.load(output_model_file)
tokenizer = BertTokenizer.from_pretrained(output_vocab_file)

df_test = pd.read_csv('../bases/train_comentarios.txt', sep='\t', encoding='latin1')  
df_test.shape

(3029, 16)

In [30]:
start_time = time.time()

# Create sentence and label lists
sentences = df_test.motivo.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = df_test.label.values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

MAX_LEN = 64
# Pad our input tokens
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
  
batch_size = 16  


prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

print("El tiempo de preprocesamiento fue: ",time.time() - start_time," segundos") 

###AQUI COMIENZA LA PREDICCIÓN###

start_time = time.time()
      
# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  # Move logits and labels to CPU
  logits = logits.logits.detach().cpu().numpy()
  label_ids = b_labels.to(device).numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)



matthews_set = []

for i in range(len(true_labels)):
  matthews = matthews_corrcoef(true_labels[i],
                 np.argmax(predictions[i], axis=1).flatten())
  matthews_set.append(matthews)
    
  
# Flatten the predictions and true values for aggregate Matthew's evaluation on the whole dataset
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

print('matthews corrcoef: ',matthews_corrcoef(flat_true_labels, flat_predictions))



d = {'Labels':flat_true_labels,'Predictions':flat_predictions}
resumen = pd.DataFrame(d, columns=['Labels','Predictions'])


#print(pd.crosstab(resumen['Labels'], resumen['Predictions'] , dropna=False))

target_names = ['no lo uso', 'servicio', 'funcionalidad', 'contenido', 'conveniencia', 'relacionamiento', 'otros']
print(classification_report(flat_true_labels, flat_predictions, target_names=target_names))

print("El tiempo de prediccion fue de: ",(time.time() - start_time) / 60," minutos")


#we can save the predictions
df_test['predictions']=flat_predictions

df_test.to_csv("../bases/base_predicciones.csv", encoding='latin1', index=False, sep='\t')


El tiempo de preprocesamiento fue:  0.9699265956878662  segundos
matthews corrcoef:  0.9243170064113807
                 precision    recall  f1-score   support

      no lo uso       0.98      0.90      0.94        49
       servicio       0.97      0.97      0.97      1575
  funcionalidad       0.89      0.90      0.89       333
      contenido       0.96      0.92      0.94       359
   conveniencia       0.96      0.98      0.97       415
relacionamiento       0.85      0.93      0.88       165
          otros       0.91      0.88      0.89       133

       accuracy                           0.95      3029
      macro avg       0.93      0.92      0.93      3029
   weighted avg       0.95      0.95      0.95      3029

El tiempo de prediccion fue:  137.50421452522278  segundos


In [27]:
#!pip3 install nlpaug

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [28]:
# Augment Spanish by BERT
aug_BERT = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-cased', aug_p=0.1)


In [None]:
aug_FT = naw.WordEmbsAug(model_type='fasttext', tokenizer=tokenizer, model_path='../models/cc.es.300.vec')

In [None]:
text = "La grabación no me funciona"
augmented_text_BERT = aug_BERT.augment(text)
augmented_text_FT = aug_FT.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text_BERT)
print(augmented_text_FT)