# Fine-Tuning BETO for text classification
Based on: https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894

In [None]:
!pip install transformers



In [None]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernels synchronously so that
# device-side errors are reported at the call that triggered them.
# NOTE(review): presumably this slows GPU execution; consider removing it once training is stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Import libraries:

import torch
from google.colab import drive
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import time


from tabulate import tabulate
from tqdm import trange
import random

drive.mount('/content/drive')


Mounted at /content/drive


# Load dataset:

In [None]:
# Reading file:
df_keywords = pd.read_csv('/content/drive/My Drive/NLP/ENTREGA FINAL/tagged_keywords_all.csv')

In [None]:
# Drop the helper columns without mutating in place; errors='ignore' keeps this cell
# re-runnable once the columns are already gone (the in-place version raised KeyError).
df_keywords = df_keywords.drop(columns=['Unnamed: 0', 'RAZON_SOCIAL_PROCESADA'], errors='ignore')

In [None]:
df_keywords.columns

Index(['RAZON_SOCIAL', 'final_category'], dtype='object')

In [None]:
# Hold out 1000 uncategorised rows to measure coverage later.
# A fixed random_state makes the hold-out identical on every run, and
# reset_index(drop=True) renumbers the rows without keeping the old index column.
uncategorized_rows = df_keywords[df_keywords['final_category'] == "No categorizado"]
df_test_coverage = (
    uncategorized_rows
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
)

In [None]:
df = df_keywords.copy()

In [None]:
df = df[df['final_category']!="No categorizado"]

In [None]:
# Remove exact duplicate rows first, then any row with a missing value.
df = df.drop_duplicates().dropna()

In [None]:
df.final_category.nunique()

8

In [None]:
df.final_category.value_counts()

Inversiones                              14129
Servicios Profesionales y Consultoría    10908
Cuentas y servicios                      10643
Transferencia Bancaria                   10617
Entretenimiento                           8012
Transporte                                5906
Alimentación                              5031
Salud                                     2906
Name: final_category, dtype: int64

In [None]:
# We extract text and label values:
text = df.RAZON_SOCIAL.values
labels = df.final_category.values

In [None]:
labels

array(['Salud', 'Transferencia Bancaria',
       'Servicios Profesionales y Consultoría', ..., 'Inversiones',
       'Cuentas y servicios', 'Alimentación'], dtype=object)

# Preprocessing: Downloading BETO:

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-cased'
)

tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
encoder = LabelEncoder()
labels_encoded = encoder.fit_transform(labels)

In [None]:
# Tokenize:

token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Tokenize one text into fixed-length tensors ready for the model.

  Args:
    - input_text: the text to encode.
    - tokenizer: object exposing a Hugging Face style `encode_plus` method.
    - max_length: length every sequence is padded or truncated to
      (defaults to 32, the value that used to be hard-coded).

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
                        padding = 'max_length',
                        # Explicit truncation: same result as the implicit default, without the warning
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


# Encode each text once and collect its ids and mask in the lists initialised above.
for encoded in (preprocessing(raw_text, tokenizer) for raw_text in text):
  token_id += [encoded['input_ids']]
  attention_masks += [encoded['attention_mask']]

# Concatenate the per-sample tensors along dimension 0 and turn the encoded labels into a tensor.
labels = torch.tensor(labels_encoded)
attention_masks = torch.cat(attention_masks, 0)
token_id = torch.cat(token_id, 0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Data split into train and test:

In [None]:
#torch.bincount(labels)

In [None]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 16
# Fixed seed so the train/validation partition is identical on every run
split_seed = 42

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = split_seed)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoader: training batches are drawn in random order,
# validation batches are read sequentially.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

# Training and testing the model:

In [None]:
def b_tp(preds, labels, classes=None):
    """Per-class true positives: samples predicted as `cls` whose label is also `cls`.

    `classes` defaults to the distinct values present in `labels`; the result has
    one count per entry of `classes`, in the same order.
    """
    classes = np.unique(labels) if classes is None else classes
    return np.array([np.sum((preds == labels) & (preds == cls)) for cls in classes])

def b_fp(preds, labels, classes=None):
    """Per-class false positives: samples wrongly predicted as `cls`.

    `classes` defaults to the distinct values present in `labels`.
    """
    classes = np.unique(labels) if classes is None else classes
    return np.array([np.sum((preds != labels) & (preds == cls)) for cls in classes])

def b_fn(preds, labels, classes=None):
    """Per-class false negatives: samples labelled `cls` that were predicted as something else.

    `classes` defaults to the distinct values present in `labels`.
    """
    classes = np.unique(labels) if classes is None else classes
    return np.array([np.sum((preds != labels) & (labels == cls)) for cls in classes])

def b_metrics(preds, labels):
    """Macro-averaged one-vs-rest metrics for one batch.

    Args:
        preds: 2-D array of per-class scores (one row per sample); the predicted
            class is the arg-max of each row.
        labels: 1-D array of class ids, or a 2-D one-hot array.

    Returns:
        (accuracy, precision, recall, specificity). Each value is the mean, over
        the classes present in `labels`, of the corresponding one-vs-rest metric.
        Note that accuracy here is the mean of (tp + tn) / n per class, not the
        plain fraction of correctly classified samples.
    """
    # Collapse the score matrix to one predicted class id per sample
    preds = np.argmax(preds, axis=1)
    labels = np.asarray(labels)

    if labels.ndim > 1 and labels.shape[1] > 1:
        # One-hot labels: convert them to one class id per sample
        labels = np.argmax(labels, axis=1)

    # Every count below must be indexed by the SAME class ids. The previous version
    # counted true negatives for range(num_classes) while tp/fp/fn used the ids
    # actually present in the batch, which mixed up classes (and could yield
    # accuracy > 1) whenever the batch did not contain exactly classes 0..k-1.
    classes = np.unique(labels)
    tp = b_tp(preds, labels, classes)
    fp = b_fp(preds, labels, classes)
    fn = b_fn(preds, labels, classes)
    tn = np.array([np.sum((preds != cls) & (labels != cls)) for cls in classes], dtype=float)

    b_accuracy = np.mean((tp + tn) / len(labels))
    # Classes with a zero denominator contribute 0 instead of NaN
    b_precision = np.nanmean(np.divide(tp, tp + fp, out=np.zeros_like(tp, dtype=float), where=(tp + fp) != 0))
    b_recall = np.nanmean(np.divide(tp, tp + fn, out=np.zeros_like(tp, dtype=float), where=(tp + fn) != 0))
    b_specificity = np.nanmean(np.divide(tn, tn + fp, out=np.zeros_like(tn, dtype=float), where=(tn + fp) != 0))

    return b_accuracy, b_precision, b_recall, b_specificity


In [None]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-cased',  # Pretrained BETO checkpoint
    # One output per category known to the LabelEncoder, instead of a hard-coded 8
    num_labels=len(encoder.classes_),
    output_attentions=False,
    output_hidden_states=False,
)

# Run on GPU when one is available, otherwise fall back to CPU
# (an unconditional model.cuda() fails on CPU-only runtimes).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Build the optimizer after the model has been placed on its device.
# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5,
                              eps = 1e-08
                              )

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'classifier.bias', 'bert.pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Keep the model on the same device the batches are moved to below
model.to(device)

# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 2

for _ in trange(epochs, desc = 'Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass (passing labels makes the model return the loss)
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables: one entry per validation batch
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = model(b_input_ids,
                              token_type_ids = None,
                              attention_mask = b_input_mask)
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)
        # Skip NaN values. The previous `!= 'nan'` test compared a float with a
        # string, so it was always True and never filtered anything.
        if not np.isnan(b_precision): val_precision.append(b_precision)
        if not np.isnan(b_recall): val_recall.append(b_recall)
        if not np.isnan(b_specificity): val_specificity.append(b_specificity)

    # Report the epoch's mean training loss and the mean of the per-batch validation metrics
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')

In [None]:
new_sentence = 'INVERSIONES SAROC SAS'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device and make sure dropout is disabled for inference
model.to(device)
model.eval()

# Token IDs and attention mask for the new sentence, placed on the model's device
encoding = preprocessing(new_sentence, tokenizer)
test_ids = encoding['input_ids'].to(device)
test_attention_mask = encoding['attention_mask'].to(device)

# Single forward pass to get the logits (the cell previously ran the same pass twice)
with torch.no_grad():
  output = model(test_ids, token_type_ids = None, attention_mask = test_attention_mask)

# Index of the highest logit, wrapped in an array because inverse_transform expects a sequence
prediction_coded = np.array([np.argmax(output.logits.cpu().numpy()).item()])

decoded_label = encoder.inverse_transform(prediction_coded)

print('Input Sentence: ', new_sentence)
print('Etiqueta Decodificada:', decoded_label)

Input Sentence:  INVERSIONES SAROC SAS
Etiqueta Decodificada: ['Transporte']


# Save the model and upload it to Hugging Face:

In [None]:
from transformers import TrainingArguments, Trainer, AutoTokenizer

# Write the model and the tokenizer files into the same local directory.
# A local save uploads nothing, so no authentication argument is passed.
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")



('./my_model/tokenizer_config.json',
 './my_model/special_tokens_map.json',
 './my_model/vocab.txt',
 './my_model/added_tokens.json',
 './my_model/tokenizer.json')

In [None]:
model = BertForSequenceClassification.from_pretrained("./my_model")
tokenizer = AutoTokenizer.from_pretrained("./my_model")

In [None]:
from huggingface_hub import HfFolder

#HfFolder.save_token(os.environ["HF_TOKEN"])  # token redacted: never commit access tokens; revoke the one that was pasted here

In [None]:
#model.push_to_hub("jonjimenez/BETO-categorizacion-pagos-espanol")
#tokenizer.push_to_hub("jonjimenez/BETO-categorizacion-pagos-espanol")

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jonjimenez/BETO-categorizacion-pagos-espanol/commit/e8a5204f2092698715d478aeb49131aa3b143d2b', commit_message='Upload tokenizer', commit_description='', oid='e8a5204f2092698715d478aeb49131aa3b143d2b', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Pair every category name known to the encoder with its integer code.
class_names = encoder.classes_
class_codes = encoder.transform(class_names)
label_mapping = {name: code for name, code in zip(class_names, class_codes)}

print(label_mapping)

{'Alimentación': 0, 'Cuentas y servicios': 1, 'Entretenimiento': 2, 'Inversiones': 3, 'Salud': 4, 'Servicios Profesionales y Consultoría': 5, 'Transferencia Bancaria': 6, 'Transporte': 7}


# Guardando el modelo en joblib para publicarlo en un espacio en Hugging Face:

In [None]:
import joblib

# Write the model, the tokenizer and the label -> code mapping to three joblib files
# in the current working directory.
# NOTE(review): joblib files are pickle-based — presumably they should only be loaded
# from trusted sources and with compatible library versions; confirm before publishing.
joblib.dump(model, 'modelo_entrenado.joblib')
joblib.dump(tokenizer, 'tokenizer_entrenado.joblib')
joblib.dump(label_mapping, 'label_mapping.joblib')

['label_mapping.joblib']

# Test coverage with probability for each class:

In [None]:
def preprocessing(text, tokenizer, max_length=512):
    """Encode one text for inference, without padding.

    This definition replaces the fixed-length `preprocessing` defined earlier in
    the notebook for every cell that runs after it.

    Args:
        text: the text to encode.
        tokenizer: object exposing a Hugging Face style `encode_plus` method.
        max_length: upper bound on the number of token ids. The default of 512
            matches the size of the model's position embeddings, so over-long
            inputs are truncated instead of failing in the forward pass.

    Returns:
        Whatever `tokenizer.encode_plus` returns for these arguments
        (PyTorch tensors are requested via `return_tensors='pt'`).
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

In [None]:
import torch.nn.functional as F

def predict_with_probability(sentence):
    """Classify one sentence and report how confident the model is.

    Uses the notebook-level `preprocessing`, `tokenizer`, `model` and `encoder`.

    Args:
        sentence: the text to classify.

    Returns:
        (label, probability): the decoded category with the highest softmax
        probability, and that probability.
    """
    # Tokenize the sentence
    encoding = preprocessing(sentence, tokenizer)

    # Send the inputs to whatever device the model currently lives on, so the
    # function works for both CPU and GPU models.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in encoding.items()}

    # Inference only: disable dropout and gradient tracking
    model.eval()
    with torch.no_grad():
        output = model(**inputs)

    # Softmax over the class dimension; .cpu() is required before .numpy() for GPU tensors
    probabilities = torch.nn.functional.softmax(output.logits, dim=1).cpu().numpy().flatten()

    # Most likely class and its probability
    prediction_coded = np.argmax(probabilities).item()
    max_probability = probabilities[prediction_coded]

    # Map the integer code back to the category name
    decoded_label = encoder.inverse_transform([prediction_coded])

    return decoded_label[0], max_probability

In [None]:
# Keep at most 100 rows for the timing test; the fixed seed makes the subset
# reproducible and min() avoids an error when fewer than 100 rows are available.
df_test_coverage = df_test_coverage.sample(n=min(100, len(df_test_coverage)), random_state=42)

In [None]:
%%time

df_test_coverage[['Predicted_Label', 'Probability']] = df_test_coverage.apply(lambda row: predict_with_probability(row['RAZON_SOCIAL']), axis=1, result_type='expand')

CPU times: user 13.4 s, sys: 27.8 ms, total: 13.5 s
Wall time: 13.7 s


In [None]:
df_test_coverage['Predicted_Label_ajustado'] = np.where(df_test_coverage['Probability'] < 0.5, "No identificado", df_test_coverage['Predicted_Label'])

In [None]:
df_test_coverage['Predicted_Label_ajustado'].value_counts(normalize=True)

No identificado                          0.50
Transporte                               0.13
Alimentación                             0.11
Transferencia Bancaria                   0.08
Entretenimiento                          0.06
Inversiones                              0.05
Cuentas y servicios                      0.04
Salud                                    0.02
Servicios Profesionales y Consultoría    0.01
Name: Predicted_Label_ajustado, dtype: float64

In [None]:
df_test_coverage[df_test_coverage['Predicted_Label_ajustado']!="No identificado"].head(20)

Unnamed: 0,RAZON_SOCIAL,final_category,Predicted_Label,Probability,Predicted_Label_ajustado
506,FUNDACION MACARENA VERDE,No categorizado,Alimentación,0.993454,Alimentación
334,ORNATUS SAS,No categorizado,Inversiones,0.647832,Inversiones
477,ROLDAN URIBE INVESTMENTS S.A.S.,No categorizado,Cuentas y servicios,0.616675,Cuentas y servicios
306,INVERSIONES SAROC SAS,No categorizado,Transporte,0.762254,Transporte
763,ALMEYDA OROZCO CUATRO AES S EN C,No categorizado,Transporte,0.819362,Transporte
963,CORPORACION REDES TURISTICAS DE ANTIOQUIA,No categorizado,Inversiones,0.621349,Inversiones
152,MAPAMI INVERSIONES SAS,No categorizado,Alimentación,0.918997,Alimentación
885,COMERCIALIZADORA BEERSEBA SAS,No categorizado,Alimentación,0.564545,Alimentación
681,NOVENA ASOCIACION DE PALMICULTORES DEL DISTRIT...,No categorizado,Inversiones,0.9993,Inversiones
406,TECNICENTRO SANTA MARTA LTDA,No categorizado,Cuentas y servicios,0.979961,Cuentas y servicios


# Vemos que al usar BERT con fine-tuning, el modelo mejora bastante su cobertura y efectividad, pero sigue teniendo algunas oportunidades de mejora. Se puede seguir iterando para mejorarlo. La siguiente opción es hacer fine-tuning con un modelo que esté entrenado en español, como BETO.

In [None]:
# Remove the original category column before exporting; errors='ignore' keeps the
# cell re-runnable once the column is already gone.
df_test_coverage = df_test_coverage.drop(columns=['final_category'], errors='ignore')

In [None]:
df_test_coverage.to_csv('df_test_coverage.csv', index=False)