Análisis de sentimiento con BERT

In [None]:
!pip install transformers

In [None]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap

In [None]:
# Initialization: global hyperparameters, dataset location, RNG seeds and compute device.
RANDOM_SEED = 42 # Seed applied to NumPy and PyTorch below so that re-running the notebook gives very similar results
MAX_LEN = 200 # Maximum sequence length handed to the tokenizer as max_length (counted in tokens, not words)
BATCH_SIZE = 10 # Rows are fed to the model in batches of this size
DATASET_PATH = '/content/drive/MyDrive/Documentos personales/9-Documentos Doctorado/Doctorado 2023/7-Clases/SRGVUA/rawdata/descripcion_ejercicio3.xlsx'
NCLASSES = 2 # NOTE(review): not referenced anywhere else in the visible notebook; the model defined below has a single regression output
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA GPU when one is available (e.g. a Colab GPU runtime), otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
# Load the dataset: mount Google Drive (Colab only) and read the Excel file at DATASET_PATH into a DataFrame.
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_excel(DATASET_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Quick sanity check: first rows and (rows, columns) shape of the loaded data.
print(df.head())
print(df.shape)

                                         Description  EstContemp_calidad
0  Piso reformado de 4 habitaciones, salón comedo...            0.808448
1  BENET MATEU, PIS D´ORIGEN AMB MOLT BONA DISTRI...            0.625065
2  Apartamento pero con acceso independiente desd...            0.190527
3  [A2977]PISAZO, EL MEJOR DE LA ZONA.FENOMENAL P...            0.444429
4  [A3001]VIVIENDA EN LA CALLE GARROFER DE SANT I...            0.380063
(4086, 2)


In [None]:
# TOKENIZATION
# Checkpoint identifier; the same name is reused further down to load the matching BertModel weights.
PRE_TRAINED_MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

In [None]:
# Tokenization example: split a sample sentence into sub-word tokens and map each token to its vocabulary id.
sample_txt = 'Piso con vistas a Vía Laetiana, recién reformado'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Frase: ', sample_txt)
print('Tokens: ', tokens)
print('Tokens numéricos: ', token_ids)

Frase:  Piso con vistas a Vía Laetiana, recién reformado
Tokens:  ['Pis', '##o', 'con', 'vistas', 'a', 'Vía', 'La', '##et', '##iana', ',', 'recién', 'reforma', '##do']
Tokens numéricos:  [22652, 30933, 1051, 8848, 1013, 14169, 1198, 1710, 15851, 1017, 9064, 5562, 1047]


In [None]:
# Encode the sample sentence in the format BERT expects: special tokens added,
# truncated / padded to exactly MAX_LEN tokens, returned as PyTorch tensors.
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length = MAX_LEN,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    # `pad_to_max_length=True` is deprecated; this is its documented replacement.
    padding = 'max_length',
    return_attention_mask = True,
    return_tensors = 'pt'
)



In [None]:
# Show which fields the tokenizer returned for the sample sentence.
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
# Inspect the encoded sample: tokens recovered from the ids, the raw ids, and the attention mask.
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
print(encoding['attention_mask'][0])

['[CLS]', 'Pis', '##o', 'con', 'vistas', 'a', 'Vía', 'La', '##et', '##iana', ',', 'recién', 'reforma', '##do', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [None]:
# CREACIÓN DATASET

class HabitacliaDataset(Dataset):
  """Map-style dataset pairing listing descriptions with a continuous target.

  Each item is tokenized lazily, on access, with the tokenizer supplied to the
  constructor, and padded / truncated to ``max_len`` tokens.
  """

  def __init__(self,Description,labels,tokenizer,max_len):
    """Store the texts, their targets and the tokenization settings.

    Args:
      Description: indexable sequence of texts (each is passed through ``str``).
      labels: indexable sequence of numeric targets aligned with ``Description``.
      tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
      max_len: length every encoded sequence is padded / truncated to.
    """
    self.Description = Description
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.Description)

  def __getitem__(self, item):
    """Return the raw text, its encoded tensors and its target for index ``item``."""
    Description = str(self.Description[item])
    EstContemp_calidad = self.labels[item]
    # Use the tokenizer given to the constructor, not whatever global
    # `tokenizer` happens to exist in the notebook at call time.
    encoding = self.tokenizer.encode_plus(
        Description,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        # Replacement for the deprecated `pad_to_max_length = True`.
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
        )

    return {
          'Description': Description,
          # encode_plus returns (1, max_len) tensors; flatten to (max_len,).
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          # The target is continuous: torch.long would truncate e.g. 0.81 to 0
          # and the regressor would be trained on all-zero labels.
          'EstContemp_calidad': torch.tensor(EstContemp_calidad, dtype=torch.float)
      }

In [None]:
# Data loader:

def data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap a DataFrame of descriptions and targets in a batched DataLoader.

  Args:
    df: DataFrame with ``Description`` and ``EstContemp_calidad`` columns.
    tokenizer: tokenizer forwarded to ``HabitacliaDataset``.
    max_len: padded / truncated sequence length forwarded to the dataset.
    batch_size: number of rows per batch.
    shuffle: reshuffle the rows at every epoch; defaults to False, which
      matches the previous fixed-order behaviour.

  Returns:
    A ``DataLoader`` over a ``HabitacliaDataset`` built from ``df``.
  """
  dataset = HabitacliaDataset(
      Description = df.Description.to_numpy(),
      labels = df.EstContemp_calidad.to_numpy(),
      tokenizer = tokenizer,
      # Honour the arguments instead of silently reading the MAX_LEN /
      # BATCH_SIZE globals.
      max_len = max_len
  )

  return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle, num_workers = 4)

In [None]:
# Hold out 20% of the rows for evaluation; RANDOM_SEED makes the split reproducible.
df_train, df_test = train_test_split(df, test_size = 0.2, random_state=RANDOM_SEED)

# One loader per split, sharing the tokenizer and the global hyperparameters.
train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)



In [None]:
import torch.nn.functional as F

class BERTSentimentRegressor(nn.Module):
    """BERT encoder followed by dropout and a one-unit linear regression head."""

    def __init__(self):
        """Load the pre-trained encoder and create the dropout / linear layers."""
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        # A single output unit: one real-valued prediction per input text.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return a flat tensor with one prediction per sequence in the batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is the [CLS] token representation.
        cls_vector = encoded.last_hidden_state[:, 0, :]
        return self.linear(self.drop(cls_vector)).view(-1)

In [None]:
# Build the regressor and place its parameters on the selected device.
model = BERTSentimentRegressor().to(device)

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# TRAINING SETUP
EPOCHS = 3
# NOTE(review): recent transformers releases deprecate their own AdamW in favour of
# torch.optim.AdamW (which has no `correct_bias` argument) — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# One scheduler step is taken per batch, so the schedule spans batches-per-epoch * EPOCHS steps.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
# Mean squared error: the model outputs one real value per text.
loss_fn = nn.MSELoss().to(device)



In [None]:
# Iteración entrenamiento
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training epoch and return the mean of the per-batch losses.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``EstContemp_calidad`` entries.
        loss_fn: callable ``loss_fn(predictions, targets)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: accepted for signature compatibility; not used.
    """
    model = model.train()
    batch_losses = []
    for batch in data_loader:
        # Targets are cast to float so they match the real-valued predictions.
        targets = batch['EstContemp_calidad'].to(device).float()
        predictions = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        batch_loss = loss_fn(predictions, targets)
        batch_losses.append(batch_loss.item())
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
    return np.mean(batch_losses)


# Cambiar las métricas de evaluación
def eval_model_regression(model, data_loader, loss_fn, device, n_examples):
    """Evaluate a regressor without gradient tracking.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``EstContemp_calidad`` entries.
        loss_fn: callable ``loss_fn(predictions, targets)``.
        device: device the batch tensors are moved to.
        n_examples: divisor for the summed absolute error.

    Returns:
        ``(mae, mean_loss)``: summed absolute error divided by ``n_examples``
        and the mean of the per-batch losses.
    """
    model = model.eval()
    batch_losses = []
    abs_error_sum = 0
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['EstContemp_calidad'].to(device).float()
            predictions = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            batch_losses.append(loss_fn(predictions, targets).item())
            # Sum (not mean) per batch, so dividing by n_examples gives the MAE.
            abs_error_sum += F.l1_loss(predictions, targets, reduction='sum').item()
    mae = abs_error_sum / n_examples
    return mae, np.mean(batch_losses)

In [None]:
# Training: for every epoch, run one optimisation pass over the training loader,
# then report MAE and loss on the held-out loader.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('------------------')
    # train_model returns a single value: the mean training loss of the epoch.
    train_loss = train_model(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    test_mae, test_loss = eval_model_regression(
        model, test_data_loader, loss_fn, device, len(df_test)
    )
    print(f'Training:   Loss: {train_loss:.2f}')
    print(f'Validation: MAE: {test_mae:.2f}, Loss: {test_loss:.2f}')
    print()


In [None]:
from sklearn.metrics import r2_score

def eval_model_r2(model, data_loader, device):
    """Return the R^2 score of the model's predictions over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``EstContemp_calidad`` entries.
        device: device the batch tensors are moved to.
    """
    model = model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['EstContemp_calidad'].to(device).float()
            scores = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            # Bring results back to the CPU before handing them to scikit-learn.
            y_pred.extend(scores.cpu().numpy())
            y_true.extend(targets.cpu().numpy())
    return r2_score(y_true, y_pred)

# R^2 of the current model on the held-out loader.
r2 = eval_model_r2(model, test_data_loader, device)
print(f'R^2 Score: {r2:.4f}')

**Intento 16 de febrero - 1**

In [None]:
!pip install torch torchvision transformers

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import pandas as pd

In [None]:
# Load the pre-trained BETO checkpoint and its tokenizer.
# num_labels=1 is passed so that the classification head has a single output.
model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Definir una función para preparar los datos
def prepare_data(df):
    """Tokenize the descriptions and build the matching target tensor.

    Args:
        df: DataFrame with ``Description`` and ``EstContemp_calidad`` columns.

    Returns:
        ``(encodings, targets)``: the output of the notebook-level ``tokenizer``
        for all descriptions, and a float32 tensor of targets with a trailing
        dimension of size 1 added by ``unsqueeze(1)``.
    """
    descriptions = df['Description'].tolist()
    # All texts are encoded in one call; no explicit max_length is given here.
    encodings = tokenizer(descriptions, padding=True, truncation=True, return_tensors='pt')

    target_values = df['EstContemp_calidad'].tolist()
    targets = torch.tensor(target_values, dtype=torch.float32).unsqueeze(1)

    return encodings, targets

# Load the data.
# NOTE(review): `drive` is not imported in this section; this relies on the
# `from google.colab import drive` executed by an earlier cell.
DATASET_PATH = '/content/drive/MyDrive/Documentos personales/9-Documentos Doctorado/Doctorado 2023/7-Clases/SRGVUA/rawdata/descripcion_ejercicio3.xlsx'
drive.mount('/content/drive')
df = pd.read_excel(DATASET_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prepare the data
tokenized_texts, labels = prepare_data(df)


class EncodedTextDataset(torch.utils.data.Dataset):
    """Serve tokenizer output and targets as one dict per example.

    Trainer needs datasets whose items are dicts of model inputs including a
    ``labels`` entry; the raw tokenizer output or a bare label tensor is not
    a usable dataset.
    """

    def __init__(self, encodings, targets):
        self.encodings = encodings
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        example = {name: values[idx] for name, values in self.encodings.items()}
        example['labels'] = self.targets[idx]
        return example


# Hold out 20% of the examples for evaluation (seeded for reproducibility).
full_dataset = EncodedTextDataset(tokenized_texts, labels)
n_eval = int(0.2 * len(full_dataset))
train_dataset, eval_dataset = torch.utils.data.random_split(
    full_dataset,
    [len(full_dataset) - n_eval, n_eval],
    generator=torch.Generator().manual_seed(42),
)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    save_total_limit=1,
    load_best_model_at_end=True,
)

# Trainer: train on the training subset and evaluate on the held-out subset.
# (Previously the raw tokenizer output was passed as train_dataset and the
# label tensor as eval_dataset.)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers[torch]

In [None]:
import accelerate
import transformers

In [None]:
# Train the model
trainer.train()

# Save the trained model
model.save_pretrained("./sentiment_model")

# Save the tokenizer next to the model, in the same directory
tokenizer.save_pretrained("./sentiment_model")

**Intento 16 de febrero - 2**

In [None]:
!pip install transformers torch

import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the pre-trained BETO checkpoint and its tokenizer (single-output head: num_labels=1)
model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Load the data from Google Drive (Colab only)
from google.colab import drive
DATASET_PATH = '/content/drive/MyDrive/Documentos personales/9-Documentos Doctorado/Doctorado 2023/7-Clases/SRGVUA/rawdata/descripcion_ejercicio3.xlsx'
drive.mount('/content/drive')
df = pd.read_excel(DATASET_PATH)

# Tokenize all texts in one call (sequences truncated to at most 120 tokens) and
# turn the targets into a float32 tensor with a trailing dimension of size 1
tokenized_texts = tokenizer(
    df['Description'].tolist(),
    padding=True,
    truncation=True,
    max_length=120,
    return_tensors='pt'
)
labels = torch.tensor(df['EstContemp_calidad'].tolist(), dtype=torch.float32).unsqueeze(1)

In [None]:
# Split the data into training and validation sets.
# A single call splits input ids, attention masks and labels with the same
# permutation, instead of relying on two separate calls sharing a seed.
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    tokenized_texts.input_ids,
    tokenized_texts.attention_mask,
    labels,
    random_state=42,
    test_size=0.3,
)

# Build the DataLoaders for the training and validation sets
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)



In [None]:
# Training
# Run on the GPU when one is available: the model and every batch must live on
# the same device, otherwise training silently runs on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 2
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        # Batch-local names: reusing `labels` here would overwrite the
        # notebook-level label tensor and break a re-run of the split cell.
        batch_inputs, batch_masks, batch_labels = (t.to(device) for t in batch)
        outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch_inputs, batch_masks, batch_labels = (t.to(device) for t in batch)
            outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
            total_eval_loss += outputs.loss.item()
    avg_val_loss = total_eval_loss / len(val_dataloader)
    print(f'Epoch {epoch+1}/{epochs}, Validation Loss: {avg_val_loss}')

KeyboardInterrupt: 

In [None]:
# Save the trained model and its tokenizer to the same directory
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

**Intento 16 de febrero - 3**

In [None]:
!pip install transformers torch

import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd

# BertConfig is not imported by any import line of this notebook, so it is
# brought into scope here.
from transformers import BertConfig

# Pre-trained BETO checkpoint and its tokenizer (the tokenizer is used and
# saved further down in this cell).
model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Custom configuration with fewer encoder layers.
# num_labels=1: single regression output, matching the (N, 1) float targets.
config = BertConfig.from_pretrained(model_name, num_labels=1)
config.num_hidden_layers = 3  # e.g. keep only 3 layers

# Build the reduced model from the checkpoint so the layers that are kept start
# from the pre-trained weights rather than from a random initialisation.
model = BertForSequenceClassification.from_pretrained(model_name, config=config)


# Load the data
from google.colab import drive
DATASET_PATH = '/content/drive/MyDrive/Documentos personales/9-Documentos Doctorado/Doctorado 2023/7-Clases/SRGVUA/rawdata/descripcion_ejercicio3.xlsx'
drive.mount('/content/drive')
# Actually read the file: previously the path was defined and Drive mounted,
# but `df` was whatever an earlier cell had left in memory.
df = pd.read_excel(DATASET_PATH)

# Maximum number of tokens kept per text
max_length = 200

# Tokenize the texts and turn the targets into a float32 tensor with a
# trailing dimension of size 1
tokenized_texts = tokenizer(
    df['Description'].tolist(),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors='pt'
)
labels = torch.tensor(df['EstContemp_calidad'].tolist(), dtype=torch.float32).unsqueeze(1)

# Split the data into training and validation sets.
# A single call splits input ids, attention masks and labels with the same
# permutation, instead of relying on two separate calls sharing a seed.
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    tokenized_texts.input_ids,
    tokenized_texts.attention_mask,
    labels,
    random_state=42,
    test_size=0.2,
)

# Build the DataLoaders for the training and validation sets
batch_size = 8  # batch size can be tuned here
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Replace the model's `dropout` module with one using a custom probability
dropout_prob = 0.3  # dropout probability can be tuned here
model.dropout = torch.nn.Dropout(dropout_prob)

# Optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training
# Run on the GPU when one is available: the model and every batch must live on
# the same device, otherwise training silently runs on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        # Batch-local names: reusing `labels` here would overwrite the
        # notebook-level label tensor built earlier in this cell.
        batch_inputs, batch_masks, batch_labels = (t.to(device) for t in batch)
        outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch_inputs, batch_masks, batch_labels = (t.to(device) for t in batch)
            outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
            total_eval_loss += outputs.loss.item()
    avg_val_loss = total_eval_loss / len(val_dataloader)
    print(f'Epoch {epoch+1}/{epochs}, Validation Loss: {avg_val_loss}')

# Save the trained model and its tokenizer to the same directory
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
