# Bert_sentiment_analysis

Notebook para el entrenamiento de un modelo BERT para el análisis de sentimientos.

In [None]:
!pip install pandas numpy scikit-learn transformers torch imblearn tqdm

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay

import os

In [None]:
# Read the train and test splits of the Kaggle sentiment dataset; both files
# are decoded with the ISO-8859-1 codec.
df = pd.read_csv(
    '/kaggle/input/sentiment-analysis-dataset/train.csv',
    encoding='ISO-8859-1',
)
test_df = pd.read_csv(
    '/kaggle/input/sentiment-analysis-dataset/test.csv',
    encoding='ISO-8859-1',
)

# Quick look at the test split: label distribution, then number of rows.
print(test_df['sentiment'].value_counts())

print(len(test_df))

# Last expression of the cell, so the notebook renders the first training rows.
df.head()

In [None]:
import re

# Patterns are compiled once so that applying clean_text over a whole
# DataFrame column does not look them up again for every row.
_HTML_TAG_RE = re.compile(r'<.*?>')
# Anchored on a word boundary plus the "http(s)://" or "www." prefix, so that
# ordinary words which merely contain "www" or "http" (e.g. "awwww") are kept.
_URL_RE = re.compile(r"\bhttps?://\S+|\bwww\.\S+")
_MENTION_RE = re.compile(r"@\w+")


def clean_text(text):
    """Normalize a raw post before tokenization.

    Removes HTML tags, URLs and @mentions, drops the ``#`` symbol while
    keeping the hashtag word, then strips surrounding whitespace and
    lower-cases the result. Whitespace left behind in the middle of the
    string by a removed token is not collapsed.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = text.replace('#', '')  # keep the hashtag word, drop only the symbol
    return text.strip().lower()

In [None]:
# Training split: rows without a label are dropped, missing texts become
# empty strings so clean_text always receives a str. The index is rebuilt so
# it is contiguous again after the drop.
df = df.dropna(subset=['sentiment']).reset_index(drop=True)
df['text'] = df['text'].fillna('').astype(str).apply(clean_text)

# Test split: same treatment. The contiguous 0..n-1 index matters because
# later cells look rows up by position-like integer labels.
test_df = test_df.dropna(subset=['sentiment']).reset_index(drop=True)
test_df['text'] = test_df['text'].fillna('').astype(str).apply(clean_text)

# Learn the string -> integer id mapping from the training labels only.
train_label_encoder = LabelEncoder()
df['sentiment_label'] = train_label_encoder.fit_transform(df['sentiment'])
print(f'Sentiment categories: {train_label_encoder.classes_}')

# Reuse the training encoder for the test labels so both splits share one
# id mapping; fitting a second encoder on the test labels could assign
# different ids if the two splits do not contain the same set of classes.
# transform() raises ValueError on a label that was never seen in training.
# The separate name is kept because later cells refer to it.
test_label_encoder = train_label_encoder
test_df['sentiment_label'] = test_label_encoder.transform(test_df['sentiment'])

In [None]:
# Single checkpoint name used for both the tokenizer and the classifier.
PRETRAINED_NAME = 'bert-base-uncased'

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)

# Sequence classifier configured with three output labels, placed on `device`.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=3).to(device)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item.

    The base class is referenced by its fully qualified name because this
    class reuses the name ``Dataset``: with ``class Dataset(Dataset)`` every
    re-run of the cell would inherit from the previous definition of this
    very class instead of from the torch base class.

    Args:
        texts: Sequence of strings, indexable by integer position.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=True,
            max_length=..., padding='max_length', truncation=True,
            return_tensors='pt')`` that returns a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` flattened to
            1-D, and ``'labels'`` as a scalar ``torch.long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' is requested for a single text; flatten the
            # result to 1-D so the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Wrap the cleaned texts and encoded labels of each split in tokenizing datasets.
train_dataset = Dataset(df['text'].tolist(), df['sentiment_label'].tolist(), tokenizer)
val_dataset = Dataset(test_df['text'].tolist(), test_df['sentiment_label'].tolist(), tokenizer)

BATCH_SIZE = 32

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps the
# validation pass reproducible from epoch to epoch.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
def _forward_batch(batch):
    """Move one collated batch to ``device`` and run ``model`` on it with labels."""
    return model(
        batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        labels=batch['labels'].to(device),
    )


epochs = 5
optimizer = AdamW(model.parameters(), lr=1e-5)

# Average loss per epoch, one entry per epoch, for plotting afterwards.
train_loss_list = []
val_loss_list = []

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        loss = _forward_batch(batch).loss

        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        # Clear the gradients so the next batch does not accumulate onto them.
        optimizer.zero_grad()

    train_loss_list.append(train_loss/len(train_loader))
    print(f"Epoch {epoch+1} Train Loss: {train_loss/len(train_loader)}")

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0
    preds, true_pred = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            outputs = _forward_batch(batch)
            val_loss += outputs.loss.item()

            # Predicted class = index of the largest logit for each sample.
            preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            true_pred.extend(batch['labels'].numpy())

    val_loss_list.append(val_loss/len(val_loader))
    print(f"Epoch {epoch+1} Val Loss: {val_loss/len(val_loader)}")
    print(classification_report(true_pred, preds, target_names=test_label_encoder.classes_))
            



    

In [None]:
# Plot the per-epoch average training and validation losses.
epoch_numbers = range(1, epochs + 1)

fig, ax = plt.subplots()
ax.plot(epoch_numbers, train_loss_list, label='Training Loss')
ax.plot(epoch_numbers, val_loss_list, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training vs Validation Loss')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Display names (in Spanish) for the class ids returned by pred_sent.
# NOTE(review): assumes the label encoder assigned 0/1/2 to
# negative/neutral/positive - confirm against train_label_encoder.classes_.
id2label = {0: "Negativo", 1: "Neutro", 2: "Positivo"}


def pred_sent(texto):
    """Predict the sentiment class id for a single text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The model
    is switched to evaluation mode for the forward pass, so dropout cannot
    make predictions non-deterministic, and its previous mode is restored
    afterwards.

    Args:
        texto: The text to classify.

    Returns:
        The integer index of the largest logit; ``id2label`` maps it to a
        display name.
    """
    # Tokenize with the same settings used for the training dataset.
    inputs = tokenizer(
            texto,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    # Move the tensors to the device the model lives on (CPU or GPU).
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    was_training = model.training
    model.eval()
    try:
        # Gradients are not needed for inference.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            pred_id = torch.argmax(logits, dim=1).item()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return pred_id


In [None]:
#texto = "I'm so incredibly angry with how things have turned out. I gave everything I had — my time, my energy, my trust — and for what? To be ignored, dismissed, and treated like I never mattered. People keep crossing the line, thinking I’ll stay quiet forever, but I’m done pretending it’s okay. I’m tired of the fake apologies, the empty promises, and the constant disrespect. I’ve kept my mouth shut for far too long, but not anymore. I deserve better than this, and I won’t tolerate being walked all over just to keep the peace. If no one’s going to take me seriously, then they’re about to see what happens when I stop holding back."
#sentimiento = id2label[pred_sent(texto)]
#print(f"Sentimiento: {sentimiento}")

In [None]:
# Predict a class id for every test text. The column is iterated directly
# rather than looked up as test_df['text'][i]: that form is a label lookup,
# and it raises KeyError as soon as the index is not exactly 0..n-1 (for
# example after rows were dropped).
sent_list = [pred_sent(str(text)) for text in test_df['text']]

In [None]:
# Compare the encoded true labels of the test split with the predicted ids.
cm = confusion_matrix(y_true=test_df['sentiment_label'], y_pred=sent_list)

# Axis tick labels are the class names held by the training label encoder.
disp = ConfusionMatrixDisplay(cm, display_labels=train_label_encoder.classes_)
disp.plot()

In [None]:
# Create the output folders. os.makedirs also creates missing parents, so the
# top-level bert_sentiment_model directory needs no call of its own.
os.makedirs("/kaggle/working/bert_sentiment_model/model", exist_ok=True)
os.makedirs("/kaggle/working/bert_sentiment_model/tokenizer", exist_ok=True)

# Save the fine-tuned model and its tokenizer. The former
# `save_embedding_model=False` argument was removed: it is not a
# save_pretrained parameter, so it had no effect and only misled readers.
model.save_pretrained('/kaggle/working/bert_sentiment_model/model/bert_sentiment_model')
tokenizer.save_pretrained('/kaggle/working/bert_sentiment_model/tokenizer/bert_tokenizer')