### 1. Configuración del entorno y descarga del dataset




In [None]:
!pip install transformers



In [None]:
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW # Corrected import
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from textwrap import wrap
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# ==================================
# 1. CONFIGURATION
# ==================================
# --- Model and tokenizer parameters ---
# Checkpoint name handed to the `from_pretrained` calls for the tokenizer and the encoder.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
MAX_LEN = 160 # Maximum number of tokens per tweet (inputs are padded/truncated to this length)

In [None]:
# --- Training parameters ---
BATCH_SIZE = 16
EPOCHS = 5 # Fewer epochs may be enough when fine-tuning BERT
RANDOM_SEED = 42

In [None]:
# --- Device selection ---
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Usando dispositivo: {device}")

Usando dispositivo: cuda:0


In [None]:
# --- Seeds for reproducibility ---
# Seed NumPy's global RNG and PyTorch's default generator.
# (The device selection that used to be repeated here was a verbatim copy of the
# previous cell and has been removed; `device` is already defined there.)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

Usando dispositivo: cuda:0


In [None]:
# Additional seeds for full reproducibility
import random

random.seed(RANDOM_SEED)                 # Python's built-in RNG
torch.cuda.manual_seed_all(RANDOM_SEED)  # CUDA generators

# cuDNN: turn off the benchmark auto-tuner and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

### 2. Carga y limpieza de datos

In [None]:
# ==================================
# 2. CARGA Y LIMPIEZA DE DATOS
# ==================================
import kagglehub  # <-- SOLUCIÓN: Añade esta línea para importar la biblioteca

print("\nDescargando dataset 'twitter-airline-sentiment' desde KaggleHub...")
# Download and extract the dataset into a local directory using kagglehub;
# `path` holds whatever location dataset_download returns.
path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")
# Path of the tweets CSV inside the downloaded dataset folder
csv_path = f"{path}/Tweets.csv"
print(f"Dataset descargado en: {path}")



Descargando dataset 'twitter-airline-sentiment' desde KaggleHub...
Downloading from https://www.kaggle.com/api/v1/datasets/download/crowdflower/twitter-airline-sentiment?dataset_version_number=4...


100%|██████████| 2.55M/2.55M [00:00<00:00, 64.5MB/s]

Extracting files...





Dataset descargado en: /root/.cache/kagglehub/datasets/crowdflower/twitter-airline-sentiment/versions/4


In [None]:
# Load the tweets CSV into a pandas DataFrame
df = pd.read_csv(csv_path)

In [None]:
# --- Preprocessing and cleaning ---
# Clean the text of every tweet: strip URLs and @mentions, then collapse whitespace

import re # Import the regular expression module

def clean_tweet(tweet: str) -> str:
    """Strip links and @mentions from a tweet and normalise its whitespace.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with ``http...`` / ``www.`` links and ``@user`` mentions removed,
        every run of whitespace collapsed to a single space, and no leading or
        trailing whitespace.
    """
    # Substitutions are applied in this order: links, mentions, whitespace runs.
    substitutions = (
        (r'http\S+|www\.\S+', ''),
        (r'@\w+', ''),
        (r'\s+', ' '),
    )
    cleaned = tweet
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

print("Limpiando tweets...")
# Cast to str first so every cell reaching clean_tweet is a string.
df['text'] = df['text'].astype(str).map(clean_tweet)


Limpiando tweets...


In [None]:
# Map sentiment labels to integer class ids.
# The original dataset uses 'negative', 'neutral', 'positive'.
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['airline_sentiment'].map(sentiment_map)

# Series.map yields NaN for any value missing from the mapping, which would turn the
# column into floats and only fail much later, when labels are converted to tensors.
# Fail here instead, with the offending values in the message.
_unmapped_rows = df['label'].isna()
if _unmapped_rows.any():
    _unexpected = sorted(df.loc[_unmapped_rows, 'airline_sentiment'].astype(str).unique())
    raise ValueError(f"Unmapped airline_sentiment values: {_unexpected}")
df['label'] = df['label'].astype('int64')
N_CLASSES = len(sentiment_map)

print(f"\nNúmero de clases: {N_CLASSES}")
print(f"Distribución de sentimientos:\n{df['airline_sentiment'].value_counts()}")



Número de clases: 3
Distribución de sentimientos:
airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64


In [None]:
# ==================================
# 3. PYTORCH DATASET AND DATALOADER
# ==================================
# Load the tokenizer for the checkpoint named by PRE_TRAINED_MODEL_NAME
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

class TweetSentimentDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per requested item.

    Args:
        texts: Indexable collection of tweet texts.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable Hugging Face tokenizer used to encode each text.
        max_len: Length every encoded example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """
    def __init__(self, texts, labels, tokenizer, max_len):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, its encoded tensors and its label for index ``item``."""
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten it away so the
            # DataLoader can stack examples into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
    """Build a DataLoader over the tweets of a DataFrame.

    Args:
        df: DataFrame exposing a ``text`` and a ``label`` column.
        tokenizer: Tokenizer forwarded to ``TweetSentimentDataset``.
        max_len: Padded/truncated sequence length of every example.
        batch_size: Number of examples per batch.
        shuffle: Reshuffle the examples at every epoch. Defaults to ``False``
            (the previous, fixed-order behaviour); pass ``True`` for training data.
        num_workers: Number of loader worker processes. Defaults to 2.

    Returns:
        A ``DataLoader`` yielding the dictionaries produced by ``TweetSentimentDataset``.
    """
    ds = TweetSentimentDataset(
        texts=df.text.to_numpy(),
        labels=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    # Worker processes speed up loading when the CPU has several cores.
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# --- División de datos en entrenamiento y prueba ---
# stratify asegura que la proporción de sentimientos sea la misma en ambos sets
from sklearn.model_selection import train_test_split

# 80/10/10 stratified split: hold out 20% first, then halve it into validation and test.
df_train, df_temp = train_test_split(
    df, test_size=0.2, random_state=RANDOM_SEED, stratify=df['label']
)
df_val, df_test = train_test_split(
    df_temp, test_size=0.5, random_state=RANDOM_SEED, stratify=df_temp['label']
)
# Every row must land in exactly one of the three subsets.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

# Training batches are reshuffled at every epoch; without this each epoch would see the
# exact same batches in the exact same order. The dataset is still built by the shared
# helper, and only the loader around it is replaced by a shuffling one.
train_data_loader = DataLoader(
    create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE).dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
# Validation and test keep a fixed order so their metrics are comparable across runs.
val_data_loader   = create_data_loader(df_val,   tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader  = create_data_loader(df_test,  tokenizer, MAX_LEN, BATCH_SIZE)



In [None]:
# ==================================
# 4. DEFINICIÓN DEL MODELO BERT
# ==================================
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: Number of output classes (size of the logits vector).
    """
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        # Dropout acts as regularisation between the encoder and the output layer.
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of encoded inputs."""
        # With return_dict=False the encoder returns a tuple; position 1 is the
        # pooled output, used here as the representation of the whole sequence.
        pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )[1]
        return self.out(self.drop(pooled))

# Build the classifier and move its parameters to the selected device.
model = SentimentClassifier(N_CLASSES).to(device)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
#Class weights opcionales
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

USE_CLASS_WEIGHTS = True  # set to False to train with an unweighted loss

if not USE_CLASS_WEIGHTS:
    loss_fn = nn.CrossEntropyLoss().to(device)
else:
    # Training labels as a 1-D array
    y_train = df_train['label'].to_numpy()

    # Sorted, de-duplicated class ids present in the training labels
    classes = np.unique(y_train)

    # 'balanced' weighting computed from the training labels only
    cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

    class_weights_tensor = torch.tensor(cw, dtype=torch.float, device=device)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor).to(device)



In [None]:
# ==================================
# 5. LÓGICA DE ENTRENAMIENTO Y EVALUACIÓN
# ==================================
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

# AdamW over all model parameters (learning rate 2e-5, weight decay 1e-2)
optimizer = AdamW(params=model.parameters(), lr=2e-5, weight_decay=1e-2)

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs); the first 10% of those steps are warm-up.
total_steps = EPOCHS * len(train_data_loader)
num_warmup_steps = int(0.1 * total_steps)  # 10%

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=total_steps
)

from sklearn.metrics import f1_score

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training pass over ``data_loader`` and report its metrics.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; returns logits.
        data_loader: Iterable of batches with "input_ids", "attention_mask" and "label".
        loss_fn: Loss applied to ``(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss, macro_f1)``; ``accuracy`` is a 0-dim double tensor.
    """
    model.train()
    batch_losses = []
    predicted, expected = [], []
    n_correct = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        batch_preds = torch.max(logits, dim=1).indices
        loss = loss_fn(logits, targets)

        # Bookkeeping for the epoch-level metrics
        n_correct += (batch_preds == targets).sum()
        batch_losses.append(loss.item())
        predicted.extend(batch_preds.detach().cpu().tolist())
        expected.extend(targets.detach().cpu().tolist())

        # Backward pass with gradient-norm clipping, then optimizer and scheduler steps
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    macro_f1 = f1_score(expected, predicted, average='macro')
    return accuracy, np.mean(batch_losses), macro_f1

@torch.no_grad()
def eval_epoch(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` over ``data_loader`` with gradients disabled.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; returns logits.
        data_loader: Iterable of batches with "input_ids", "attention_mask" and "label".
        loss_fn: Loss applied to ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss, macro_f1, labels, predictions)``; ``accuracy`` is a
        0-dim double tensor and the last two items are NumPy arrays.
    """
    model.eval()
    batch_losses = []
    predicted, expected = [], []
    n_correct = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        batch_preds = torch.max(logits, dim=1).indices
        loss = loss_fn(logits, targets)

        n_correct += (batch_preds == targets).sum()
        batch_losses.append(loss.item())
        predicted.extend(batch_preds.detach().cpu().tolist())
        expected.extend(targets.detach().cpu().tolist())

    accuracy = n_correct.double() / n_examples
    macro_f1 = f1_score(expected, predicted, average='macro')
    return accuracy, np.mean(batch_losses), macro_f1, np.array(expected), np.array(predicted)


In [None]:
# --- Training loop with early stopping ---
print("\nIniciando entrenamiento con Early Stopping (F1 macro)...")

# Start below any attainable F1 so the first epoch ALWAYS writes a checkpoint.
# With a 0.0 start and the strict '>' test below, a validation F1 of exactly 0.0 would
# save nothing, and the load after the loop would fail or silently pick up a stale
# file left behind by an earlier run.
best_val_f1 = float('-inf')
patience = 2
epochs_no_improve = 0
best_model_path = 'best_model_state.bin'

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch + 1}/{EPOCHS}')
    print('-' * 30)

    train_acc, train_loss, train_f1 = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    val_acc, val_loss, val_f1, y_val, y_pred_val = eval_epoch(
        model, val_data_loader, loss_fn, device, len(df_val)
    )

    print(f'Train   | Loss: {train_loss:.4f}  Acc: {train_acc:.4f}  F1: {train_f1:.4f}')
    print(f'Val     | Loss: {val_loss:.4f}    Acc: {val_acc:.4f}    F1: {val_f1:.4f}')

    if val_f1 > best_val_f1:
        # New best validation macro-F1: checkpoint the weights and reset the patience counter.
        best_val_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        epochs_no_improve = 0
        print(f'✅ Mejor modelo guardado (Val F1: {best_val_f1:.4f})')
    else:
        epochs_no_improve += 1
        print(f'No mejora en F1. Paciencia: {epochs_no_improve}/{patience}')
        if epochs_no_improve >= patience:
            print('⏹️ Early stopping activado.')
            break

# Restore the checkpoint with the best validation macro-F1 before touching the test set.
print("\nCargando el mejor modelo según F1 macro de validación...")
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.to(device)

# Final evaluation on the held-out test split

test_acc, test_loss, test_f1, y_test, y_pred_test = eval_epoch(
    model, test_data_loader, loss_fn, device, len(df_test)
)
print(f"\nTEST | Loss: {test_loss:.4f}  Acc: {test_acc:.4f}  F1: {test_f1:.4f}")

print("\nClassification report (test):")
print(classification_report(y_test, y_pred_test, digits=4))

print("Matriz de confusión (test):")
print(confusion_matrix(y_test, y_pred_test))


Iniciando entrenamiento con Early Stopping (F1 macro)...

Epoch 1/5
------------------------------
Train   | Loss: 0.7169  Acc: 0.6807  F1: 0.6336
Val     | Loss: 0.5481    Acc: 0.8204    F1: 0.7789
✅ Mejor modelo guardado (Val F1: 0.7789)

Epoch 2/5
------------------------------
Train   | Loss: 0.4086  Acc: 0.8660  F1: 0.8374
Val     | Loss: 0.5855    Acc: 0.8210    F1: 0.7840
✅ Mejor modelo guardado (Val F1: 0.7840)

Epoch 3/5
------------------------------
Train   | Loss: 0.2484  Acc: 0.9264  F1: 0.9104
Val     | Loss: 0.7744    Acc: 0.8163    F1: 0.7773
No mejora en F1. Paciencia: 1/2

Epoch 4/5
------------------------------
Train   | Loss: 0.1635  Acc: 0.9617  F1: 0.9521
Val     | Loss: 1.0540    Acc: 0.8361    F1: 0.7946
✅ Mejor modelo guardado (Val F1: 0.7946)

Epoch 5/5
------------------------------
Train   | Loss: 0.1057  Acc: 0.9758  F1: 0.9693
Val     | Loss: 1.1296    Acc: 0.8415    F1: 0.7979
✅ Mejor modelo guardado (Val F1: 0.7979)

Cargando el mejor modelo según F1 m

In [None]:
# ==================================
# 6. PREDICCIÓN CON NUEVO TEXTO
# ==================================
def classify_sentiment(text, model, tokenizer, device, max_len=160):
    """Predict the sentiment label of a single piece of text.

    Args:
        text: Raw text to classify; it is cleaned with ``clean_tweet`` first.
        model: Classifier called with ``input_ids`` and ``attention_mask``; returns logits.
        tokenizer: Callable Hugging Face tokenizer used to encode the text.
        device: Device the encoded tensors are moved to.
        max_len: Length the encoded text is padded or truncated to. Defaults to 160.

    Returns:
        The predicted sentiment name, i.e. one of the keys of ``sentiment_map``.
    """
    model = model.eval()
    cleaned_text = clean_tweet(text)

    # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
    encoding_review = tokenizer(
        cleaned_text,
        add_special_tokens=True,
        # Honour the caller's max_len; previously the global MAX_LEN was used here and
        # the parameter was silently ignored.
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding_review['input_ids'].to(device)
    attention_mask = encoding_review['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

    # Invert sentiment_map (name -> id) to translate the predicted id back to its name.
    id_to_sentiment = {v: k for k, v in sentiment_map.items()}
    return id_to_sentiment[preds.item()]

In [None]:
# # === Guardar oficialmente modelo + tokenizer para despliegue ===
from pathlib import Path
from transformers import BertConfig, BertForSequenceClassification

# 1) Destination folder
MODEL_DIR = Path("sentiment_model_full_offline")
MODEL_DIR.mkdir(exist_ok=True)

# 2) Start from the config of the fine-tuned encoder. Work on a copy so the config of
#    the live training model is not mutated, and take the label information from the
#    notebook's own definitions instead of a hard-coded 3.
import copy

config = copy.deepcopy(model.bert.config)
config.num_labels = N_CLASSES
# Store the human-readable class names so the exported model reports
# 'negative' / 'neutral' / 'positive' rather than generic LABEL_<n> names.
config.id2label = {idx: name for name, idx in sentiment_map.items()}
config.label2id = dict(sentiment_map)

# 3) Build an empty container (randomly initialised weights); nothing is downloaded.
hf_model = BertForSequenceClassification(config)

# 4) Copy the trained weights into it.
hf_model.bert.load_state_dict(model.bert.state_dict())          # encoder
# copy_ writes into the existing parameters: shapes are checked and the tensors stay on
# hf_model's own device. Assigning `.data` rebinds them to tensors living on the training
# device, leaving a model whose parameters are spread over two devices.
with torch.no_grad():
    hf_model.classifier.weight.copy_(model.out.weight)          # classification layer
    hf_model.classifier.bias.copy_(model.out.bias)
hf_model.eval()

# 5) Save everything (weights + config) together with the tokenizer.
hf_model.save_pretrained(MODEL_DIR)     # writes config.json + the model weights file
tokenizer.save_pretrained(MODEL_DIR)    # writes the vocabulary and tokenizer config files

print("✅ Paquete listo en:", MODEL_DIR.resolve())


✅ Paquete listo en: /content/sentiment_model_full_offline


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Reload the exported folder through the Auto* classes to confirm it loads on its own.
_export_dir = "sentiment_model_full_offline"
tokenizer_test = AutoTokenizer.from_pretrained(_export_dir)
model_test = AutoModelForSequenceClassification.from_pretrained(_export_dir)
model_test = model_test.eval()

print("✔️  Se cargó sin re-entrenar")


✔️  Se cargó sin re-entrenar


In [None]:
# Check that the export folder is really there (recursive listing of the working directory)
!ls -R

.:
best_model_state.bin  sample_data  sentiment_model_full_offline

./sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md

./sentiment_model_full_offline:
config.json	   special_tokens_map.json  vocab.txt
model.safetensors  tokenizer_config.json


In [None]:
# Compress the export folder into a single .zip archive
!zip -r sentiment_model_full_offline.zip sentiment_model_full_offline

  adding: sentiment_model_full_offline/ (stored 0%)
  adding: sentiment_model_full_offline/config.json (deflated 51%)
  adding: sentiment_model_full_offline/model.safetensors (deflated 7%)
  adding: sentiment_model_full_offline/vocab.txt (deflated 49%)
  adding: sentiment_model_full_offline/tokenizer_config.json (deflated 75%)
  adding: sentiment_model_full_offline/special_tokens_map.json (deflated 42%)


In [None]:
# Download the .zip archive to the local computer (uses the google.colab helper module)
from google.colab import files
files.download('sentiment_model_full_offline.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# # Ejemplo de uso:
# text_to_classify = "Just took a flight with United Airlines."
# predicted_sentiment = classify_sentiment(text_to_classify, model, tokenizer, device)
# print(f"The feeling of the text is: {predicted_sentiment}")

In [None]:
# # Ejemplo de uso:
# import pandas as pd

# # Cargar el nuevo dataset
# new_df = pd.read_csv("/content/jetblue.csv")

# # Asegurarnos de que la columna 'clean_title' existe y limpiarla
# if 'clean_title' in new_df.columns:
#     print("Limpiando la columna 'clean_title' del nuevo dataset...")
#     new_df['clean_title'] = new_df['clean_title'].apply(clean_tweet)
# else:
#     print("Error: La columna 'clean_title' no se encontró en el nuevo dataset.")
#     # Puedes manejar este error de otra manera si es necesario, por ejemplo, detener la ejecución o usar otra columna.
#     # Por ahora, simplemente salimos si la columna no existe.
#     exit()

# # Aplicar el modelo para predecir el sentimiento de cada entrada
# print("Clasificando sentimientos en el nuevo dataset...")
# new_df['predicted_sentiment'] = new_df['clean_title'].apply(
#     lambda x: classify_sentiment(x, model, tokenizer, device, MAX_LEN)
# )

# # Calcular el porcentaje de cada sentimiento, incluyendo los que tienen 0%
# total_count = new_df.shape[0]
# if total_count > 0:
#     sentiment_counts = new_df['predicted_sentiment'].value_counts().reindex(['negative', 'neutral', 'positive'], fill_value=0)
#     print("\nPorcentaje de sentimientos en la columna 'clean_title':")
#     for sentiment, count in sentiment_counts.items():
#         percentage = (count / total_count) * 100
#         print(f"  {sentiment}: {percentage:.2f}%")
# else:
#     print("\nEl nuevo dataset está vacío, no se pueden calcular porcentajes.")


# # Opcional: Mostrar las primeras filas con las predicciones
# display(new_df.head())