In [None]:
# Celda 1: Instalación de dependencias
!pip install transformers
!pip install tensorflow
!pip install seaborn

In [None]:
# Celda 2: Importaciones
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import logging
import json
from google.colab import drive

# Make the Google Drive contents available under /content/drive (Colab only).
drive.mount('/content/drive')

# Root logging setup: INFO and above, with timestamp, logger name and level.
_log_settings = {
    'level': logging.INFO,
    'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**_log_settings)

# Logger used by the cells below.
logger = logging.getLogger(__name__)


In [None]:
class SimpleBERTClassifier:
    """Three-class sentiment classifier built on a pretrained BERT encoder.

    The class bundles the tokenizer, the Keras model definition and the
    preparation of a small, class-balanced dataset read from a CSV file.

    Attributes:
        max_length: Number of tokens every text is padded/truncated to.
        tokenizer: ``BertTokenizer`` loaded from ``PRETRAINED_NAME``.
        model: Compiled Keras model, or ``None`` until ``build_model`` runs.
    """

    # Order matters: position i is the integer id, the one-hot column and the
    # softmax output unit of the i-th sentiment.
    SENTIMENTS = ('negative', 'neutral', 'positive')
    PRETRAINED_NAME = 'bert-base-uncased'
    TEXT_COLUMN = 'reviews.text_processed'
    LABEL_COLUMN = 'sentiment'
    # Seed shared by class sampling, shuffling and the train/validation split.
    RANDOM_STATE = 42

    def __init__(self, max_length=64):
        """Load the tokenizer; the model itself is built lazily.

        Args:
            max_length: Token length used for padding/truncation and for the
                model's input layers.
        """
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(self.PRETRAINED_NAME)
        self.model = None

    def build_model(self):
        """Build and compile the BERT classification model.

        The network feeds ``input_ids`` and ``attention_mask`` through BERT,
        takes the vector at sequence position 0 and passes it through
        dropout -> Dense(256, relu) -> dropout -> softmax over the sentiments.

        Returns:
            The compiled ``tf.keras.Model``; it is also stored in ``self.model``.
        """
        # Input layers; the names match the keys of the dicts passed to fit().
        input_ids = tf.keras.layers.Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = tf.keras.layers.Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        # BERT encoder, fine-tuned together with the classification head.
        bert = TFBertModel.from_pretrained(self.PRETRAINED_NAME, trainable=True)
        bert_outputs = bert(input_ids, attention_mask=attention_mask)[0]

        # Use the [CLS] position (index 0 of the sequence axis) as the
        # sentence representation, then add the classification head.
        cls_token = bert_outputs[:, 0, :]
        dropout = tf.keras.layers.Dropout(0.1)(cls_token)
        dense1 = tf.keras.layers.Dense(256, activation='relu')(dropout)
        dropout2 = tf.keras.layers.Dropout(0.1)(dense1)
        outputs = tf.keras.layers.Dense(len(self.SENTIMENTS), activation='softmax')(dropout2)

        self.model = tf.keras.Model(
            inputs={'input_ids': input_ids, 'attention_mask': attention_mask},
            outputs=outputs
        )

        # Low learning rate, as is usual when fine-tuning a pretrained encoder.
        optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
        self.model.compile(
            optimizer=optimizer,
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

        return self.model

    def _balanced_sample(self, df, samples_per_class):
        """Return up to ``samples_per_class`` shuffled rows for each sentiment."""
        parts = []
        for sentiment in self.SENTIMENTS:
            rows = df[df[self.LABEL_COLUMN] == sentiment]
            if rows.empty:
                # A class absent from the file simply contributes no rows.
                continue
            parts.append(
                rows.sample(
                    n=min(samples_per_class, len(rows)),
                    random_state=self.RANDOM_STATE
                )
            )
        if not parts:
            return df.iloc[0:0]
        # Concatenate once, then shuffle so classes are interleaved.
        combined = pd.concat(parts)
        return combined.sample(frac=1, random_state=self.RANDOM_STATE).reset_index(drop=True)

    @staticmethod
    def _stratified_split(label_ids, train_fraction=0.8, seed=42):
        """Split row positions into train/validation, class by class.

        Each class sends ``int(train_fraction * n_class)`` rows to training and
        the remainder to validation, so both parts keep the class proportions.
        The split is fully determined by ``seed``.
        """
        label_ids = np.asarray(label_ids)
        if label_ids.size == 0:
            empty = np.array([], dtype=np.int64)
            return empty, empty

        rng = np.random.RandomState(seed)
        train_parts, val_parts = [], []
        for class_id in np.unique(label_ids):
            members = rng.permutation(np.flatnonzero(label_ids == class_id))
            cut = int(train_fraction * len(members))
            train_parts.append(members[:cut])
            val_parts.append(members[cut:])

        # Shuffle again so batches are not ordered by class.
        train_indices = rng.permutation(np.concatenate(train_parts))
        val_indices = rng.permutation(np.concatenate(val_parts))
        return train_indices, val_indices

    def prepare_data(self, data_file, samples_per_class=30):
        """Load the CSV and return a small, balanced, tokenized train/val split.

        Args:
            data_file: Path (or file-like object) accepted by ``pd.read_csv``.
                The file must contain the columns ``reviews.text_processed``
                and ``sentiment``.
            samples_per_class: Maximum number of rows drawn per sentiment.

        Returns:
            A dict with the keys ``'train'`` and ``'val'``; each maps to a dict
            with ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
            ``labels`` is one-hot with one column per entry of ``SENTIMENTS``,
            even when a sentiment does not occur in the sample.

        Raises:
            ValueError: If a required column is missing, the sentiment column
                holds an unexpected value, or no rows are left to train on.
        """
        logger.info("Iniciando preparación de datos...")

        try:
            df = pd.read_csv(data_file)
            logger.info(f"Tamaño original del DataFrame: {len(df)}")

            # Check the required columns.
            required_columns = [self.TEXT_COLUMN, self.LABEL_COLUMN]
            if not all(col in df.columns for col in required_columns):
                raise ValueError(f"Faltan columnas requeridas: {required_columns}")

            # Reject any label outside the known sentiments (NaN included).
            if not df[self.LABEL_COLUMN].isin(list(self.SENTIMENTS)).all():
                raise ValueError("Valores inválidos en columna 'sentiment'")

            balanced_df = self._balanced_sample(df, samples_per_class)
            if balanced_df.empty:
                raise ValueError("No hay muestras disponibles para entrenar")

            # Cut very long texts before tokenizing; the tokenizer truncates to
            # max_length tokens anyway, this only bounds the work it has to do.
            texts = (
                balanced_df[self.TEXT_COLUMN]
                .fillna('')
                .astype(str)
                .str[:self.max_length * 4]
                .tolist()
            )

            # Fixed-width one-hot encoding: unlike get_dummies, the number of
            # columns does not depend on which classes happen to be sampled.
            sentiment_map = {name: idx for idx, name in enumerate(self.SENTIMENTS)}
            label_ids = balanced_df[self.LABEL_COLUMN].map(sentiment_map).to_numpy(dtype=np.int64)
            labels = np.eye(len(self.SENTIMENTS), dtype=np.float32)[label_ids]

            # Tokenize, logging tokenizer failures separately.
            try:
                encodings = self.tokenizer(
                    texts,
                    truncation=True,
                    padding='max_length',
                    max_length=self.max_length,
                    return_tensors='tf'
                )
            except Exception as e:
                logger.error(f"Error en tokenización: {str(e)}")
                raise

            input_ids = tf.convert_to_tensor(encodings['input_ids'], dtype=tf.int32)
            attention_mask = tf.convert_to_tensor(encodings['attention_mask'], dtype=tf.int32)
            labels = tf.convert_to_tensor(labels, dtype=tf.float32)

            # Reproducible 80/20 split that preserves the class balance.
            train_indices, val_indices = self._stratified_split(
                label_ids, train_fraction=0.8, seed=self.RANDOM_STATE
            )

            return {
                'train': {
                    'input_ids': tf.gather(input_ids, train_indices),
                    'attention_mask': tf.gather(attention_mask, train_indices),
                    'labels': tf.gather(labels, train_indices)
                },
                'val': {
                    'input_ids': tf.gather(input_ids, val_indices),
                    'attention_mask': tf.gather(attention_mask, val_indices),
                    'labels': tf.gather(labels, val_indices)
                }
            }

        except Exception as e:
            logger.error(f"Error en prepare_data: {str(e)}")
            raise

In [None]:
def main(
    data_file="/content/drive/MyDrive/IronHack/Proyecto4/nlp-project/data/processed/reviews_preprocessed.csv",
    samples_per_class=30,
    max_length=32,
    epochs=3,
    batch_size=8,
):
    """Prepare the data, build the BERT classifier and train it.

    Every argument defaults to the value the notebook originally hard-coded,
    so ``main()`` behaves as before; the parameters allow quick experiments
    without editing the function.

    Args:
        data_file: CSV file handed to ``SimpleBERTClassifier.prepare_data``.
        samples_per_class: Maximum rows sampled per sentiment class.
        max_length: Token length used by the tokenizer and the model inputs.
        epochs: Maximum number of training epochs.
        batch_size: Training batch size.

    Returns:
        ``(history, model)``: the object returned by ``model.fit`` and the
        trained model.

    Raises:
        Exception: Any error from data preparation, model construction or
            training is logged and re-raised unchanged.
    """
    try:
        classifier = SimpleBERTClassifier(max_length=max_length)

        logger.info("Preparando datos...")
        data = classifier.prepare_data(
            data_file,
            samples_per_class=samples_per_class
        )

        logger.info("Construyendo y entrenando modelo...")
        model = classifier.build_model()

        callbacks = [
            # Stop once val_loss has not improved for 2 epochs and roll back
            # to the best weights seen.
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=2,
                restore_best_weights=True
            ),
            # Keep the model with the best validation accuracy on disk.
            tf.keras.callbacks.ModelCheckpoint(
                'best_model.h5',
                monitor='val_accuracy',
                save_best_only=True
            )
        ]

        # The dict keys must match the names of the model's input layers.
        train_inputs = {
            'input_ids': data['train']['input_ids'],
            'attention_mask': data['train']['attention_mask']
        }
        val_inputs = {
            'input_ids': data['val']['input_ids'],
            'attention_mask': data['val']['attention_mask']
        }

        history = model.fit(
            train_inputs,
            data['train']['labels'],
            validation_data=(val_inputs, data['val']['labels']),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks
        )

        logger.info("Entrenamiento completado")
        return history, model

    except Exception as e:
        logger.error(f"Error durante el proceso: {str(e)}")
        raise

In [None]:
# Sanity-check the dataset before training
def preview_dataset(csv_path, text_column='reviews.text_processed',
                    label_column='sentiment', max_chars=100):
    """Print shape, label distribution and one text sample of a CSV dataset.

    Args:
        csv_path: Path of the CSV file to inspect.
        text_column: Column holding the review text.
        label_column: Column holding the sentiment label.
        max_chars: Number of characters of the sample text to print.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(csv_path)
    print("Dimensiones del DataFrame:", frame.shape)
    print("\nDistribución de sentiment:")
    print(frame[label_column].value_counts())
    print("\nMuestra de texto:")
    # Missing texts are read as NaN (a float), which cannot be sliced; show
    # the first text that is actually present, if any.
    texts = frame[text_column].dropna()
    if texts.empty:
        print("(sin texto disponible)")
    else:
        print(str(texts.iloc[0])[:max_chars])
    return frame


data_file = "/content/drive/MyDrive/IronHack/Proyecto4/nlp-project/data/processed/reviews_preprocessed.csv"
if Path(data_file).exists():
    df = preview_dataset(data_file)
else:
    # Say so explicitly instead of silently skipping the check.
    print(f"Archivo de datos no encontrado: {data_file}")

In [None]:
# Cell 6: run the training and keep its results.
# main() returns (history, model); binding them here makes the outcome of this
# expensive run available to later cells instead of throwing it away.
if __name__ == "__main__":
    history, model = main()

In [None]:
# Plot the training metrics
def plot_training_history(history):
    """Draw train/validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` attribute maps the metric names
            'accuracy', 'val_accuracy', 'loss' and 'val_loss' to per-epoch
            values (for example the object returned by ``model.fit``).
    """
    _, axes = plt.subplots(1, 2, figsize=(12, 4))

    # One entry per panel: (metric key, panel title, y-axis label).
    panels = (
        ('accuracy', 'Model accuracy', 'Accuracy'),
        ('loss', 'Model loss', 'Loss'),
    )

    for axis, (metric, title, y_label) in zip(axes, panels):
        axis.plot(history.history[metric])
        axis.plot(history.history['val_' + metric])
        axis.set_title(title)
        axis.set_ylabel(y_label)
        axis.set_xlabel('Epoch')
        axis.legend(['Train', 'Validation'], loc='upper left')

    plt.tight_layout()
    plt.show()

# After training: plot the learning curves.
# Fine-tuning BERT is expensive, so train here only when no `history` from a
# previous training run exists in the notebook namespace.
if "history" not in globals():
    history, model = main()
plot_training_history(history)