In [1]:
# Celda 1: Instalación de dependencias
!pip install transformers
!pip install tensorflow
!pip install scikit-learn
!pip install pandas numpy matplotlib seaborn



In [2]:
# Celda 2: Importar librerías necesarias
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Cell 3: Mount Google Drive so the dataset stored there is reachable from the notebook
from google.colab import drive
drive.mount(mountpoint='/content/drive')

In [6]:
# Cell 4: Define the SimpleBERTClassifier class
class SimpleBERTClassifier:
    """Text classifier made of a pretrained BERT encoder plus a small softmax head.

    Intended call order: ``build_model()``, ``prepare_data(...)``, ``train(...)``.
    """

    def __init__(self, max_length=128, num_classes=3, model_name='bert-base-uncased'):
        """Load the tokenizer and store the model configuration.

        Args:
            max_length: Number of tokens every text is padded/truncated to.
            num_classes: Number of units in the softmax output layer.
            model_name: Pretrained checkpoint name used for both the tokenizer
                and the encoder, so the two always match.
        """
        self.max_length = max_length
        self.num_classes = num_classes
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = None  # populated by build_model()

    def build_model(self):
        """Build and compile the Keras model, storing it in ``self.model``."""
        # Pretrained BERT encoder
        bert = TFBertModel.from_pretrained(self.model_name)

        # Inputs: token ids and attention mask, both of fixed length max_length
        input_ids = tf.keras.layers.Input(shape=(self.max_length,), dtype=tf.int32, name='input_ids')
        attention_mask = tf.keras.layers.Input(shape=(self.max_length,), dtype=tf.int32, name='attention_mask')

        # First element of the BERT output, pooled over the sequence axis below
        sequence_output = bert(input_ids, attention_mask=attention_mask)[0]

        # Average pooling followed by dropout and the classification layer.
        # NOTE(review): the attention mask is not passed to the pooling layer, so
        # padded positions presumably take part in the average - confirm whether
        # a masked mean is wanted.
        pooled_output = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
        dropout = tf.keras.layers.Dropout(0.1)(pooled_output)
        output = tf.keras.layers.Dense(self.num_classes, activation='softmax')(dropout)

        # Create and compile the model; categorical cross-entropy expects one-hot labels
        self.model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )

    def prepare_data(self, texts, labels, test_size=0.2):
        """Tokenize the texts and split them into stratified train/test sets.

        Args:
            texts: Iterable of texts (a ``pd.Series`` is accepted); every item
                is converted with ``str()`` and stripped.
            labels: One-hot label matrix with one row per text; the class used
                for stratification is taken with ``argmax`` along axis 1.
            test_size: Fraction of the samples held out for the test set.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where each ``X_*`` is a dict
            with ``'input_ids'`` and ``'attention_mask'`` tensors and each
            ``y_*`` holds the label rows gathered for that split.

        Raises:
            ValueError: If ``texts`` is empty or its length differs from
                ``labels``.
        """
        # Make sure the texts are a list of strings
        if isinstance(texts, pd.Series):
            texts = texts.tolist()

        # Normalize every entry to a stripped string
        texts = [str(text).strip() for text in texts]

        # Fail early with a clear message; this also guarantees that texts[0]
        # exists for the diagnostic output in the except handler below.
        if not texts:
            raise ValueError("No se recibieron textos para preparar los datos.")
        if len(texts) != len(labels):
            raise ValueError(
                f"El número de textos ({len(texts)}) no coincide con el número de etiquetas ({len(labels)})."
            )

        try:
            # Tokenization: pad/truncate every text to max_length
            encodings = self.tokenizer(
                texts,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='tf'
            )

            X = {
                'input_ids': encodings['input_ids'],
                'attention_mask': encodings['attention_mask']
            }

            # Split sample indices (not the tensors) so the same split can be
            # applied to the inputs and the labels
            train_idx, test_idx = train_test_split(
                range(len(texts)),
                test_size=test_size,
                random_state=42,
                stratify=np.argmax(labels, axis=1)  # stratify by class
            )

            X_train = {
                'input_ids': tf.gather(X['input_ids'], train_idx),
                'attention_mask': tf.gather(X['attention_mask'], train_idx)
            }
            X_test = {
                'input_ids': tf.gather(X['input_ids'], test_idx),
                'attention_mask': tf.gather(X['attention_mask'], test_idx)
            }

            y_train = tf.gather(labels, train_idx)
            y_test = tf.gather(labels, test_idx)

            return X_train, X_test, y_train, y_test

        except Exception as e:
            # Print some context, then re-raise the original error unchanged
            print(f"Error en la preparación de datos: {str(e)}")
            print(f"Ejemplo de texto problemático: {texts[0][:100]}")
            raise

    def train(self, X_train, y_train, X_test, y_test, epochs=3, batch_size=16):
        """Fit the model, validating on the test split after every epoch.

        Args:
            X_train, y_train: Training inputs and labels.
            X_test, y_test: Data passed to Keras as ``validation_data``.
            epochs: Number of passes over the training data.
            batch_size: Number of samples per gradient update.

        Returns:
            The history object returned by ``model.fit``.

        Raises:
            RuntimeError: If ``build_model()`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("El modelo no está construido; llama a build_model() antes de train().")

        history = self.model.fit(
            X_train,
            y_train,
            validation_data=(X_test, y_test),
            epochs=epochs,
            batch_size=batch_size,
            verbose=1
        )
        return history

In [7]:
# Cell 5: Load and prepare the data
# Adjust the path to match your Drive layout
data_file = "/content/drive/MyDrive/IronHack/Proyecto4/data/processed/reviews_preprocessed.csv"

# Upper bound on the number of reviews used for the initial experiments
SAMPLE_SIZE = 10000

# Load the data
df = pd.read_csv(data_file)

# Drop reviews without a rating: both comparisons in rating_to_sentiment are
# False for NaN, so such rows would otherwise be silently labelled 'positive'.
df = df.dropna(subset=['reviews.rating'])

# Limit the dataset for initial tests; capping at len(df) avoids the ValueError
# that sample() raises when asked for more rows than are available.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Clean the text column: replace missing values with '' and force string dtype
df['reviews.text_processed'] = df['reviews.text_processed'].fillna('').astype(str)

# Convert ratings to sentiments
def rating_to_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Ratings of 2 or less are 'negative', exactly 3 is 'neutral', and every
    other value is 'positive'.
    """
    if rating == 3:
        return 'neutral'
    return 'negative' if rating <= 2 else 'positive'

# Prepare the data
texts = df['reviews.text_processed'].tolist()  # plain list of strings
sentiments = df['reviews.rating'].apply(rating_to_sentiment)

# Show the class distribution
print("Distribución de clases:")
print(sentiments.value_counts())
print("\nPorcentajes:")
print(sentiments.value_counts(normalize=True) * 100)

# Sanity-check the texts
print("\nVerificación de datos:")
print(f"Número de textos: {len(texts)}")
print(f"Ejemplo de texto: {texts[0][:100]}...")
print(f"Tipo de datos de texts: {type(texts)}")

# Encode the labels as integers, then as one-hot rows
le = LabelEncoder()
encoded_sentiments = le.fit_transform(sentiments)
# Pin the one-hot width to the three sentiment classes: without num_classes the
# width is inferred from the largest encoded label, so a sample that happens to
# miss a class would yield fewer columns than the 3-unit softmax layer expects.
labels = tf.keras.utils.to_categorical(encoded_sentiments, num_classes=3)

# Check the dimensions
print("\nDimensiones de los datos:")
print(f"Número de etiquetas: {len(labels)}")
print(f"Forma de las etiquetas: {labels.shape}")

ValueError: Cannot set memory growth on device when virtual devices configured

In [None]:
# Cell 6: Train the model
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialization, dropout and batch shuffling are repeatable across runs
# (the sampling and the train/test split already use random_state=42).
# NOTE(review): some GPU ops may still be non-deterministic.
tf.keras.utils.set_random_seed(42)

# Initialize the classifier
classifier = SimpleBERTClassifier(max_length=128)

# Build the model
classifier.build_model()

# Tokenize and split the data
X_train, X_test, y_train, y_test = classifier.prepare_data(texts, labels)

# Train the model
history = classifier.train(
    X_train,
    y_train,
    X_test,
    y_test,
    epochs=3,
    batch_size=16
)

In [None]:
# Cell 7: Visualize results
# One panel per metric, each showing the training and the validation curve.
plt.figure(figsize=(12, 4))

panels = (('loss', 'Loss'), ('accuracy', 'Accuracy'))
for position, (metric, label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric], label=f'Training {label}')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
    plt.title(f'Model {label}')
    plt.xlabel('Epoch')
    plt.ylabel(label)
    plt.legend()

plt.tight_layout()
plt.show()