In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
import logging
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Configure notebook-wide logging: every record carries a timestamp, the
# emitting logger's name, the severity level and the message text.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the class and functions defined below.
logger = logging.getLogger(__name__)


In [2]:

class BalancedSentimentClassifier:
    """
    Train and evaluate sentiment classifiers on class-balanced text data.

    Every model is an imblearn pipeline made of three steps: TF-IDF
    vectorisation, SMOTE oversampling, and a final classifier.

    Attributes:
        random_state: Seed shared by SMOTE, the stochastic classifiers and
            the train/test split, so a full run is reproducible.
        max_features: Vocabulary size cap passed to every TfidfVectorizer.
        models: Mapping of model name -> pipeline (fitted in place by
            train_and_evaluate).
        trained_models: Mapping of model name -> fitted pipeline, filled by
            train_and_evaluate.
    """

    def __init__(self, random_state: int = 42, max_features: int = 10000):
        """
        Build the pipelines for every supported model.

        Args:
            random_state (int): Seed used for SMOTE, LinearSVC,
                RandomForestClassifier and the train/test split.
            max_features (int): Maximum TF-IDF vocabulary size.
        """
        self.random_state = random_state
        self.max_features = max_features

        # Step names ('smote' vs 'sampling') are kept exactly as before because
        # they are part of the persisted pipelines' `named_steps` interface.
        self.models = {
            'naive_bayes_balanced': self._build_pipeline(
                'smote', MultinomialNB()
            ),
            'logistic_regression_balanced': self._build_pipeline(
                'smote', LogisticRegression(max_iter=1000)
            ),
            # LinearSVC and RandomForestClassifier use randomness internally;
            # seeding them makes repeated runs produce the same model.
            'svm_balanced': self._build_pipeline(
                'smote', LinearSVC(max_iter=1000, random_state=random_state)
            ),
            'random_forest_combined': self._build_pipeline(
                'sampling',
                RandomForestClassifier(
                    n_estimators=100,
                    class_weight='balanced',
                    random_state=random_state,
                ),
            ),
        }

        self.trained_models = {}

    def _build_pipeline(self, sampler_step: str, classifier):
        """Return a TF-IDF -> SMOTE -> classifier pipeline.

        Args:
            sampler_step (str): Name given to the SMOTE step in the pipeline.
            classifier: Unfitted estimator used as the final 'clf' step.
        """
        return ImbPipeline([
            ('tfidf', TfidfVectorizer(max_features=self.max_features)),
            (sampler_step, SMOTE(random_state=self.random_state)),
            ('clf', classifier),
        ])

    def prepare_data(self, data_file: str):
        """
        Load the preprocessed reviews and split them into train/test sets.

        Missing review texts are replaced by empty strings; rows without a
        sentiment label are dropped. The split is 80/20 and stratified on the
        label.

        Args:
            data_file (str): Path to the preprocessed CSV. It must contain the
                columns 'reviews.text_processed' and 'sentiment'.

        Returns:
            tuple: X_train, X_test, y_train, y_test
        """
        logger.info("Cargando y preparando datos...")

        df = pd.read_csv(data_file)

        # Report nulls before cleaning so data-quality issues stay visible.
        null_counts = df[['reviews.text_processed', 'sentiment']].isnull().sum()
        logger.info(f"Valores nulos antes de limpieza:\n{null_counts}")

        # Empty text is still a valid (if uninformative) document for TF-IDF,
        # whereas a missing label cannot be used for supervised training.
        df['reviews.text_processed'] = df['reviews.text_processed'].fillna('')
        df = df.dropna(subset=['sentiment'])

        logger.info(f"Registros después de limpieza: {len(df)}")

        X = df['reviews.text_processed']
        y = df['sentiment']

        class_distribution = y.value_counts()
        logger.info(f"\nDistribución de clases original:\n{class_distribution}")

        # Stratify so the test set keeps the original class proportions.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state, stratify=y
        )

        logger.info(f"Datos divididos - Train: {len(X_train)}, Test: {len(X_test)}")
        return X_train, X_test, y_train, y_test

    def train_and_evaluate(self, X_train, X_test, y_train, y_test, output_dir: str):
        """
        Fit every pipeline, evaluate it on the test split and persist artefacts.

        For each model this writes '<name>_model.joblib' and
        '<name>_confusion_matrix.png' into output_dir.

        Args:
            X_train, X_test, y_train, y_test: Train and test data.
            output_dir (str): Directory for models and plots; created if it
                does not exist.

        Returns:
            dict: model name -> {'classification_report': str,
                'confusion_matrix': ndarray, 'labels': ndarray}, where
                'labels' gives the class for each matrix row/column.
        """
        results = {}

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        for name, model in self.models.items():
            logger.info(f"\nEntrenando modelo balanceado: {name}")

            model.fit(X_train, y_train)
            self.trained_models[name] = model

            y_pred = model.predict(X_test)

            # Sorted union of true and predicted classes: the same ordering
            # confusion_matrix uses by default, made explicit so the plot can
            # be annotated with the actual class names.
            labels = np.union1d(y_test, y_pred)

            results[name] = {
                'classification_report': classification_report(y_test, y_pred),
                'confusion_matrix': confusion_matrix(y_test, y_pred, labels=labels),
                'labels': labels,
            }

            model_file = output_path / f"{name}_model.joblib"
            joblib.dump(model, model_file)

            logger.info(f"\nResultados para {name}:")
            logger.info("\nClassification Report:")
            logger.info(f"\n{results[name]['classification_report']}")

            self._plot_confusion_matrix(
                results[name]['confusion_matrix'],
                name,
                output_path / f"{name}_confusion_matrix.png",
                labels=labels,
            )

        return results

    def _plot_confusion_matrix(self, cm, model_name: str, output_file: str, labels=None):
        """
        Render the confusion matrix as a heatmap and save it to disk.

        Args:
            cm: Confusion matrix (rows = true labels, columns = predictions).
            model_name (str): Model name shown in the plot title.
            output_file: Destination of the image (str or Path), passed to
                savefig.
            labels: Optional class names, in matrix order, used as tick
                labels. When omitted, seaborn's default ticks are used.
        """
        tick_labels = 'auto' if labels is None else list(labels)

        fig, ax = plt.subplots(figsize=(10, 8))
        try:
            sns.heatmap(
                cm,
                annot=True,
                fmt='d',
                cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels,
                ax=ax,
            )
            ax.set_title(f'Matriz de Confusión - {model_name}')
            ax.set_ylabel('True Label')
            ax.set_xlabel('Predicted Label')
            fig.savefig(output_file)
        finally:
            # Always release the figure, even if saving fails, so figures do
            # not accumulate across the training loop.
            plt.close(fig)


In [4]:

def main(data_file: str = "../data/processed/reviews_preprocessed.csv",
         output_dir: str = "../models/balanced"):
    """
    Run the full balanced-model training workflow.

    Args:
        data_file (str): Path to the preprocessed reviews CSV.
        output_dir (str): Directory where models and plots are written.

    Returns:
        dict: Per-model evaluation results as returned by
            BalancedSentimentClassifier.train_and_evaluate, so a notebook
            cell can inspect them after the run.

    Raises:
        Exception: Any error raised while loading data or training is logged
            and then re-raised unchanged.
    """
    logger.info("Iniciando entrenamiento de modelos balanceados...")

    try:
        classifier = BalancedSentimentClassifier()

        X_train, X_test, y_train, y_test = classifier.prepare_data(data_file)

        results = classifier.train_and_evaluate(
            X_train, X_test, y_train, y_test, output_dir
        )

        logger.info("\nEntrenamiento de modelos balanceados completado exitosamente.")

    except Exception as e:
        # Log for the record, then propagate so the notebook cell still fails.
        logger.error(f"Error durante el entrenamiento: {e}")
        raise

    return results

if __name__ == "__main__":
    main()

2025-01-22 13:13:33,682 - __main__ - INFO - Iniciando entrenamiento de modelos balanceados...
2025-01-22 13:13:33,683 - __main__ - INFO - Cargando y preparando datos...
2025-01-22 13:13:33,953 - __main__ - INFO - Valores nulos antes de limpieza:
reviews.text_processed    26
sentiment                  0
dtype: int64
2025-01-22 13:13:33,972 - __main__ - INFO - Registros después de limpieza: 67992
2025-01-22 13:13:33,974 - __main__ - INFO - 
Distribución de clases original:
sentiment
positive    62547
neutral      2902
negative     2543
Name: count, dtype: int64
2025-01-22 13:13:33,999 - __main__ - INFO - Datos divididos - Train: 54393, Test: 13599
2025-01-22 13:13:34,004 - __main__ - INFO - 
Entrenando modelo balanceado: naive_bayes_balanced
2025-01-22 13:13:35,007 - __main__ - INFO - 
Resultados para naive_bayes_balanced:
2025-01-22 13:13:35,007 - __main__ - INFO - 
Classification Report:
2025-01-22 13:13:35,008 - __main__ - INFO - 
              precision    recall  f1-score   support
