# Test classifier

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import time
import warnings
warnings.filterwarnings('ignore')

# Load data.
# The `data` directory normally sits beside the notebook's working directory
# (../data). A later cell changes the working directory to that parent, so
# when this cell is re-run afterwards the data is found under ./data instead.
# Probe both locations, preferring the original one.
_cwd = os.getcwd()
_data_candidates = [
    os.path.join(os.path.dirname(_cwd), 'data'),
    os.path.join(_cwd, 'data'),
]
data_folder = next(
    (folder for folder in _data_candidates
     if os.path.isfile(os.path.join(folder, 'train.csv'))),
    _data_candidates[0],  # neither exists: fail on the original path as before
)
data_train = pd.read_csv(os.path.join(data_folder, 'train.csv'))
print(f"Dataset shape: {data_train.shape}")
print(f"Label distribution:\n{data_train['label'].value_counts()}")
# Last expression of the cell: rendered by the notebook as a preview table.
data_train.head()

## Setup and Import Classification Module

In [None]:
# Cell 2: move to the parent directory (presumably the project root, so that
# the `src` package below resolves) and import the project modules.
print("Current working directory:", os.getcwd())
# Only step up when `src` is not already visible from here: an unconditional
# chdir climbs one more level every time this cell is re-run, after which the
# `src.*` imports below can no longer be resolved.
if os.path.isdir(os.path.join(os.getcwd(), 'src')):
    print("Working directory already contains 'src'; leaving it unchanged")
else:
    os.chdir(os.path.dirname(os.getcwd()))
    print("Changed working directory to:", os.getcwd())

from src.classification import SklearnSentimentClassifier, KerasSentimentClassifier
from src.embedding import TextEmbedder
from src.preprocessing import Preprocessing
from imblearn.combine import SMOTETomek

print("Modules imported successfully")

## Data Preparation


In [None]:
# Use a larger subset for classification testing, capped at the number of
# available rows: DataFrame.sample raises ValueError when n exceeds the
# population size and sampling is done without replacement.
subset_size = min(2000, len(data_train))
sample_data = data_train.sample(n=subset_size, random_state=42)

# Stratified 80/20 split so both parts keep the label proportions of the sample.
X_train, X_test, y_train, y_test = train_test_split(
    sample_data['text'], sample_data['label'],
    test_size=0.2, stratify=sample_data['label'], random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training label distribution:\n{pd.Series(y_train).value_counts()}")

## Generate Embeddings for Classification

In [None]:
# Embed the texts with the project's TextEmbedder, configured for the
# `all-MiniLM-L6-v2` sentence-transformers model (per the original note, the
# best performer in an earlier analysis).
print("Generating embeddings...")

# Materialise the text columns once; the embedder is given plain lists.
train_texts = X_train.tolist()
test_texts = X_test.tolist()

embedder = TextEmbedder(method='transformer', model_name='sentence-transformers/all-MiniLM-L6-v2')
# Fit on the training texts only, then transform both splits.
embedder.fit(train_texts)

X_train_embeddings = embedder.transform(train_texts)
X_test_embeddings = embedder.transform(test_texts)

print(f"Embedding shape: {X_train_embeddings.shape}")
print(f"Embedding dimension: {X_train_embeddings.shape[1]}")

### Balance Dataset


In [None]:
# Rebalance the training embeddings with SMOTE-Tomek. Only the training split
# is resampled; the test embeddings are left as they are.
resampler = SMOTETomek(random_state=42)
balanced_pair = resampler.fit_resample(X_train_embeddings, y_train)
X_train_balanced, y_train_balanced = balanced_pair

print(f"Original training set size: {len(X_train_embeddings)}")
print(f"Balanced training set size: {len(X_train_balanced)}")
print(f"Balanced label distribution:\n{pd.Series(y_train_balanced).value_counts()}")

## Performance Testing


In [None]:
# Sklearn classifiers to benchmark: (display name, model_type key, constructor kwargs).
_SKLEARN_SPECS = [
    ('Logistic Regression', 'logistic_regression', {'C': 1.0, 'max_iter': 1000}),
    ('Random Forest', 'random_forest', {'n_estimators': 100, 'max_depth': 10}),
    ('SVM (RBF)', 'svm', {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}),
    ('SVM (Linear)', 'svm', {'C': 1.0, 'kernel': 'linear'}),
    ('Naive Bayes', 'naive_bayes', {}),
]

sklearn_configs = [
    {'name': display_name, 'model_type': model_type, 'params': params}
    for display_name, model_type, params in _SKLEARN_SPECS
]

# Keras networks to benchmark: (display name, hidden layer sizes, dropout, learning rate).
_KERAS_SPECS = [
    ('Neural Network (Small)', [64, 32], 0.3, 0.001),
    ('Neural Network (Medium)', [128, 64, 32], 0.4, 0.001),
    ('Neural Network (Large)', [256, 128, 64, 32], 0.5, 0.0001),
]

keras_configs = [
    {
        'name': display_name,
        'params': {
            'hidden_layers': layers,
            'dropout_rate': dropout,
            'learning_rate': rate,
        },
    }
    for display_name, layers, dropout, rate in _KERAS_SPECS
]

print(f"Configured {len(sklearn_configs)} Sklearn classifiers")
print(f"Configured {len(keras_configs)} Keras classifiers")

### Sklearn Classifiers

In [None]:
# Train, time and evaluate every configured sklearn classifier.
# `sklearn_results` collects one row per configuration (metrics on success,
# 'Method' + 'Error' on failure); `sklearn_models` keeps the fitted
# classifiers and their test predictions for later cells.
sklearn_results = []
sklearn_models = {}

print(" Testing Sklearn Classifiers...")
print("=" * 50)

for config in sklearn_configs:
    print(f"\nTesting {config['name']}...")

    try:
        # Initialize classifier
        classifier = SklearnSentimentClassifier(
            model_type=config['model_type'],
            **config['params']
        )

        # Prepare labels (done per classifier, through the classifier's own methods)
        y_train_encoded = classifier.prepare_labels(pd.Series(y_train_balanced))
        y_test_encoded = classifier.transform_labels(pd.Series(y_test))

        # Time training. perf_counter is a high-resolution clock intended for
        # measuring short durations; time.time() is the adjustable wall clock.
        start_time = time.perf_counter()
        classifier.train(X_train_balanced, y_train_encoded)
        train_time = time.perf_counter() - start_time

        # Time prediction
        start_time = time.perf_counter()
        predictions = classifier.predict(X_test_embeddings)
        predict_time = time.perf_counter() - start_time

        # Evaluate on the held-out test embeddings
        results = classifier.evaluate(X_test_embeddings, y_test_encoded)

        # Cross-validate.
        # NOTE(review): this runs on the resampled training set, so the folds
        # include SMOTE-generated samples and the CV score may be optimistic
        # compared with the test accuracy - confirm this is intended.
        cv_results = classifier.cross_validate(X_train_balanced, y_train_encoded, cv=5)

        # Guard the throughput figure: a zero elapsed time would otherwise
        # raise ZeroDivisionError and mark a working model as failed.
        if predict_time > 0:
            predictions_per_sec = len(X_test_embeddings) / predict_time
        else:
            predictions_per_sec = float('inf')

        # Store results
        sklearn_results.append({
            'Method': config['name'],
            'Model Type': config['model_type'],
            'Accuracy': results['accuracy'],
            'CV Mean': cv_results['mean_score'],
            'CV Std': cv_results['std_score'],
            'Train Time (s)': train_time,
            'Predict Time (s)': predict_time,
            'Total Time (s)': train_time + predict_time,
            'Predictions/sec': predictions_per_sec
        })

        # Store model for later analysis
        sklearn_models[config['name']] = {
            'classifier': classifier,
            'predictions': predictions,
            'results': results
        }

        print(f"    Success - Accuracy: {results['accuracy']:.3f}, CV: {cv_results['mean_score']:.3f} ± {cv_results['std_score']:.3f}")

    except Exception as e:
        # Deliberately broad: one failing configuration must not abort the
        # whole benchmark; the error text is kept in the results table.
        print(f"    Failed: {str(e)}")
        sklearn_results.append({
            'Method': config['name'],
            'Error': str(e)
        })

# Create results DataFrame
sklearn_df = pd.DataFrame(sklearn_results)
print("\n📊 Sklearn Classifier Results:")
print(sklearn_df.round(4))

### Keras Classifiers

In [None]:
# Train, time and evaluate every configured Keras network.
# `keras_results` collects one row per configuration (metrics on success,
# 'Method' + 'Error' on failure); `keras_models` keeps the trained
# classifiers, their test predictions and training histories for later cells.
keras_results = []
keras_models = {}

print("\n Testing Keras Neural Networks...")
print("=" * 50)

for config in keras_configs:
    print(f"\nTesting {config['name']}...")

    try:
        # Initialize classifier
        classifier = KerasSentimentClassifier(**config['params'])

        # Prepare labels (done per classifier, through the classifier's own methods)
        y_train_encoded = classifier.prepare_labels(pd.Series(y_train_balanced))
        y_test_encoded = classifier.transform_labels(pd.Series(y_test))

        # Hold out 20% of the balanced training data for validation
        X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
            X_train_balanced, y_train_encoded, test_size=0.2, random_state=42
        )

        # Time training. perf_counter is a high-resolution clock intended for
        # measuring durations; time.time() is the adjustable wall clock.
        start_time = time.perf_counter()
        history = classifier.train(
            X_train_split, y_train_split,
            X_val=X_val_split, y_val=y_val_split,
            epochs=50, batch_size=32, verbose=0
        )
        train_time = time.perf_counter() - start_time

        # Time prediction
        start_time = time.perf_counter()
        predictions = classifier.predict(X_test_embeddings)
        predict_time = time.perf_counter() - start_time

        # Evaluate on the held-out test embeddings
        results = classifier.evaluate(X_test_embeddings, y_test_encoded)

        # Store results
        keras_results.append({
            'Method': config['name'],
            'Hidden Layers': str(config['params']['hidden_layers']),
            'Accuracy': results['accuracy'],
            'Test Loss': results['test_loss'],
            'Train Time (s)': train_time,
            'Predict Time (s)': predict_time,
            'Total Time (s)': train_time + predict_time,
            # Length of the recorded loss curve = number of epochs actually run
            'Epochs Trained': len(history.history['loss']),
            'Parameters': classifier.model.count_params()
        })

        # Store model for later analysis
        keras_models[config['name']] = {
            'classifier': classifier,
            'predictions': predictions,
            'results': results,
            'history': history
        }

        print(f"     Success - Accuracy: {results['accuracy']:.3f}, Loss: {results['test_loss']:.3f}")

    except Exception as e:
        # Deliberately broad: one failing configuration must not abort the
        # whole benchmark; the error text is kept in the results table.
        print(f"     Failed: {str(e)}")
        keras_results.append({
            'Method': config['name'],
            'Error': str(e)
        })

# Create results DataFrame
keras_df = pd.DataFrame(keras_results)
print("\n Keras Classifier Results:")
print(keras_df.round(4))

In [None]:
# Last expression of the cell: the notebook renders the sklearn results table.
sklearn_df

## Performance Visualization with Additional Metrics

In [None]:
# Show the structure of both result tables before any plotting is attempted.
print(" Debugging DataFrame structures:")
print("\nSklearn DataFrame columns:", sklearn_df.columns.tolist())
print("Sklearn DataFrame shape:", sklearn_df.shape)
print("\nKeras DataFrame columns:", keras_df.columns.tolist())
print("Keras DataFrame shape:", keras_df.shape)


# Keep only the rows that carry an accuracy value. Failed runs are stored with
# just 'Method' and 'Error', so the 'Accuracy' column is missing entirely when
# every model of a family failed; fall back to an empty frame in that case.
if 'Accuracy' in sklearn_df.columns:
    successful_sklearn = sklearn_df.dropna(subset=['Accuracy'])
else:
    successful_sklearn = pd.DataFrame()

if 'Accuracy' in keras_df.columns:
    successful_keras = keras_df.dropna(subset=['Accuracy'])
else:
    successful_keras = pd.DataFrame()

print(f"\n Successful models found:")
print(f"   • Sklearn models: {len(successful_sklearn)}")
print(f"   • Keras models: {len(successful_keras)}")

def _annotate_above(ax, xy, text):
    """Write `text` centred 3 points above the data coordinate `xy` on `ax`."""
    ax.annotate(text,
                xy=xy,
                xytext=(0, 3),  # 3 points vertical offset
                textcoords="offset points",
                ha='center', va='bottom', fontsize=10)


def _label_bar_tops(ax, bars, texts):
    """Annotate the top of each bar in `bars` with the matching entry of `texts`."""
    for bar, text in zip(bars, texts):
        _annotate_above(ax, (bar.get_x() + bar.get_width() / 2, bar.get_height()), text)


def _show_placeholder(ax, message, title):
    """Fill a panel that has no data with a centred message and its usual title."""
    ax.text(0.5, 0.5, message,
            ha='center', va='center', transform=ax.transAxes, fontsize=14)
    ax.set_title(title, fontsize=14, fontweight='bold')


def _format_param_tick(x, p):
    """Tick formatter: whole thousands with a 'K' suffix from 1000 upwards."""
    return f'{int(x/1000)}K' if x >= 1000 else f'{int(x)}'


has_sklearn = len(successful_sklearn) > 0
has_keras = len(successful_keras) > 0

if not (has_sklearn or has_keras):
    # Nothing to plot: report what the result tables do contain instead.
    print(" No successful models to visualize")
    print("Please check your model configurations and training process")

    print("\n🔍 Debugging Information:")
    for results_frame, heading in ((sklearn_df, "Sklearn DataFrame contains:"),
                                   (keras_df, "Keras DataFrame contains:")):
        if len(results_frame) > 0:
            print(heading)
            for col in results_frame.columns:
                non_null_count = results_frame[col].count()
                print(f"  • {col}: {non_null_count} non-null values")

    # Error messages recorded by the training loops, if any
    for results_frame, heading in ((sklearn_df, "\nSklearn Errors:"),
                                   (keras_df, "\nKeras Errors:")):
        if 'Error' in results_frame.columns:
            errors = results_frame['Error'].dropna()
            if len(errors) > 0:
                print(heading)
                for idx, error in errors.items():
                    print(f"  • Row {idx}: {error}")

else:
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

    # --- Panel 1: accuracy of every successful model (sklearn first, then keras)
    all_methods, all_accuracies, all_colors = [], [], []
    for frame, bar_colour in ((successful_sklearn, 'lightcoral'),
                              (successful_keras, 'lightblue')):
        if len(frame) > 0:
            all_methods.extend(frame['Method'])
            all_accuracies.extend(frame['Accuracy'])
            all_colors.extend([bar_colour] * len(frame))

    bar_positions = range(len(all_methods))
    accuracy_title = 'Classification Accuracy Comparison'
    if all_methods:
        bars1 = ax1.bar(bar_positions, all_accuracies, color=all_colors)
        ax1.set_title(accuracy_title, fontsize=14, fontweight='bold')
        ax1.set_ylabel('Accuracy', fontsize=12)
        ax1.set_xticks(bar_positions)
        ax1.set_xticklabels(all_methods, rotation=45, ha='right')
        ax1.grid(True, alpha=0.3)
        ax1.set_ylim(0, 1.0)
        _label_bar_tops(ax1, bars1, (f'{acc:.3f}' for acc in all_accuracies))
    else:
        _show_placeholder(ax1, 'No Models\nAvailable', accuracy_title)

    # --- Panel 2: training time, same model order and colours as panel 1
    all_train_times = []
    for frame in (successful_sklearn, successful_keras):
        if len(frame) > 0 and 'Train Time (s)' in frame.columns:
            all_train_times.extend(frame['Train Time (s)'])

    training_title = 'Training Time Comparison'
    if all_train_times:
        bars2 = ax2.bar(bar_positions, all_train_times, color=all_colors)
        ax2.set_title(training_title, fontsize=14, fontweight='bold')
        ax2.set_ylabel('Time (seconds)', fontsize=12)
        ax2.set_xticks(bar_positions)
        ax2.set_xticklabels(all_methods, rotation=45, ha='right')
        ax2.grid(True, alpha=0.3)

        # Switch to a log axis when slowest and fastest differ by more than 10x
        slowest, quickest = max(all_train_times), min(all_train_times)
        time_ratio = slowest / quickest if quickest > 0 else 1
        if time_ratio > 10:
            ax2.set_yscale('log')
            ax2.set_ylabel('Time (seconds) - Log Scale', fontsize=12)

        _label_bar_tops(ax2, bars2, (f'{time_val:.2f}s' for time_val in all_train_times))
    else:
        _show_placeholder(ax2, 'No Training Time\nData Available', training_title)

    # --- Panel 3: cross-validation mean with std error bars (sklearn only)
    cv_title = 'Cross-Validation Scores (Sklearn Models)'
    if has_sklearn and 'CV Mean' in successful_sklearn.columns:
        has_cv_std = 'CV Std' in successful_sklearn.columns
        bars3 = ax3.bar(successful_sklearn['Method'], successful_sklearn['CV Mean'],
                        yerr=successful_sklearn['CV Std'] if has_cv_std else None,
                        color='lightcoral', alpha=0.7,
                        capsize=5, error_kw={'linewidth': 2, 'markeredgewidth': 2})
        ax3.set_title(cv_title, fontsize=14, fontweight='bold')
        ax3.set_ylabel('CV Mean Accuracy', fontsize=12)
        ax3.tick_params(axis='x', rotation=45)
        ax3.grid(True, alpha=0.3)
        ax3.set_ylim(0, 1.0)

        # Labels sit above the error bar when a std is available, else on the bar
        if has_cv_std:
            for i, (mean_val, std_val) in enumerate(zip(successful_sklearn['CV Mean'], successful_sklearn['CV Std'])):
                _annotate_above(ax3, (i, mean_val + std_val), f'{mean_val:.3f}±{std_val:.3f}')
        else:
            for i, mean_val in enumerate(successful_sklearn['CV Mean']):
                _annotate_above(ax3, (i, mean_val), f'{mean_val:.3f}')
    else:
        _show_placeholder(ax3, 'No Sklearn Models\nAvailable', cv_title)

    # --- Panel 4: parameter count of each neural network (keras only)
    complexity_title = 'Model Complexity (Neural Networks)'
    if has_keras and 'Parameters' in successful_keras.columns:
        bars4 = ax4.bar(successful_keras['Method'], successful_keras['Parameters'], color='lightblue')
        ax4.set_title(complexity_title, fontsize=14, fontweight='bold')
        ax4.set_ylabel('Number of Parameters', fontsize=12)
        ax4.tick_params(axis='x', rotation=45)
        ax4.grid(True, alpha=0.3)

        ax4.yaxis.set_major_formatter(plt.FuncFormatter(_format_param_tick))

        _label_bar_tops(
            ax4, bars4,
            (f'{param_count/1000:.1f}K' if param_count >= 1000 else f'{param_count}'
             for param_count in successful_keras['Parameters']),
        )
    else:
        _show_placeholder(ax4, 'No Keras Models\nAvailable', complexity_title)

    # A colour legend is only needed when both model families are on the charts
    if has_sklearn and has_keras:
        legend_elements = [
            plt.Rectangle((0,0),1,1, facecolor=face, label=caption)
            for face, caption in (('lightcoral', 'Sklearn Models'),
                                  ('lightblue', 'Keras Models'))
        ]
        fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, 0.95), ncol=2)

    plt.suptitle('Classification Performance Analysis', fontsize=16, fontweight='bold', y=0.98)
    plt.tight_layout()
    # Applied after tight_layout to leave headroom for the suptitle and legend
    plt.subplots_adjust(top=0.92)
    plt.show()

    # Print summary statistics
    print("\n VISUALIZATION INSIGHTS:")
    print("=" * 40)

    if all_accuracies:
        best_acc_idx = np.argmax(all_accuracies)
        fastest_idx = np.argmin(all_train_times) if all_train_times else 0

        print(f" Best Accuracy: {all_methods[best_acc_idx]} ({all_accuracies[best_acc_idx]:.4f})")
        if all_train_times:
            print(f" Fastest Training: {all_methods[fastest_idx]} ({all_train_times[fastest_idx]:.2f}s)")
        print(f" Average Accuracy: {np.mean(all_accuracies):.4f}")
        if all_train_times:
            print(f"  Average Training Time: {np.mean(all_train_times):.2f}s")

        if has_sklearn:
            sklearn_avg = successful_sklearn['Accuracy'].mean()
            print(f" Sklearn Average: {sklearn_avg:.4f}")

        if has_keras:
            keras_avg = successful_keras['Accuracy'].mean()
            print(f" Keras Average: {keras_avg:.4f}")
    else:
        print("  No accuracy data available to analyze")

In [None]:
def plot_confusion_matrices(models_dict, model_type="Sklearn", y_true=None):
    """Plot one confusion-matrix heatmap per model on a grid up to three wide.

    Args:
        models_dict: Mapping of model name to a dict holding ``'predictions'``
            (predicted labels) and ``'results'`` (a dict whose ``'accuracy'``
            entry is shown in the panel title).
        model_type: Label used in the figure title only.
        y_true: True labels to compare the predictions against. When omitted,
            the notebook-level ``y_test_encoded`` is used, which is what this
            function always read before.
            NOTE(review): that global is reassigned by every training loop,
            so pass ``y_true`` explicitly when plotting models whose labels
            were encoded by an earlier loop.

    Returns:
        None. Nothing is drawn when ``models_dict`` is empty.
    """
    n_models = len(models_dict)
    if n_models == 0:
        return

    if y_true is None:
        y_true = y_test_encoded

    cols = min(3, n_models)
    rows = (n_models + cols - 1) // cols  # ceiling division

    # squeeze=False always yields a 2-D axes array, so no special-casing of
    # single-panel or single-row grids is needed.
    fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 4*rows), squeeze=False)
    # Row-major flattening: panel idx lands at (idx // cols, idx % cols).
    flat_axes = axes.ravel()

    for ax, (name, model_data) in zip(flat_axes, models_dict.items()):
        cm = confusion_matrix(y_true, model_data['predictions'])

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'{name}\nAccuracy: {model_data["results"]["accuracy"]:.3f}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

    # Hide the unused panels of the last grid row
    for ax in flat_axes[n_models:]:
        ax.set_visible(False)

    plt.suptitle(f'{model_type} Models - Confusion Matrices', fontsize=16)
    plt.tight_layout()
    plt.show()

## Feature Importance Analysis

### Sklearn Models

In [None]:
def _plot_top_importance(name, top_indices, top_importance):
    """Draw a horizontal bar chart of the selected importances for one model."""
    bar_positions = range(len(top_importance))
    plt.figure(figsize=(10, 6))
    plt.barh(bar_positions, top_importance)
    plt.title(f'Top 20 Feature Importance - {name}')
    plt.xlabel('Importance')
    plt.ylabel('Feature Index')
    # Tick labels are the original feature indices, not the bar positions
    plt.yticks(bar_positions, top_indices)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


def _print_importance_stats(name, importance):
    """Print max, mean and standard deviation of one model's importances."""
    print(f"\n{name}:")
    print(f"  Max importance: {np.max(importance):.6f}")
    print(f"  Mean importance: {np.mean(importance):.6f}")
    print(f"  Std importance: {np.std(importance):.6f}")


def analyze_feature_importance(sklearn_models):
    """Chart and summarise feature importance for each stored model.

    For every entry of `sklearn_models` (name -> dict with a 'classifier'),
    the classifier's `get_feature_importance()` is queried. A ValueError
    raised while handling a model is reported as a one-line message and the
    loop moves on to the next model.
    """
    print(" Feature Importance Analysis")
    print("=" * 40)

    for name, model_data in sklearn_models.items():
        classifier = model_data['classifier']

        try:
            importance = classifier.get_feature_importance()

            # argsort is ascending, so the last 20 positions hold the largest values
            top_indices = np.argsort(importance)[-20:]
            _plot_top_importance(name, top_indices, importance[top_indices])
            _print_importance_stats(name, importance)

        except ValueError as e:
            print(f"\n{name}: {e}")

# Run the analysis only when at least one sklearn model was stored.
if sklearn_models:
    analyze_feature_importance(sklearn_models)

### Neural Network

In [None]:
def plot_training_histories(keras_models):
    """Plot loss and accuracy curves, one row of two panels per network.

    `keras_models` maps a model name to a dict whose 'history' entry exposes
    a `.history` mapping of metric name to per-epoch values. Validation
    curves are added when 'val_loss' / 'val_accuracy' are present. Nothing
    is drawn for an empty mapping.
    """
    if not keras_models:
        return

    n_models = len(keras_models)
    # squeeze=False keeps a 2-D (n_models, 2) axes array even for one model
    fig, axes = plt.subplots(n_models, 2, figsize=(15, 5*n_models), squeeze=False)

    for (loss_ax, acc_ax), (name, model_data) in zip(axes, keras_models.items()):
        history = model_data['history']

        # Left panel: loss
        loss_ax.plot(history.history['loss'], label='Train Loss', color='blue')
        if 'val_loss' in history.history:
            loss_ax.plot(history.history['val_loss'], label='Val Loss', color='red')
        loss_ax.set_title(f'{name} - Loss')
        loss_ax.set_xlabel('Epoch')
        loss_ax.set_ylabel('Loss')
        loss_ax.legend()
        loss_ax.grid(True, alpha=0.3)

        # Right panel: accuracy
        acc_ax.plot(history.history['accuracy'], label='Train Accuracy', color='blue')
        if 'val_accuracy' in history.history:
            acc_ax.plot(history.history['val_accuracy'], label='Val Accuracy', color='red')
        acc_ax.set_title(f'{name} - Accuracy')
        acc_ax.set_xlabel('Epoch')
        acc_ax.set_ylabel('Accuracy')
        acc_ax.legend()
        acc_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Plot the curves only when at least one Keras model was stored.
if keras_models:
    plot_training_histories(keras_models)

## Summary and Recommendations

In [None]:
print("\n CLASSIFICATION ANALYSIS SUMMARY")
print("=" * 60)

# Merge the successful rows of both families into one flat list of records.
# Iterating an empty frame yields nothing, so no explicit emptiness check is
# needed before each pass.
all_results = [
    {
        'Method': row['Method'],
        'Type': 'Sklearn',
        'Accuracy': row['Accuracy'],
        'Train Time': row['Train Time (s)'],
        'CV Score': row.get('CV Mean', 0),
    }
    for _, row in successful_sklearn.iterrows()
]
all_results.extend(
    {
        'Method': row['Method'],
        'Type': 'Keras',
        'Accuracy': row['Accuracy'],
        'Train Time': row['Train Time (s)'],
        'CV Score': 0,  # Keras doesn't have CV in this analysis
    }
    for _, row in successful_keras.iterrows()
)

if not all_results:
    print("No successful models to analyze.")
else:
    results_df = pd.DataFrame(all_results)

    # Rows with the highest accuracy and the shortest training time
    best_accuracy = results_df.loc[results_df['Accuracy'].idxmax()]
    fastest_training = results_df.loc[results_df['Train Time'].idxmin()]

    print(f"\n Performance Insights:")
    print(f"   • Best Accuracy: {best_accuracy['Method']} ({best_accuracy['Accuracy']:.4f})")
    print(f"   • Fastest Training: {fastest_training['Method']} ({fastest_training['Train Time']:.2f}s)")

    sklearn_count = len(successful_sklearn)
    keras_count = len(successful_keras)

    print(f"\n Model Statistics:")
    print(f"   • Successful Sklearn models: {sklearn_count}")
    print(f"   • Successful Keras models: {keras_count}")
    print(f"   • Dataset size: {subset_size} samples")
    print(f"   • Embedding dimension: {X_train_embeddings.shape[1]}")

    print(f"\n Key Insights:")
    if sklearn_count > 0:
        avg_sklearn_acc = successful_sklearn['Accuracy'].mean()
        print(f"   • Average Sklearn accuracy: {avg_sklearn_acc:.4f}")

    if keras_count > 0:
        avg_keras_acc = successful_keras['Accuracy'].mean()
        print(f"   • Average Keras accuracy: {avg_keras_acc:.4f}")

    print(f"\n Recommendations:")
    print(f"   • For speed: Use {fastest_training['Method']}")
    print(f"   • For accuracy: Use {best_accuracy['Method']}")

print("\n" + "=" * 60)