In [None]:
# Cell 1: Setup
"""
Model Evaluation and Visualization
Analyze and visualize model performance
"""
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.metrics import confusion_matrix, classification_report

# Cell 2: Load Results
# Traditional ML results: the first CSV column becomes the row index, and the
# remaining columns hold the metrics that later cells select by name
# (accuracy, precision, recall, f1, auc, training_time).
traditional_results = pd.read_csv("../data/models/model_comparison.csv", index_col=0)
display(traditional_results)

# Deep learning results.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file
# decodes the same way on every platform instead of depending on the locale
# default that open() would otherwise use.
with open("../data/models/deep_learning_results.json", encoding="utf-8") as f:
    dl_results = json.load(f)
# Transpose so the top-level JSON keys form the row index and the nested keys
# become columns (assumes the file is a mapping of per-model metric mappings;
# later cells select the 'accuracy' and 'auc' columns from dl_df).
dl_df = pd.DataFrame(dl_results).T
display(dl_df)

# Cell 3: Model Performance Comparison
# Stack the traditional and deep learning tables into a single frame.
# The deep learning table's 'auc' column is relabelled 'f1' so both tables
# share one score column; the second panel is therefore titled "F1/AUC".
traditional_part = traditional_results[['accuracy', 'f1', 'training_time']]
deep_learning_part = dl_df[['accuracy', 'auc']].rename(columns={'auc': 'f1'})
all_models = pd.concat([traditional_part, deep_learning_part])

# Two side-by-side horizontal bar charts, one per score column.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column to plot, bar colour, panel title, x-axis label) for each panel,
# listed left to right.
panel_specs = [
    ('accuracy', 'skyblue', 'Model Accuracy Comparison', 'Accuracy'),
    ('f1', 'lightcoral', 'Model F1/AUC Score Comparison', 'F1/AUC Score'),
]

for panel_ax, (metric, bar_color, panel_title, axis_label) in zip(axes, panel_specs):
    # Ascending sort places the highest-scoring model at the top of the chart.
    ranked_scores = all_models[metric].sort_values(ascending=True)
    ranked_scores.plot(kind='barh', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(axis_label)

plt.tight_layout()
plt.show()

# Cell 4: Traditional ML Detailed Metrics
# Annotated heatmap of the traditional-model scores. The table is transposed
# so each metric is a heatmap row and each entry of the results index is a
# heatmap column; every cell shows its value to three decimals.
plt.figure(figsize=(10, 8))

heatmap_metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']
metrics_by_model = traditional_results[heatmap_metrics].T

sns.heatmap(
    metrics_by_model,
    annot=True,
    fmt='.3f',
    cmap='YlOrRd',
    cbar_kws=dict(label='Score'),
)

plt.title('Traditional ML Models - Detailed Metrics')
plt.xlabel('Model')
plt.ylabel('Metric')
plt.show()

# Cell 5: Training Time Analysis
# Vertical bar chart of training time per traditional model, ordered from the
# smallest to the largest value.
plt.figure(figsize=(10, 6))

sorted_times = traditional_results['training_time'].sort_values()
sorted_times.plot(kind='bar', color='green', alpha=0.7)

plt.title('Model Training Time Comparison')
plt.xlabel('Model')
plt.ylabel('Training Time (seconds)')
# Tilt the tick labels so longer index labels do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Cell 6: Best Model Analysis
# Rank the traditional models by F1 (highest first) and keep the top row.
# best_model is a Series whose .name is that row's index label; it is read
# again by the summary cell at the end of the notebook.
ranked_by_f1 = traditional_results.sort_values('f1', ascending=False)
best_model = ranked_by_f1.iloc[0]

print(f"Best Model: {best_model.name}")
print(f"F1 Score: {best_model['f1']:.4f}")
print(f"Accuracy: {best_model['accuracy']:.4f}")
print(f"Training Time: {best_model['training_time']:.2f} seconds")

# Cell 7: Model Selection Recommendation
# Radar chart comparing the three traditional models with the highest F1 on
# five axes, all drawn on a fixed 0-1 radial scale.
categories = ['Accuracy', 'F1 Score', 'Precision', 'Recall', 'Speed']
# Result columns plotted on each axis, in the same order as `categories`.
radar_columns = ['accuracy', 'f1', 'precision', 'recall', 'speed']

# Convert training time into a "speed" score: 1 - time / largest time, so the
# slowest model scores 0 and shorter training times score closer to 1.
# NOTE: this adds a 'speed' column to traditional_results itself.
max_time = traditional_results['training_time'].max()
if max_time > 0:
    traditional_results['speed'] = 1 - (traditional_results['training_time'] / max_time)
else:
    # No positive training time to normalise by (all zero, or missing):
    # dividing would fill the column with NaN and break the chart, so every
    # model gets the score the slowest model would normally receive.
    traditional_results['speed'] = 0.0

# Select top 3 models (after 'speed' exists, so each selected row carries it).
top_models = traditional_results.nlargest(3, 'f1')

# One angle per category, evenly spaced around the circle; the first angle is
# repeated at the end so each plotted outline closes on itself.
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(projection='polar'))

for model_name, row in top_models.iterrows():
    values = [row[column] for column in radar_columns]
    # Repeat the first value to match the repeated closing angle.
    values += values[:1]

    ax.plot(angles, values, 'o-', linewidth=2, label=model_name)
    ax.fill(angles, values, alpha=0.25)

# Put the first category at the top and lay the rest out clockwise.
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
# Label only the unique angles (the last one duplicates the first).
ax.set_thetagrids(np.degrees(angles[:-1]), categories)
ax.set_ylim(0, 1)
ax.set_title('Top 3 Models - Multi-Metric Comparison', y=1.08)
# Anchor the legend outside the plot area so it does not cover the chart.
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
ax.grid(True)

plt.tight_layout()
plt.show()

# Cell 8: Generate LaTeX Table for Report
# Render the selected result columns, rounded to four decimals, as a LaTeX
# table with a caption, a cross-reference label and a float position.
report_columns = ['accuracy', 'precision', 'recall', 'f1', 'training_time']
report_frame = traditional_results[report_columns].round(4)

latex_table = report_frame.to_latex(
    caption="Sentiment Analysis Model Performance Comparison",
    label="tab:model_comparison",
    position="htbp",
)

print("LaTeX Table for Report:")
print(latex_table)

# Cell 9: Scalability Analysis
# Line plot of training time against dataset fraction for four models.
# The timings below are hard-coded literals, not values computed or loaded
# in this notebook.
data_sizes = [0.01, 0.05, 0.1, 0.5, 1.0]  # Fraction of data

# Model name -> training times in seconds, one entry per value in data_sizes.
model_times = {
    'Naive Bayes':         [0.5, 1.2, 2.1, 8.5, 15.2],
    'Logistic Regression': [0.8, 2.5, 4.8, 18.2, 35.1],
    'Random Forest':       [2.1, 8.5, 15.2, 65.3, 128.5],
    'Gradient Boosting':   [3.5, 12.8, 25.1, 98.7, 195.3],
}

plt.figure(figsize=(10, 6))

# One marked line per model, labelled for the legend.
for model, times in model_times.items():
    plt.plot(data_sizes, times, marker='o', label=model)

plt.title('Model Scalability Analysis')
plt.xlabel('Data Size (fraction of 1.6M tweets)')
plt.ylabel('Training Time (seconds)')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

# Cell 10: Summary and Recommendations
# Text summary of the evaluation. Relies on best_model from the best-model
# cell and on traditional_results; the recommendations at the end are fixed
# text and are not derived from the loaded results.
banner = "=" * 60
print(banner)
print("MODEL EVALUATION SUMMARY")
print(banner)

# 1. Model with the highest F1 score.
print("\n1. Best Overall Model:", best_model.name)
print(f"   - F1 Score: {best_model['f1']:.4f}")
print(f"   - Accuracy: {best_model['accuracy']:.4f}")
print(f"   - Training Time: {best_model['training_time']:.2f}s")

# 2. Model with the smallest training time.
fastest_model = traditional_results.nsmallest(1, 'training_time').index[0]
print("\n2. Fastest Model:", fastest_model)
print(f"   - Training Time: {traditional_results['training_time'].min():.2f}s")

# 3. Model with the highest accuracy.
most_accurate_model = traditional_results.nlargest(1, 'accuracy').index[0]
print("\n3. Most Accurate Model:", most_accurate_model)
print(f"   - Accuracy: {traditional_results['accuracy'].max():.4f}")

# 4. Static guidance, printed one line at a time.
print("\n4. Recommendations:")
recommendations = (
    "   - For production: Use Random Forest or Gradient Boosting",
    "   - For real-time: Use Naive Bayes or Logistic Regression",
    "   - For best accuracy: Consider ensemble methods",
    "   - For large-scale: Implement distributed deep learning",
)
for recommendation in recommendations:
    print(recommendation)