# Model Performance Comparison Analysis

Interactive comparison of the mBERT, XLM-RoBERTa (XLM-R), and RemBERT models on sentiment analysis and polarization detection.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Plot styling: use seaborn's "whitegrid" theme and make 12x8 the default
# figure size for figures that do not pass their own figsize.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("✓ Libraries loaded successfully")

## 1. Load Data

In [None]:
# Model performance data.
# Every list is ordered like 'Model': index 0 = mBERT, 1 = XLM-R, 2 = RemBERT.
# 'sent_*' columns are the sentiment metrics, 'pol_*' the polarization metrics.
training_mins = [112, 150, 571]  # total training time, in minutes

data = {
    'Model': ['mBERT', 'XLM-R', 'RemBERT'],
    'sent_acc': [0.981633, 0.963776, 0.975420],
    'sent_prec': [0.983863, 0.970607, 0.978235],
    'sent_rec': [0.986341, 0.973066, 0.980156],
    'sent_f1': [0.985077, 0.971761, 0.979142],
    'pol_acc': [0.845918, 0.870408, 0.858210],
    'pol_prec': [0.820192, 0.840429, 0.832145],
    'pol_rec': [0.893018, 0.902585, 0.897850],
    'pol_f1': [0.846428, 0.866122, 0.857320],
    # NOTE(review): for mBERT and XLM-R this equals mean(sent_f1, pol_f1), but
    # RemBERT's 0.917231 differs from its mean (0.918231) by exactly 0.001 --
    # confirm the value against the original evaluation output.
    'macro_f1_avg': [0.915752, 0.918941, 0.917231],
    # NOTE(review): the unit of 'runtime' is not stated anywhere in this
    # notebook -- confirm before reporting it.
    'runtime': [4.2996, 4.423, 16.8042],
    # Human-readable duration derived from training_mins so the two columns
    # cannot drift apart. The previous hand-typed values ('1.9h 52m', ...)
    # mixed decimal hours with the leftover minutes.
    'training_time': [f'{mins // 60}h {mins % 60}m' for mins in training_mins],
    'training_mins': training_mins,  # in minutes
}

df = pd.DataFrame(data)
df

## 2. Summary Statistics

In [None]:
# Headline table: the three F1 columns plus the training time, with the
# F1 columns shaded using the 'RdYlGn' colormap.
f1_cols = ['sent_f1', 'pol_f1', 'macro_f1_avg']
display_cols = ['Model', *f1_cols, 'training_time']
summary_styler = df[display_cols].style
# background_gradient returns the Styler, so leaving the call as the last
# expression of the cell renders the shaded table.
summary_styler.background_gradient(subset=f1_cols, cmap='RdYlGn')

## 3. Sentiment Analysis Performance

In [None]:
# 2x2 grid of bar charts: one panel per sentiment metric, one bar per model.
metrics = ['sent_acc', 'sent_prec', 'sent_rec', 'sent_f1']
titles = ['Sentiment Accuracy', 'Sentiment Precision', 'Sentiment Recall', 'Sentiment F1']
colors = ['#3498db', '#e74c3c', '#2ecc71']  # one color per model; later cells reuse this list

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Sentiment Analysis Metrics Comparison', fontsize=16, fontweight='bold')

# axes.flat walks the grid row by row, so panels fill in metric order.
for ax, metric, title in zip(axes.flat, metrics, titles):
    bars = ax.bar(df['Model'], df[metric], color=colors)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel('Score', fontweight='bold')
    ax.set_ylim([0.94, 1.0])  # narrow y-range so the gaps between models are visible
    ax.grid(axis='y', alpha=0.3)

    # Write each score, to four decimals, just above its bar.
    for bar in bars:
        height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        ax.text(bar_center, height, f'{height:.4f}',
                ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

## 4. Polarization Detection Performance

In [None]:
# 2x3 grid of bar charts: four polarization metrics plus the macro F1
# average; the sixth panel stays empty and is hidden below.
metrics = ['pol_acc', 'pol_prec', 'pol_rec', 'pol_f1', 'macro_f1_avg']
titles = ['Polarization Accuracy', 'Polarization Precision', 'Polarization Recall',
          'Polarization F1', 'Macro F1 Average']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Polarization Detection Metrics Comparison', fontsize=16, fontweight='bold')

# axes.flat walks the grid row by row; zip stops after the five metrics.
for ax, metric, title in zip(axes.flat, metrics, titles):
    bars = ax.bar(df['Model'], df[metric], color=colors)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel('Score', fontweight='bold')
    ax.set_ylim([0.80, 1.0])
    ax.grid(axis='y', alpha=0.3)

    # Write each score, to four decimals, just above its bar.
    for bar in bars:
        height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width()/2.
        ax.text(bar_center, height, f'{height:.4f}',
                ha='center', va='bottom', fontsize=10)

# Hide the unused bottom-right panel.
axes[-1, -1].axis('off')
plt.tight_layout()
plt.show()

## 5. Overall Performance Radar Chart

In [None]:
# Radar (polar) chart: one closed polygon per model over five metrics.
categories = ['Sentiment F1', 'Polarization F1', 'Sent Accuracy', 'Pol Accuracy', 'Macro F1']
# DataFrame columns plotted on each spoke, in the same order as `categories`.
radar_cols = ['sent_f1', 'pol_f1', 'sent_acc', 'pol_acc', 'macro_f1_avg']

N = len(categories)
# One evenly spaced angle per category; repeating the first angle at the
# end closes the polygon when the line is drawn.
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles.append(angles[0])

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={'projection': 'polar'})

ax.set_theta_offset(np.pi / 2)  # put the first category at the top
ax.set_theta_direction(-1)      # lay the categories out clockwise
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=11)
ax.set_ylim(0.80, 1.0)

for idx, row in df.iterrows():
    model_color = colors[idx]
    values = [row[col] for col in radar_cols]
    values.append(values[0])  # close the polygon, matching `angles`

    ax.plot(angles, values, 'o-', linewidth=2, label=row['Model'], color=model_color)
    ax.fill(angles, values, alpha=0.15, color=model_color)

ax.set_title('Overall Model Performance Comparison', fontsize=14, fontweight='bold', pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=12)
ax.grid(True)

plt.tight_layout()
plt.show()

## 6. Training Efficiency Analysis

In [None]:
# Scatter of macro F1 against training time: one labelled point per model.
fig, ax = plt.subplots(figsize=(12, 8))

for idx, row in df.iterrows():
    model_name = row['Model']
    point = (row['training_mins'], row['macro_f1_avg'])

    ax.scatter(point[0], point[1], s=500,
               label=model_name, alpha=0.6, color=colors[idx])
    # Put the model name next to its marker, offset by 10 points each way.
    ax.annotate(model_name, point,
                fontsize=12, fontweight='bold',
                xytext=(10, 10), textcoords='offset points')

ax.set_title('Training Efficiency: Performance vs Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Training Time (minutes)', fontsize=12, fontweight='bold')
ax.set_ylabel('Macro F1 Average', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.legend(fontsize=11)

plt.tight_layout()
plt.show()

## 7. Metrics Heatmap

In [None]:
# Heatmap of every score column: metrics as rows, models as columns.
heatmap_cols = ['sent_acc', 'sent_prec', 'sent_rec', 'sent_f1',
                'pol_acc', 'pol_prec', 'pol_rec', 'pol_f1', 'macro_f1_avg']
heatmap_data = df.set_index('Model')[heatmap_cols]

fig, ax = plt.subplots(figsize=(12, 6))

# The frame is transposed so that each row of the plot is one metric.
sns.heatmap(
    heatmap_data.T,
    annot=True,
    fmt='.4f',
    cmap='RdYlGn',
    center=0.9,
    vmin=0.80,
    vmax=1.0,
    ax=ax,
    cbar_kws={'label': 'Score'},
)

ax.set_title('Performance Metrics Heatmap', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Metrics', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 8. Best Model Per Metric

In [None]:
# For each headline metric, record which model scores highest and its score.
metrics_to_check = ['sent_acc', 'sent_f1', 'pol_acc', 'pol_f1', 'macro_f1_avg']
best_models = {}

for metric in metrics_to_check:
    top_row = df[metric].idxmax()  # index label of the highest-scoring model
    best_models[metric] = {
        'model': df.at[top_row, 'Model'],
        'score': df.at[top_row, metric],
    }

# The dict is keyed by metric, so transpose to get one row per metric, then
# give the two columns display-friendly names.
best_df = pd.DataFrame(best_models).T.rename(
    columns={'model': 'Best Model', 'score': 'Score'}
)
best_df

## 9. Conclusion

### Key Findings:

1. **XLM-R** achieves the best overall performance
   - Macro F1: **91.89%**
   - Polarization F1: **86.61%**
   - Moderate training time: **2.5 hours**
   - Best balance of performance and efficiency

2. **mBERT** excels in sentiment analysis
   - Sentiment F1: **98.51%**
   - Fastest training time: **1.9 hours**
   - Strong overall performance: 91.58% macro F1

3. **RemBERT** ranks between mBERT and XLM-R on every reported metric
   - Sentiment F1: **97.91%**
   - Polarization F1: **85.73%**
   - Macro F1: **91.72%**
   - Longest training time (9.5 hours) without top performance