# LLM Screening Performance Dashboard

Interactive visualisation comparing model performance across evaluation runs.

- **Left Panel**: Model rankings sorted by F1 score
- **Right Panel**: Confusion matrix for selected model

In [1]:
# Install required packages
%pip install -q ipywidgets pandas matplotlib seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import ipywidgets as widgets
from IPython.display import display, clear_output

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

In [8]:
# Locate the evaluation data: prefer ./Data when it exists in the working
# directory, otherwise fall back to ../Data.
_cwd = Path.cwd()
_local_data_dir = _cwd / "Data"
if _local_data_dir.exists():
    DATA_DIR = _local_data_dir
else:
    DATA_DIR = _cwd.parent / "Data"
RESULTS_DIR = DATA_DIR / "results"

# Result CSV (inside RESULTS_DIR) for every model, grouped by evaluation run.
RUN_FILES = {
    'Run 1': {
        'Mistral Zero-Shot': 'eval_mistral_zero_shot_20260115_231802.csv',
        'Mistral CoT': 'eval_mistral_cot_20260116_003208.csv',
        'Llama 3.2 Zero-Shot': 'eval_llama3.2_zero_shot_20260115_193605.csv',
        'Llama 3.2 CoT': 'eval_llama3.2_cot_20260115_215209.csv',
    },
    'Run 2': {
        'Mistral Zero-Shot': 'eval_mistral_zero_shot_20260116_050656.csv',
        'Mistral CoT': 'eval_mistral_cot_20260116_073058.csv',
        'Llama 3.2 Zero-Shot': 'eval_llama3.2_zero_shot_20260116_025453.csv',
        'Llama 3.2 CoT': 'eval_llama3.2_cot_20260116_041136.csv',
    },
}

def load_eval_data(run_name, model_name):
    """Load one evaluation CSV and add a binary ``pred_binary`` column.

    Args:
        run_name: Key into ``RUN_FILES`` (e.g. ``'Run 1'``).
        model_name: Key into ``RUN_FILES[run_name]``.

    Returns:
        The DataFrame read from the CSV with a ``pred_binary`` column added.
        When the file has a ``label`` column but no ``true_label`` column,
        ``label`` is copied into ``true_label``.

    Raises:
        KeyError: If the run/model is unknown or the CSV has no
            ``prediction`` column.
    """
    filepath = RESULTS_DIR / RUN_FILES[run_name][model_name]
    df = pd.read_csv(filepath)

    if 'prediction' not in df.columns:
        raise KeyError(f"'prediction' column not found in {filepath}")

    # Handle column name differences between runs
    if 'label' in df.columns and 'true_label' not in df.columns:
        df['true_label'] = df['label']

    def convert_prediction(x):
        """Map one raw prediction (number, numeric text or label text) to an int."""
        if pd.isna(x):
            return 0  # Default to exclude for missing predictions
        if isinstance(x, (int, float, np.integer, np.floating, np.bool_)):
            return int(x)
        text = str(x).strip().lower()
        try:
            number = float(text)
        except ValueError:
            # Free-text answer: anything mentioning "include" counts as include.
            return 1 if 'include' in text else 0
        # A column mixing numbers and text is read as all-strings, so numeric
        # predictions arrive as '1' / '0' / '1.0' and must not fall through to
        # the substring test (which would score them all as exclude).
        return int(number) if np.isfinite(number) else 0

    df['pred_binary'] = df['prediction'].apply(convert_prediction)

    return df

def compute_metrics(run_name):
    """Score every model of one run and rank the results by F1.

    Args:
        run_name: Key into ``RUN_FILES`` selecting the evaluation run.

    Returns:
        DataFrame with one row per model and the columns 'Model', 'Accuracy',
        'Precision', 'Recall', 'F1 Score' and 'N' (row count of the model's
        result file), sorted by 'F1 Score' descending with a fresh 0..n-1 index.
    """
    def score(model_name):
        # One leaderboard row for a single model.
        frame = load_eval_data(run_name, model_name)
        truth = frame['true_label']
        predicted = frame['pred_binary']
        return {
            'Model': model_name,
            'Accuracy': accuracy_score(truth, predicted),
            'Precision': precision_score(truth, predicted, zero_division=0),
            'Recall': recall_score(truth, predicted, zero_division=0),
            'F1 Score': f1_score(truth, predicted, zero_division=0),
            'N': len(frame),
        }

    per_model = [score(name) for name in RUN_FILES[run_name]]
    leaderboard = pd.DataFrame(per_model)
    leaderboard = leaderboard.sort_values('F1 Score', ascending=False)
    return leaderboard.reset_index(drop=True)

# Pre-compute the metrics table of every run listed in RUN_FILES once, so the
# dashboard only has to look them up. Deriving the run names from RUN_FILES
# keeps this cell in sync when a run is added or renamed.
METRICS = {run_name: compute_metrics(run_name) for run_name in RUN_FILES}

print("Data loaded successfully!")
for run_name, model_files in RUN_FILES.items():
    print(f"{run_name} models: {list(model_files)}")

Data loaded successfully!
Run 1 models: ['Mistral Zero-Shot', 'Mistral CoT', 'Llama 3.2 Zero-Shot', 'Llama 3.2 CoT']
Run 2 models: ['Mistral Zero-Shot', 'Mistral CoT', 'Llama 3.2 Zero-Shot', 'Llama 3.2 CoT']


In [11]:
# Shared palette for the dashboard (titles, annotations, accents).
COLORS = dict(
    primary='#2E86AB',
    secondary='#A23B72',
    accent='#F18F01',
    success='#28a745',
    danger='#dc3545',
    light_bg='#f8f9fa',
    dark_text='#2c3e50',
    muted='#6c757d',
)

# Per-model colours: the zero-shot variants use the primary/secondary palette
# colours, the CoT variants a lighter tint of the same hue.
MODEL_COLORS = {
    'Mistral Zero-Shot': COLORS['primary'],
    'Mistral CoT': '#5BA3C6',
    'Llama 3.2 Zero-Shot': COLORS['secondary'],
    'Llama 3.2 CoT': '#C66B9A',
}

def create_ranking_chart(ax, metrics_df, run_name):
    """Draw a horizontal bar chart ranking models by F1 score.

    Args:
        ax: Matplotlib axes to draw on; it is cleared first.
        metrics_df: DataFrame with 'Model', 'F1 Score', 'Precision' and
            'Recall' columns, one row per model.
        run_name: Run label shown in the chart title.

    Returns:
        None. The chart is drawn on ``ax``.
    """
    ax.clear()

    # barh stacks upwards from y=0, so sort ascending to put the best model on top.
    ranked = metrics_df.sort_values('F1 Score', ascending=True)
    n_models = len(ranked)

    # Create bars
    y_pos = np.arange(n_models)
    colors = [MODEL_COLORS.get(m, COLORS['primary']) for m in ranked['Model']]

    bars = ax.barh(y_pos, ranked['F1 Score'], color=colors, edgecolor='white', linewidth=1.5, height=0.6)

    # Bars shorter than this cannot hold the white in-bar F1 label; the label
    # would end up left of the axis, white on a white background.
    min_inside_width = 0.15
    # Horizontal room reserved for an F1 label drawn outside a short bar.
    outside_label_gap = 0.14

    # Add value labels
    for bar, f1, prec, rec in zip(bars, ranked['F1 Score'], ranked['Precision'], ranked['Recall']):
        bar_end = bar.get_width()
        y_mid = bar.get_y() + bar.get_height() / 2

        if bar_end >= min_inside_width:
            # F1 score on the bar
            ax.text(bar_end - 0.02, y_mid,
                    f'{f1:.1%}', ha='right', va='center', fontweight='bold',
                    fontsize=14, color='white')
            detail_x = bar_end + 0.01
        else:
            # F1 score just right of the (too short) bar, in a readable colour
            ax.text(bar_end + 0.01, y_mid,
                    f'{f1:.1%}', ha='left', va='center', fontweight='bold',
                    fontsize=14, color=COLORS['dark_text'])
            detail_x = bar_end + 0.01 + outside_label_gap

        # Precision/Recall annotation
        ax.text(detail_x, y_mid,
                f'P: {prec:.1%} | R: {rec:.1%}', ha='left', va='center',
                fontsize=10, color=COLORS['muted'])

    # Styling
    ax.set_yticks(y_pos)
    ax.set_yticklabels(ranked['Model'], fontsize=12, fontweight='500')
    ax.set_xlim(0, 1.30)
    ax.set_xlabel('F1 Score', fontsize=12, fontweight='500', color=COLORS['dark_text'])
    ax.set_title(f'Model Rankings ({run_name})', fontsize=16, fontweight='bold',
                 color=COLORS['dark_text'], pad=15)

    # Rank badges on the right: the bar at y=0 is the worst model (rank n),
    # the top bar is rank 1.
    for i in range(n_models):
        rank = n_models - i
        ax.text(1.22, i, f'#{rank}', ha='center', va='center',
                fontsize=11, fontweight='bold', color=COLORS['accent'],
                bbox=dict(boxstyle='circle,pad=0.3', facecolor='white', edgecolor=COLORS['accent']))

    # Clean up spines
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)
    ax.tick_params(left=False)
    ax.set_facecolor('white')


def create_confusion_matrix(ax, run_name, model_name):
    """Draw an annotated 2x2 confusion matrix for one model of one run.

    Args:
        ax: Matplotlib axes to draw on; it is cleared first.
        run_name: Key into ``RUN_FILES`` / ``METRICS`` selecting the run.
        model_name: Model whose predictions are plotted.

    Returns:
        None. The heatmap, cell annotations and a one-line metrics summary
        are drawn on ``ax``.
    """
    ax.clear()

    # Load data and compute confusion matrix
    df = load_eval_data(run_name, model_name)
    y_true = df['true_label']
    y_pred = df['pred_binary']

    # Pin the label order so the matrix is always 2x2 (rows/cols: 0=Exclude,
    # 1=Include). Without it, data containing a single class yields a 1x1
    # matrix and the cell loop below fails with an IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Calculate percentages (all zero when there are no rows to count)
    total = cm.sum()
    if total:
        cm_pct = cm.astype('float') / total * 100
    else:
        cm_pct = np.zeros_like(cm, dtype=float)

    # Create custom colormap
    cmap = sns.color_palette("Blues", as_cmap=True)

    # Plot heatmap
    sns.heatmap(cm, annot=False, cmap=cmap, ax=ax, cbar=False,
                linewidths=3, linecolor='white', square=True)

    # Add custom annotations
    labels = [['TN', 'FP'], ['FN', 'TP']]
    for i in range(2):
        for j in range(2):
            count = cm[i, j]
            pct = cm_pct[i, j]
            label = labels[i][j]

            # White text on the darker cells (more than half the largest count)
            text_color = 'white' if count > cm.max() * 0.5 else COLORS['dark_text']

            ax.text(j + 0.5, i + 0.35, label, ha='center', va='center',
                   fontsize=14, fontweight='bold', color=text_color, alpha=0.7)
            ax.text(j + 0.5, i + 0.55, f'{count:,}', ha='center', va='center',
                   fontsize=20, fontweight='bold', color=text_color)
            ax.text(j + 0.5, i + 0.75, f'({pct:.1f}%)', ha='center', va='center',
                   fontsize=11, color=text_color, alpha=0.8)

    # Labels
    ax.set_xlabel('Predicted Label', fontsize=12, fontweight='500', color=COLORS['dark_text'], labelpad=10)
    ax.set_ylabel('True Label', fontsize=12, fontweight='500', color=COLORS['dark_text'], labelpad=10)
    ax.set_xticklabels(['Exclude', 'Include'], fontsize=11)
    ax.set_yticklabels(['Exclude', 'Include'], fontsize=11, rotation=0)

    # Title with model color
    model_color = MODEL_COLORS.get(model_name, COLORS['primary'])
    ax.set_title(f'Confusion Matrix\n{model_name}', fontsize=14, fontweight='bold',
                color=model_color, pad=15)

    # Add metrics summary below, taken from the pre-computed table for this run
    metrics = METRICS[run_name]
    row = metrics[metrics['Model'] == model_name].iloc[0]

    metrics_text = f"Accuracy: {row['Accuracy']:.1%}  |  Precision: {row['Precision']:.1%}  |  Recall: {row['Recall']:.1%}  |  F1: {row['F1 Score']:.1%}"
    # NOTE(review): x=1 with ha='center' centres the summary on the right edge
    # of the axes rather than under the matrix - confirm this is intended.
    ax.text(1, -0.18, metrics_text, ha='center', va='top', transform=ax.transAxes,
           fontsize=10, color=COLORS['muted'], style='italic')


print("Visualization functions defined!")

Visualization functions defined!


In [12]:
# Create interactive dashboard

# Widgets
def _labelled_dropdown(description, options, value):
    """Build a 250px-wide dropdown with a 100px-wide description label."""
    return widgets.Dropdown(
        options=options,
        value=value,
        description=description,
        style={'description_width': '100px'},
        layout=widgets.Layout(width='250px'),
    )


run_dropdown = _labelled_dropdown('Evaluation Run:', ['Run 1', 'Run 2'], 'Run 2')
model_dropdown = _labelled_dropdown(
    'Model:',
    list(RUN_FILES['Run 2'].keys()),
    'Mistral Zero-Shot',
)

# Output area that the dashboard figure is rendered into
output = widgets.Output()

def update_dashboard(run_name, model_name):
    """Redraw both dashboard panels inside the ``output`` widget.

    Args:
        run_name: Key into ``METRICS`` selecting the evaluation run.
        model_name: Model whose confusion matrix is shown on the right.

    Returns:
        None. The previous content of ``output`` is replaced by a new figure.
    """
    with output:
        clear_output(wait=True)

        # Create figure with landscape orientation
        fig, (ax_ranking, ax_cm) = plt.subplots(1, 2, figsize=(14, 6),
                                                 gridspec_kw={'width_ratios': [1.3, 1]})
        try:
            fig.patch.set_facecolor('white')

            # Left: Rankings
            create_ranking_chart(ax_ranking, METRICS[run_name], run_name)

            # Right: Confusion Matrix
            create_confusion_matrix(ax_cm, run_name, model_name)

            plt.tight_layout(pad=3)
            plt.show()
        finally:
            # Release the figure even when drawing fails, so repeated dropdown
            # changes cannot pile up open figures (save_dashboard_html closes
            # its figure the same way).
            plt.close(fig)

def on_run_change(change):
    """Re-render the dashboard after the evaluation-run dropdown changes.

    ``change`` is the notification passed by ``observe``; it is not used,
    the current selection is read from both dropdowns instead.
    """
    selected_run = run_dropdown.value
    selected_model = model_dropdown.value
    update_dashboard(selected_run, selected_model)

def on_model_change(change):
    """Re-render the dashboard after the model dropdown changes.

    ``change`` is the notification passed by ``observe``; it is not used,
    the current selection is read from both dropdowns instead.
    """
    current_selection = (run_dropdown.value, model_dropdown.value)
    update_dashboard(*current_selection)

# Link widgets to handlers: re-render whenever either dropdown's value changes
run_dropdown.observe(on_run_change, names='value')
model_dropdown.observe(on_model_change, names='value')

# Layout
controls = widgets.HBox(
    [run_dropdown, model_dropdown],
    layout=widgets.Layout(
        justify_content='center',
        padding='15px',
        margin='10px 0'
    )
)

# Title (the emoji was stored mis-encoded and rendered as garbage characters)
title = widgets.HTML(
    value="<h2 style='text-align: center; color: #2c3e50; font-family: Segoe UI, sans-serif; margin-bottom: 5px;'>" +
          "🔬 LLM Screening Performance Dashboard</h2>" +
          "<p style='text-align: center; color: #6c757d; font-size: 14px;'>" +
          "Compare model performance across evaluation runs</p>"
)

# Display dashboard
dashboard = widgets.VBox([title, controls, output])
display(dashboard)

# Initial render: read the selection from the dropdowns so the first figure
# always matches what the controls show, even if their defaults change.
update_dashboard(run_dropdown.value, model_dropdown.value)

VBox(children=(HTML(value="<h2 style='text-align: center; color: #2c3e50; font-family: Segoe UI, sans-serif; m…

In [13]:
# Export visualization to HTML file
import base64
from io import BytesIO

def save_dashboard_html(run_name='Run 2', model_name='Mistral Zero-Shot', output_path=None):
    """Save the dashboard as a standalone HTML file with the figure embedded.

    Args:
        run_name: Key into ``METRICS`` selecting the evaluation run.
        model_name: Model whose confusion matrix is shown on the right.
        output_path: Where to write the HTML file. Defaults to
            ``model_performance_dashboard.html`` in the folder that contains
            ``DATA_DIR``, so the file lands in the same place whichever of the
            two supported working directories the notebook is run from.

    Returns:
        ``pathlib.Path`` of the written file.
    """
    from html import escape

    # Create figure
    fig, (ax_ranking, ax_cm) = plt.subplots(1, 2, figsize=(16, 7),
                                             gridspec_kw={'width_ratios': [1.3, 1]})
    try:
        fig.patch.set_facecolor('white')

        # Create visualizations
        create_ranking_chart(ax_ranking, METRICS[run_name], run_name)
        create_confusion_matrix(ax_cm, run_name, model_name)

        plt.tight_layout(pad=3)

        # Render the figure to an in-memory PNG
        buffer = BytesIO()
        fig.savefig(buffer, format='png', dpi=150, bbox_inches='tight',
                    facecolor='white', edgecolor='none')
    finally:
        # Always release the figure, including when drawing or saving fails.
        plt.close(fig)

    img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

    # The names end up inside HTML text, so escape characters such as & and <.
    safe_run = escape(str(run_name))
    safe_model = escape(str(model_name))

    # Create HTML
    html_content = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>LLM Screening Performance Dashboard</title>
    <style>
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: #f8f9fa;
            margin: 0;
            padding: 20px;
            display: flex;
            flex-direction: column;
            align-items: center;
        }}
        .container {{
            max-width: 1400px;
            background: white;
            border-radius: 15px;
            box-shadow: 0 4px 20px rgba(0,0,0,0.1);
            padding: 30px;
        }}
        h1 {{
            color: #2c3e50;
            text-align: center;
            margin-bottom: 5px;
        }}
        .subtitle {{
            color: #6c757d;
            text-align: center;
            font-size: 14px;
            margin-bottom: 25px;
        }}
        .dashboard-img {{
            width: 100%;
            height: auto;
            border-radius: 10px;
        }}
        .footer {{
            text-align: center;
            margin-top: 20px;
            color: #888;
            font-size: 12px;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>🔬 LLM Screening Performance Dashboard</h1>
        <p class="subtitle">Evaluation Run: {safe_run} | Confusion Matrix: {safe_model}</p>
        <img class="dashboard-img" src="data:image/png;base64,{img_base64}" alt="Dashboard">
        <div class="footer">
            LSE-UKHSA Systematic Review Screening Project
        </div>
    </div>
</body>
</html>
'''

    # Save to the project root (the folder holding the Data directory) unless
    # the caller chose a location.
    if output_path is None:
        output_path = DATA_DIR.parent / "model_performance_dashboard.html"
    output_path = Path(output_path)
    output_path.write_text(html_content, encoding='utf-8')

    print(f"✅ Dashboard saved to: {output_path}")
    print(f"   File size: {output_path.stat().st_size / 1024:.1f} KB")
    return output_path

# Export the Run 2 / Mistral Zero-Shot view and show where it was written
dashboard_path = save_dashboard_html(run_name='Run 2', model_name='Mistral Zero-Shot')
dashboard_path

✅ Dashboard saved to: c:\Users\juanx\Documents\LSE-UKHSA Project\model_performance_dashboard.html
   File size: 184.9 KB


WindowsPath('c:/Users/juanx/Documents/LSE-UKHSA Project/model_performance_dashboard.html')