In [68]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json

## Note that 0 FPS means just first and last frames.

In [69]:
# potentially combine paths!
# The result files are written as three chunks and joined into one flat list;
# the loading cell reads every entry of `paths` with pd.read_csv.
paths = (
    # File names that carry no model tag.
    [
        "/tmp/eval_results_2024-12-26_19-26-57.csv",
        "/tmp/eval_results_2024-12-26_19-58-11.csv",
        "/tmp/eval_results_2024-12-26_20-27-29.csv",
        "/tmp/eval_results_2024-12-26_20-42-07.csv",
    ]
    # Model-tagged file names, timestamps 2024-12-26 23:38 to 2024-12-27 03:01.
    + [
        "/tmp/eval_results_gpt-4o-mini_2024-12-26_23-38-09.csv",
        "/tmp/eval_results_gpt-4o_2024-12-27_01-42-46.csv",
        "/tmp/eval_results_gemini-2.0-flash-exp_2024-12-27_02-14-20.csv",
        "/tmp/eval_results_gemini-1.5-pro_2024-12-27_03-01-14.csv",
    ]
    # Model-tagged file names, timestamps 2024-12-27 21:18 to 21:36.
    + [
        "/tmp/eval_results_gpt-4o_2024-12-27_21-18-54.csv",
        "/tmp/eval_results_gpt-4o-mini_2024-12-27_21-24-10.csv",
        "/tmp/eval_results_gemini-1.5-pro_2024-12-27_21-31-39.csv",
        "/tmp/eval_results_gemini-2.0-flash-exp_2024-12-27_21-36-22.csv",
    ]
)

In [70]:
# Read every evaluation CSV and stack them into one frame.
# ignore_index=True gives the combined frame a fresh, unique 0..N-1 index;
# without it each file contributes its own 0..n-1 labels, so label-based
# lookups (.loc, idxmax) on the combined frame would be ambiguous.
df = pd.concat((pd.read_csv(path) for path in paths), ignore_index=True)

# Ground truth: 1 when success_flag is exactly 'success', 0 for anything else.
df['label'] = (df['success_flag'] == 'success').astype(int)

# A row counts as correct when thresholding the aggregated score at 0.5
# reproduces the ground-truth label (score > 0.5 is read as "success").
df['accuracy_of_mean'] = df['label'] == (df['mean_performance'] > 0.5)
df['accuracy_of_median'] = df['label'] == (df['median_performance'] > 0.5)

# Keep the raw JSON text of the votes and a parsed copy side by side.
df['vote_str'] = df['performance']
df['votes_float'] = df['performance'].apply(json.loads)

In [None]:
# Headline numbers, printed in this order: accuracy of the mean rule,
# accuracy of the median rule, average number of parsed votes per row.
overall_mean_acc = df['accuracy_of_mean'].mean()
overall_median_acc = df['accuracy_of_median'].mean()
avg_vote_count = df['votes_float'].map(len).mean()
print(overall_mean_acc, overall_median_acc, avg_vote_count)

# Last expression of the cell: preview of the first rows.
df.head()

In [72]:
# Helper function for computing means and std errors
def mean_stderr(data):
    """Return the mean of ``data`` and the standard error of that mean.

    The standard error is the sample standard deviation (``ddof=1``) divided
    by ``sqrt(len(data))``. This matches the ``.std() / np.sqrt(len(x))``
    expressions used in the group-by cells of this notebook, where pandas'
    ``.std()`` also defaults to ``ddof=1``.

    Parameters
    ----------
    data : sequence or array-like of numbers
        Observations; must support ``len()``.

    Returns
    -------
    tuple
        ``(mean, stderr)``. Both are NaN for empty input. ``stderr`` is NaN
        for a single observation, because one sample carries no spread
        information.
    """
    n = len(data)
    if n == 0:
        # Nothing to average; avoid numpy's empty-slice warnings.
        return float('nan'), float('nan')

    mean = np.mean(data)
    if n < 2:
        # The sample standard deviation is undefined for one observation.
        return mean, float('nan')

    stderr = np.std(data, ddof=1) / np.sqrt(n)
    return mean, stderr

In [None]:
# performance of mean vs median for each model
# Group by VLM and compute accuracy for both the mean and the median rule.
# 'std' is kept in the table; 'sem' (standard error of the mean) is added and
# used for the error bars so this plot agrees with the other plots in the
# notebook, which all draw std / sqrt(n).
acc_columns = ['accuracy_of_mean', 'accuracy_of_median']
grouped_acc = (
    df[['vlm'] + acc_columns]
    # Cast the boolean correctness flags to float before aggregating.
    .astype({column: float for column in acc_columns})
    .groupby('vlm')
    .agg({column: ['mean', 'std', 'sem'] for column in acc_columns})
    .round(3)
)

# Create bar plot comparing mean vs median performance
plt.figure(figsize=(10, 6))
x = np.arange(len(grouped_acc.index))
width = 0.35

# Two bars per model, centred on the model's tick position.
plt.bar(x - width/2, grouped_acc['accuracy_of_mean']['mean'], width,
        label='Mean threshold',
        yerr=grouped_acc['accuracy_of_mean']['sem'],
        capsize=5)
plt.bar(x + width/2, grouped_acc['accuracy_of_median']['mean'], width,
        label='Median threshold',
        yerr=grouped_acc['accuracy_of_median']['sem'],
        capsize=5)

plt.ylabel('Accuracy')
plt.title('Performance Metric Comparison by Model')
plt.xticks(x, grouped_acc.index, rotation=45)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# now, what about as we change the number of votes? 1 vs 3 vs 5? get the votes from the 'performance' column. some are nan / not full length, just reuse

# Build one record per (row, n_votes) pair in a plain list and create the
# DataFrame once at the end; concatenating inside the loop copies the whole
# frame on every iteration.
vote_columns = ['vlm', 'task', 'n_votes', 'mean_performance', 'true_success']
vote_records = []

for vlm_name, task_name, flag, perf in zip(
        df['vlm'], df['task'], df['success_flag'], df['votes_float']):
    if not isinstance(perf, list):
        continue
    # For each number of votes (1, 3, 5)
    for n_votes in (1, 3, 5):
        # Only rows that have at least n_votes votes contribute to that point.
        if len(perf) < n_votes:
            continue
        # NOTE(review): a NaN among the first n_votes makes np.mean return NaN,
        # and NaN > 0.5 is False, so such rows are scored as a "failure"
        # prediction -- confirm that is the intended handling.
        vote_records.append({
            'vlm': vlm_name,
            'task': task_name,
            'n_votes': n_votes,
            'mean_performance': np.mean(perf[:n_votes]),
            'true_success': 1 if flag == 'success' else 0,
        })

# Passing `columns` keeps the expected schema even when no row qualified.
vote_analysis = pd.DataFrame(vote_records, columns=vote_columns)
vote_analysis['accuracy_of_mean'] = vote_analysis['true_success'] == (vote_analysis['mean_performance'] > 0.5)

# Calculate accuracy for each VLM and number of votes.
# 'sem' is std(ddof=1) / sqrt(n), i.e. the same standard error as before.
vote_results = (
    vote_analysis
    .assign(is_correct=vote_analysis['accuracy_of_mean'].astype(float))
    .groupby(['vlm', 'n_votes'])['is_correct']
    .agg(accuracy='mean', std_err='sem')
    .reset_index()
)

# Plot results: one line per model, accuracy against number of votes.
plt.figure(figsize=(12, 6))
for vlm in vote_results['vlm'].unique():
    vlm_data = vote_results[vote_results['vlm'] == vlm]
    plt.errorbar(vlm_data['n_votes'], vlm_data['accuracy'],
                 yerr=vlm_data['std_err'],
                 label=vlm, marker='o', capsize=5)

plt.xlabel('Number of Votes')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Number of Votes by Model')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:

# Calculate accuracy for each VLM and FPS
# Built-in aggregations replace groupby.apply with a per-group pd.Series:
# 'mean' is the accuracy and 'sem' is std(ddof=1) / sqrt(n), the same
# standard error the lambda computed.
fps_results = (
    df.assign(is_correct=df['accuracy_of_mean'].astype(float))
    .groupby(['vlm', 'fps'])['is_correct']
    .agg(accuracy='mean', std_err='sem')
    .reset_index()
)

In [None]:
# Accuracy per (model, fps) for each thresholding rule, as long-format frames
# with columns vlm, fps, accuracy. The built-in groupby mean replaces
# groupby.apply with a scalar-returning lambda.
fps_model_mean_accs = (
    df.groupby(['vlm', 'fps'])['accuracy_of_mean']
    .mean()
    .reset_index(name='accuracy')
)

fps_model_median_accs = (
    df.groupby(['vlm', 'fps'])['accuracy_of_median']
    .mean()
    .reset_index(name='accuracy')
)

In [None]:
# Accuracy against FPS, both thresholding rules on one set of axes.
plt.figure(figsize=(10, 6))

for model_name in fps_model_mean_accs['vlm'].unique():
    # Solid line, round markers: accuracy when thresholding the mean score.
    mean_curve = fps_model_mean_accs.loc[fps_model_mean_accs['vlm'] == model_name]
    plt.plot(mean_curve['fps'], mean_curve['accuracy'],
             linestyle='-', marker='o', label=f'{model_name} (mean)')

    # Dashed line, square markers: accuracy when thresholding the median score.
    median_curve = fps_model_median_accs.loc[fps_model_median_accs['vlm'] == model_name]
    plt.plot(median_curve['fps'], median_curve['accuracy'],
             linestyle='--', marker='s', label=f'{model_name} (median)')

# Axis text, legend and layout.
plt.xlabel('FPS')
plt.ylabel('Accuracy')
plt.title('Performance Accuracy vs FPS')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# per task performance
# Group by task and calculate mean accuracy plus its standard error.
# 'sem' is std(ddof=1) / sqrt(n); built-in aggregations replace groupby.apply
# with a per-group pd.Series.
task_results = (
    df.assign(is_correct=df['accuracy_of_mean'].astype(float))
    .groupby('task')['is_correct']
    .agg(accuracy='mean', std_err='sem')
    .reset_index()
)

# Create bar plot
plt.figure(figsize=(12, 6))
bars = plt.bar(task_results['task'], task_results['accuracy'])
# Error bars are drawn separately so they stay black on top of the bars.
plt.errorbar(task_results['task'], task_results['accuracy'],
             yerr=task_results['std_err'], fmt='none', color='black', capsize=5)

# add horizontal line at 0.5
plt.axhline(y=0.5, color='red', linestyle='--', label='Random Chance')
# Customize plot
plt.xlabel('Task')
plt.ylabel('Accuracy')
plt.title('Performance Accuracy by Task')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.legend()
plt.show()

print("\nTask-wise Performance:")
print(task_results.to_string(index=False))


In [None]:
# Group by task and VLM to calculate mean accuracy and its standard error.
# 'sem' is std(ddof=1) / sqrt(n); built-in aggregations replace groupby.apply
# with a per-group pd.Series.
task_model_results = (
    df.assign(is_correct=df['accuracy_of_mean'].astype(float))
    .groupby(['task', 'vlm'])['is_correct']
    .agg(accuracy='mean', std_err='sem')
    .reset_index()
)

plt.figure(figsize=(12, 6))

# Get unique tasks and VLMs
tasks = task_model_results['task'].unique()
vlms = task_model_results['vlm'].unique()
x = np.arange(len(tasks))
width = 0.8 / len(vlms)  # Width of bars with spacing

# Tick position of each task, so a model's rows can be placed directly.
task_position = {task: task_idx for task_idx, task in enumerate(tasks)}

# Plot bars for each VLM
for i, vlm in enumerate(vlms):
    # After the group-by there is at most one row per (task, vlm), so the
    # rows for this model are exactly the tasks it has results for.
    vlm_data = task_model_results[task_model_results['vlm'] == vlm]
    # Calculate offset for this VLM's bars so the group is centred on the tick
    offset = (i - len(vlms)/2 + 0.5) * width

    positions = [task_position[task] + offset for task in vlm_data['task']]
    accuracies = vlm_data['accuracy'].tolist()
    errors = vlm_data['std_err'].tolist()

    plt.bar(positions, accuracies, width, label=vlm)
    plt.errorbar(positions, accuracies,
                 yerr=errors, fmt='none', color='black', capsize=3)

# Customize plot
plt.xlabel('Task')
plt.ylabel('Accuracy')
plt.title('Performance Accuracy by Task and Model')
plt.xticks(x, tasks, rotation=45, ha='right')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# Explicit show, like every other plotting cell in this notebook.
plt.show()


In [None]:
# Group by task, VLM, and FPS to calculate mean accuracy and its standard
# error. 'sem' is std(ddof=1) / sqrt(n); built-in aggregations replace
# groupby.apply with a per-group pd.Series.
task_model_fps_results = (
    df.assign(is_correct=df['accuracy_of_mean'].astype(float))
    .groupby(['task', 'vlm', 'fps'])['is_correct']
    .agg(accuracy='mean', std_err='sem')
    .reset_index()
)

# Get unique values
tasks = task_model_fps_results['task'].unique()
vlms = task_model_fps_results['vlm'].unique()
fps_values = sorted(task_model_fps_results['fps'].unique())

# Tick position of each FPS value on the x axis.
fps_position = {fps: j for j, fps in enumerate(fps_values)}

# Create subplot for each task. squeeze=False always returns a 2-D array of
# axes, so the single-task case needs no special handling; ravel() flattens
# it to one axes object per task.
fig, axes = plt.subplots(len(tasks), 1, figsize=(12, 5*len(tasks)), squeeze=False)
axes = axes.ravel()

for task, ax in zip(tasks, axes):
    task_data = task_model_fps_results[task_model_fps_results['task'] == task]
    x = np.arange(len(fps_values))
    width = 0.8 / len(vlms)  # Width of bars with spacing

    # Plot bars for each VLM
    for i, vlm in enumerate(vlms):
        # Calculate offset for this VLM's bars so the group is centred on the tick
        offset = (i - len(vlms)/2 + 0.5) * width

        # At most one row per (vlm, fps) within a task after the group-by.
        vlm_data = task_data[task_data['vlm'] == vlm]
        positions = [fps_position[fps] + offset for fps in vlm_data['fps']]
        accuracies = vlm_data['accuracy'].tolist()
        errors = vlm_data['std_err'].tolist()

        ax.bar(positions, accuracies, width,
               label=vlm,
               alpha=0.7)

        ax.errorbar(positions, accuracies,
                    yerr=errors, fmt='none', color='black', capsize=3)

    # Add horizontal line at 0.5 for random chance. It must be drawn before
    # ax.legend() is called, otherwise its label never reaches the legend.
    ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='Random Chance')

    # Customize each subplot
    ax.set_ylabel('Accuracy')
    ax.set_title(f'Task: {task}')
    ax.set_xticks(x)
    ax.set_xticklabels([f'{fps} FPS' for fps in fps_values])
    ax.grid(True, alpha=0.3)
    ax.legend()

plt.tight_layout()
plt.show()

# Print best combinations
print("\nBest performing combinations for each task:")
for task in tasks:
    task_data = task_model_fps_results[task_model_fps_results['task'] == task]
    # idxmax returns the index label of the first row with the highest accuracy.
    best_row = task_data.loc[task_data['accuracy'].idxmax()]
    print(f"\n{task}:")
    print(f"  VLM: {best_row['vlm']}")
    print(f"  FPS: {best_row['fps']}")
    print(f"  Accuracy: {best_row['accuracy']:.3f} ± {best_row['std_err']:.3f}")