In [1]:
WORD_NUM = 15 # MNRead has 15 words at most per line
penalty = 0.01

# Load in Data

In [2]:
import json
from pathlib import Path
import pandas as pd
import ast
from collections import Counter
from tqdm import tqdm

In [3]:
def count_matches_per_row(row):
    left_side = pd.Series(row['text'].lower().split(' '))
    right_side = pd.Series([i.lower() for i in row['rec_texts']])
    
    counter_left = Counter(left_side.dropna()) # dic
    counter_right = Counter(right_side.dropna())
    
    # 计算两侧相同元素的个数，确保每个元素只计算一次
    matches = sum((counter_left & counter_right).values())
    total = sum(counter_left.values())
    
    # char_level
    counter_left_char = Counter(''.join(left_side.dropna().tolist()))
    counter_right_char = Counter(''.join(right_side.dropna().tolist()))
    matches_char = sum((counter_left_char & counter_right_char).values())
    total_char = sum(counter_left_char.values())
    return [matches, total - matches, total_char-matches_char]


In [4]:
filter_df = pd.read_csv('../../data/human/SelectedFilter.csv')

In [13]:
output_dir = Path("../../filtered/mnread")
gt_dir = Path("../../data/mnread")
files = list(output_dir.rglob('*.*'))
result_li = []
ba_li = []
df_human = pd.read_csv('../../data/human/human_mnread_acuity.csv')
df_human.rename(columns={'acuity': 'human'}, inplace=True)
# Load the ground truth COCO JSON (annotations only or full dataset)
with open(gt_dir/"anno.json", "r") as f:
    ground_truth = json.load(f)
# Here we assume ground_truth has a key "annotations"
gt_annotations = ground_truth.get("annotations", [])
img_info = ground_truth.get("images", [])

for model_file in tqdm(files, total=len(files), desc="Rinse model outputs"):
    # Load the model output JSON (replace with your file or object)
    # model_file = output_dir/ "gpt4o.json"
    model_name = model_file.stem
    with open(model_file, "r") as f:
        model_output = json.load(f)

    # Create lookup dictionaries for ground truth annotations and image info by image_id.
    # If there are multiple ground truth annotations per image, you might need to store a list.
    gt_by_chart = {}
    for info, ann in zip(img_info, gt_annotations):
        id = ann["id"]
        full = {}
        full["image_id"] = ann["image_id"]
        full["file_name"] = info["file_name"]
        full["Filter_no"] = info["Filter_no"]
        full["text"] = ann["caption"]
        gt_by_chart.setdefault(id, full)
        
    
    # Now iterate through the model outputs and match with ground truth and image info
    full_out = {}
    for output_item in model_output:
        image_id = output_item["image_id"]
        
        # Optionally, if your model output's "rec_texts" is a string representation of a list,
        # convert it to an actual list.
        rec_texts_str = output_item.get("rec_texts", "")
        try:
            rec_texts = ast.literal_eval(rec_texts_str)
        except Exception:
            rec_texts = rec_texts_str
        output_item["rec_texts"] = rec_texts

        # Retrieve corresponding ground truth annotations (if any)
        gt_matches = gt_by_chart.get(image_id, {})
        chart_no, reso, row_no = Path(gt_matches.get("file_name", " _ _ _")).stem.split("_")
        full_out[image_id] = gt_matches
        full_out[image_id]["rec_texts"] = rec_texts
        full_out[image_id]["chart_no"] = chart_no
        full_out[image_id]["reso"] = reso
        full_out[image_id]["row_no"] = row_no
        
        
    df = pd.DataFrame(full_out.values())
    df = pd.merge(df, filter_df, on='Filter_no', how='inner')
    df = df.loc[df['row_no'].astype(int) <= 14, :]

    df[['match','missing','missing_char']] = df.apply(lambda row: count_matches_per_row(row), axis=1).apply(pd.Series)
    df['missing_clipped'] = df['missing'].clip(upper=10)

    group_sum = df.copy().groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*penalty).reset_index()
    group_sum.columns = ['chart_no', 'a', 'b', 'VA', 'CS', 'Cond','acuity']
    group_sum = group_sum.groupby(['a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: x['acuity'].mean()).reset_index()
    group_sum.columns = ['a', 'b', 'VA', 'CS', 'Cond',model_name]
    result_li.append(group_sum)

    # BA-test
    df_ba = df.copy().groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*penalty).reset_index()
    df_ba.columns = ['chart_no', 'a', 'b', 'VA', 'CS', 'Cond','acuity']
    df_ba = pd.merge(df_ba,df_human, on=['a', 'b'],how='left')
    results = []

    for cond, group in df_ba.groupby(['a', 'b', 'VA', 'CS', 'Cond']):
        diff = (group['acuity'] - group['human'])
        avg = (group['acuity'] + group['human'])/2
        bias = diff.mean()
        loa_low = bias - 1.96 * diff.std()
        loa_high = bias + 1.96 * diff.std()
            
        results.append({
                'Expected': cond[2], # VA
                'Cond': cond[4],
                'Model': model_name,
                'Bias': bias,
                'Diff': diff,
                'Avg': avg,
                'LoA_Low': loa_low,
                'LoA_High': loa_high
            })
    ba_li.append(pd.DataFrame(results))


  group_sum = df.copy().groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*penalty).reset_index()
  group_sum = group_sum.groupby(['a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: x['acuity'].mean()).reset_index()
  df_ba = df.copy().groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*penalty).reset_index()
  group_sum = df.copy().groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*penalty).reset_index()
  group_sum = group_sum.groupby(['a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: x['acuity'].mean()).reset_index()
  df_ba = df.copy().groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*penalty).reset_index()
  group_sum = df.copy().groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*penalty).reset_index()
  group_sum = group_sum.groupby(['a', 'b

In [6]:
from functools import reduce
summary_dir = Path("../../data/summary")
# reduce(lambda left, right: pd.merge(left, right, on=['a', 'b', 'VA', 'CS', 'Cond']), result_li).to_csv(summary_dir/'mnread.csv', index=False)

In [7]:
import pandas as pd
from functools import reduce
df1 = reduce(lambda left, right: pd.merge(left, right, on=['a', 'b', 'VA', 'CS', 'Cond']), result_li)
df2 = pd.read_csv(summary_dir/'MNREADSummary_0513.csv')
df_human = pd.read_csv('../../data/human/human_mnread_acuity.csv')
df_human = df_human.rename(columns={'acuity': 'human'})

In [8]:
final_df = pd.merge(pd.merge(df1, df_human, on=['a', 'b'],how='left'), df2, on=['a', 'b', 'VA','CS', 'Cond'], how='left')
models = ['human','gpt4o', 'gpt4o_mini', 'gemini_15_flash',
       'gemini_15_pro', 'gemini_2_flash', 'claude3_7_sonnet', 'claude3_5_haiku','cogvlm', 'Qwen2.5-VL-3B-Instruct',
       'Qwen2.5-VL-7B-Instruct', 'Qwen2.5-VL-32B-Instruct', 'google' ,'spts', 'maerec',
       'gemini-2.5-flash', 'gemini-2.5-pro', 'gpt-5-mini', 'gpt-5', 'DeepSeek-OCR_Small',
       'DeepSeek-OCR_Base', 'DeepSeek-OCR_Tiny', 'DeepSeek-OCR_Large', 'DeepSeek-OCR_Gundam']#'qwen', 'maerec','spts', 'ppocr', 'azure', 'google']
final_df['Expected'] = final_df['VA']
final_df = final_df[['a', 'b', 'VA', 'CS', 'Cond', 'Expected'] + models]
# final_df.to_csv(summary_dir/'mnread_combined_1021.csv', index=False)

# Figure

In [10]:
import pandas as pd
final_df_fig = pd.read_csv('../../data/summary/mnread_combined_1021.csv')#final_df.copy()
models = ['human','SeeingAI', 'gpt4o', 'gpt4o_mini', 'gemini_15_flash',
       'gemini_15_pro', 'gemini_2_flash', 'claude3_7_sonnet', 'claude3_5_haiku',
       'cogvlm','Qwen2.5-VL-3B-Instruct', 'Qwen2.5-VL-7B-Instruct', 'Qwen2.5-VL-32B-Instruct',
        'maerec','spts', 'google',  'gemini-2.5-flash','gemini-2.5-pro', 'gpt-5-mini', 'gpt-5', 'DeepSeek-OCR_Small',
       'DeepSeek-OCR_Base', 'DeepSeek-OCR_Tiny', 'DeepSeek-OCR_Large',
       'DeepSeek-OCR_Gundam'] #'ppocr', 'azure',

final_df_fig[['Expected'] + models] =  final_df_fig[['Expected'] + models]  - final_df_fig.loc[15,['Expected'] + models]
final_df_fig.loc[final_df_fig['Cond']=='Original', 'Cond'] = 'Combined'

model_name_dict = {'human':'Human','SeeingAI':'SeeingAI', 'gpt4o':'GPT4O', 'gpt4o_mini':'GPT4O Mini', 
                   'gemini_15_flash':'Gemini-1.5 Flash','gemini_15_pro':'Gemini-1.5 Pro', 'gemini_2_flash':'Gemini-2 Flash', 
                   'claude3_7_sonnet':'Claude3.7 Sonnet', 'claude3_5_haiku':'Claude3.5 Haiku','cogvlm':'CogAgent',
                   'Qwen2.5-VL-3B-Instruct':'Qwen2.5-VL-3B', 'Qwen2.5-VL-7B-Instruct':'Qwen2.5-VL-7B', 'Qwen2.5-VL-32B-Instruct':'Qwen2.5-VL-32B',
                   'maerec':'DBNet++ & MaeRec','spts':'SPTS v2', 'ppocr':'PPOCR', 'azure':'Azure', 'google':'Google Vision',
                   'gemini-2.5-flash':'Gemini-2.5 Flash','gemini-2.5-pro':'Gemini-2.5 Pro', 'gpt-5-mini':'GPT-5 Mini', 'gpt-5':'GPT-5', 'DeepSeek-OCR_Small':'DeepSeek-OCR Small',
                   'DeepSeek-OCR_Base':'DeepSeek-OCR Base', 'DeepSeek-OCR_Tiny':'DeepSeek-OCR Tiny', 'DeepSeek-OCR_Large':'DeepSeek-OCR Large',
                   'DeepSeek-OCR_Gundam':'DeepSeek-OCR Gundam'}
final_df_fig.rename(columns=model_name_dict, inplace=True)
models = [model_name_dict[i] for i in models if i in model_name_dict]

# Model vs Expected

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import math
import numpy as np

# Define a function to choose marker based on condition
def get_marker(cond):
    if cond == 'Horizontal':
        return 'o'  # hollow square
    elif cond == 'Vertical':
        return '^'  # hollow triangle
    else:
        return 's'  # hollow circle

# Create a color palette for unique conditions
unique_conditions = final_df_fig['Cond'].unique()
# palette = sns.color_palette("viridis", len(unique_conditions))
# color_mapping = {cond: palette[i] for i, cond in enumerate(unique_conditions)}
colors = ['orange', 'black', '#6A9C89', 'black']
# Create the mapping
color_mapping = {cond: color for cond, color in zip(unique_conditions, colors)}

# Define a function to plot a scatter plot on a given axis for a specified model column
def plot_scatter(ax, model_col, df):
    # Add a dashed diagonal line that spans the range of the data
    x_vals = df['Expected']
    y_vals = df[model_col]
    overall_min = min(x_vals.min(), y_vals.min())
    overall_max = 2.0 #max(x_vals.max(), y_vals.max())
    ax.plot([overall_min, overall_max], [overall_min, overall_max],
            linestyle='--', color='grey', linewidth=2.5)
    x = np.linspace(overall_min, overall_max, 100)
    # Fill the area between y = x - 0.2 and y = x + 0.2
    ax.fill_between(x, x - 0.2, x + 0.2, color='grey', alpha=0.3)
    
    # Plot each group (by Cond) separately to assign markers and colors
    for cond, group in df.groupby('Cond'):
        marker = get_marker(cond)
        ax.scatter(
            group['Expected'],
            group[model_col],
            label=cond,
            marker=marker,
            facecolors=color_mapping[cond],  # makes marker hollow
            edgecolors=color_mapping[cond],
            s=100  # marker size
        )
    
    # calulate the correlation
    from scipy.stats import linregress
    slope, intercept, r_value, p_value, std_err = linregress(x_vals.astype(float).to_numpy(), y_vals.astype(float).to_numpy())
    r_squared = r_value ** 2
    # Annotate R² and p-value in bottom-right
    ax.text(0.96, 0.02, f'$R^2$ = {r_squared:.2f}, '+ (f'p = {p_value:.3f}' if p_value > 0.001 else 'p < 0.001'),
            ha='right', va='bottom', transform=ax.transAxes,
            fontsize=11)

    
    # Set labels, title and legend
    ax.set_xlabel("Expected Acuity Change (logMAR)")
    ax.set_ylabel("Measured Acuity Change (logMAR)")
    ax.set_xlim(-0.1, 2.0)
    ax.set_ylim(-0.1, 2.0)
    ax.set_title(f"{model_col}")
    # ax.legend()

# List of models to plot (include 'human' and other models)
# models = ['human','SeeingAI' ,'gpt4o', 'gpt4o_mini', 'gemini_15_flash',
#        'gemini_15_pro', 'gemini_2_flash', 'claude3_7_sonnet', 'claude3_5_haiku',
#        'qwen', 'maerec','spts', 'ppocr', 'azure', 'google']

# Determine subplot grid layout; here we choose 2 columns
num_models = len(models)
ncols = 5
nrows = math.ceil(num_models / ncols)
nrows = math.ceil(num_models / ncols)

fig, axes = plt.subplots(nrows, ncols, figsize=(3 * ncols, 3 * nrows))
# If there's only one subplot, axes is not an array; make it a list for consistency.
if num_models == 1:
    axes = [axes]
else:
    axes = axes.flatten()

# Plot each model in its corresponding subplot
for i, model in enumerate(models):
    plot_scatter(axes[i], model, final_df_fig)

# Remove any extra subplots if the grid has more axes than models
for j in range(i+1, len(axes)):
    fig.delaxes(axes[j])
    
handles, labels = axes.flatten()[0].get_legend_handles_labels()

# Place a common legend outside the figure. Here, loc='upper center' puts it at the top,
# and bbox_to_anchor adjusts its position. Adjust ncol for number of columns in legend.
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=3, fontsize=14)
plt.tight_layout()

plt.savefig('../../figure/mnread.pdf', dpi=300)

In [24]:
final_df_fig.loc[final_df_fig['Expected'] < 1.21, :]

Unnamed: 0,a,b,VA,CS,Cond,Expected,Human,GPT4O,GPT4O Mini,Gemini-1.5 Flash,...,Claude3.7 Sonnet,Claude3.5 Haiku,CogAgent,Qwen2.5-VL-3B,Qwen2.5-VL-7B,Qwen2.5-VL-32B,Google Vision,SPTS v2,DBNet++ & MaeRec,SeeingAI
6,0.125,1.0,0.66,1.96,Horizontal,0.9,0.57455,0.8525,0.805,0.9725,...,1.0225,1.0225,0.794167,0.975,0.966667,0.943333,1.100833,1.2125,0.8775,0.758333
7,0.157,0.157,0.66,1.33,Combined,0.9,0.746138,0.8825,0.865,1.0425,...,1.0925,1.1725,0.6975,1.055,1.01,1.0,1.2375,1.3325,1.2975,0.795
8,0.25,1.0,0.36,2.05,Horizontal,0.6,0.292407,0.48,0.4925,0.6425,...,0.655,0.6375,0.48,0.575,0.6025,0.5,0.665,0.9125,0.57,0.425
9,0.288,0.288,0.36,1.59,Combined,0.6,0.402249,0.4825,0.475,0.6625,...,0.8025,0.7425,0.3275,0.615,0.58,0.56,0.7575,0.9625,0.5875,0.495
10,1.0,0.011,0.13,0.2,Vertical,0.37,0.684193,0.115833,0.078333,0.279167,...,1.289167,1.269167,0.2675,0.748333,0.436667,0.44,1.310833,0.689167,1.150833,0.448333
11,1.0,0.022,0.03,0.47,Vertical,0.27,0.362765,0.0225,0.005,0.115833,...,0.805833,1.1425,0.184167,0.468333,0.196667,0.233333,0.160833,0.3825,0.4375,0.315
12,1.0,0.045,-0.04,0.76,Vertical,0.2,0.199193,0.009167,-0.018333,0.0025,...,0.6525,0.705833,-0.125833,0.285,0.08,0.083333,0.0075,0.359167,0.2075,0.211667
13,1.0,0.089,-0.1,1.06,Vertical,0.14,0.09455,-0.000833,-0.028333,0.005833,...,0.2725,0.3125,0.020833,0.128333,0.033333,0.036667,-0.0125,0.1925,0.040833,0.068333
14,1.0,0.355,-0.19,1.65,Vertical,0.05,0.010265,-0.004167,-0.031667,-0.000833,...,0.029167,0.0425,-0.0325,0.008333,-0.003333,-0.013333,-0.0125,0.035833,-0.069167,-0.011667
15,1.0,1.0,-0.24,2.13,Combined,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# RMSE

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

def rmse(arr):
    return np.sqrt(np.mean(arr ** 2))

# --- metrics ---
rmse_vs_exp = final_df_fig[models].sub(final_df_fig['Expected'], axis=0).apply(rmse, axis=0)
models_wo_human = [m for m in models if str(m).casefold() != 'human']
rmse_to_human = final_df_fig[models_wo_human].sub(final_df_fig['Human'], axis=0).apply(rmse, axis=0)

# --- order & align (ascending by left) ---
order = rmse_vs_exp.sort_values(ascending=True).index.tolist()
left_vals  = rmse_vs_exp.reindex(order).values
right_vals = rmse_to_human.reindex(order).values

y = np.arange(len(order))
bar_h = 0.58

# --- colors & highlights ---
min_left_label  = order[np.nanargmin(left_vals)]
min_right_label = rmse_to_human.idxmin() if len(rmse_to_human) else None
left_colors  = ['#D76C82' if lab == min_left_label else '#9AA0A6' for lab in order]
right_colors = []
for lab, v in zip(order, right_vals):
    if np.isnan(v) or str(lab).casefold() == 'human':
        right_colors.append('none')
    elif lab == min_right_label:
        right_colors.append('#2E7D32')
    else:
        right_colors.append('#C2A68C')

# --- axis range & central gap ---
lim_left  = float(np.nanmax(left_vals))
lim_right = float(np.nanmax(right_vals[~np.isnan(right_vals)]) if np.any(~np.isnan(right_vals)) else 0.0)
lim = max(lim_left, lim_right, 1.0)
step = 0.25
max_tick = np.ceil(lim / step) * step

gap = max_tick * 0.22   # 中缝宽度的一半（可调：0.12~0.25）
left_anchor  = -gap      # 左柱起点在 -gap 处向左延伸
right_anchor =  gap      # 右柱起点在 +gap 处向右延伸

fig_h = max(6, 0.45 * len(order) + 2)
fig, ax = plt.subplots(figsize=(12, fig_h))

# 背景/网格
ax.set_facecolor('#FCFCFD')
# ax.grid(axis='x', linestyle='--', linewidth=0.6, alpha=0.5)
for s in ['top', 'right', 'left']: ax.spines[s].set_visible(False)

# --- bars (外移) ---
# 左：从 -gap 向左画，width 用正值，left=-(gap+value)
left_bars = ax.barh(
    y, left_vals, height=bar_h, color=left_colors, edgecolor='none',
    left=-(gap + left_vals), zorder=3
)
# Human 左侧空心
for patch, lab in zip(left_bars, order):
    if str(lab).casefold() == 'human':
        patch.set_facecolor('none'); patch.set_edgecolor('#D76C82')
        patch.set_linestyle('--'); patch.set_linewidth(1.8)

# 右：从 +gap 向右画
right_bars = ax.barh(
    y, np.nan_to_num(right_vals, nan=0.0), height=bar_h,
    color=right_colors, edgecolor='none', left=right_anchor, zorder=3
)
# 隐藏 Human/NaN 右半
for patch, lab, v in zip(right_bars, order, right_vals):
    if np.isnan(v) or str(lab).casefold() == 'human':
        patch.set_visible(False)

# --- 中缝与标题 ---
ax.axvspan(-gap, gap, color='white', alpha=0.95, zorder=1)  # 中间留白带
ax.text(-gap, 1.012, 'vs Expected',            transform=ax.get_xaxis_transform(), ha='right', va='bottom',
        fontsize=12, fontweight='bold', color='#3A3F44')
ax.text( gap, 1.012, 'vs Human',   transform=ax.get_xaxis_transform(), ha='left',  va='bottom',
        fontsize=12, fontweight='bold', color='#3A3F44')

# --- model names 放在中缝里（不与柱子重合） ---
for yi, lab in zip(y, order):
    ax.text(0, yi, str(lab), ha='center', va='center', fontsize=11.2, color='#28323C', zorder=4)

# --- x 轴：对称范围 + 绝对值刻度 ---
xmax = max_tick + gap + max_tick*0.05
ax.set_xlim(-xmax, xmax)
ax.xaxis.set_major_locator(MultipleLocator(step))
ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.set_xticklabels([t.get_text().lstrip('-') for t in ax.get_xticklabels()], fontsize=13)
ax.set_xlabel('Root Mean Square Error', fontsize=15.5, color='#3A3F44')

# 移除 ytick 文本（我们用中缝标签）
ax.set_yticks(y); ax.set_yticklabels([]); ax.tick_params(axis='y', length=0)

# --- 值标签（条末端） ---
pad = max_tick * 0.015
for v, b in zip(left_vals, left_bars):
    if v <= 0: continue
    x = -gap - v - pad
    ax.text(x, b.get_y()+b.get_height()/2, f'{v:.2f}'.rstrip('0').rstrip('.'),
            ha='right', va='center', fontsize=12, color='#4A4F55', zorder=4)
for v, b in zip(right_vals, right_bars):
    if np.isnan(v) or v <= 0: continue
    x =  gap + v + pad
    ax.text(x, b.get_y()+b.get_height()/2, f'{v:.2f}'.rstrip('0').rstrip('.'),
            ha='left', va='center', fontsize=12, color='#4A4F55', zorder=4)

# --- 图例 ---
legend_handles = [
    Patch(facecolor='#D76C82', edgecolor='none', label='Min RMSE (vs Expected)'),
    Patch(facecolor='#2E7D32', edgecolor='none', label='Min RMSE (vs Human)'),
    # Patch(facecolor='#9AA0A6', edgecolor='none', label='Others (left)'),
    # Patch(facecolor='#C2A68C', edgecolor='none', label='Others (right)'),
    Patch(facecolor='none', edgecolor='#D76C82', linestyle='--', label='Human'),
]
ax.legend(handles=legend_handles, frameon=False, fontsize=12.5, ncol=3, loc='upper center')

plt.tight_layout()
plt.savefig('../../figure/mnread_rmse_stacked.pdf', dpi=300)
# plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import math
import numpy as np

def rmse(arr):
    return np.sqrt(np.mean(arr ** 2))
std_values = final_df_fig.loc[final_df_fig['Expected'] < 1.21, models].sub(final_df_fig['Expected'], axis=0).apply(rmse, axis=0)
# Identify the model with the smallest std
min_model = std_values.idxmin()

# Create a color list: red for the model with the smallest std; use a nice color (e.g. 'skyblue') for others.
colors = ['#D76C82' if model == min_model else 'gray' for model in std_values.index]

# Optional: sort the values for a prettier display (and sort colors accordingly)
std_values_sorted = std_values.sort_values(ascending=False)
colors_sorted = [colors[std_values.index.get_loc(model)] for model in std_values_sorted.index]
# Create the plot
fig, ax = plt.subplots(figsize=(10, 6))
std_values_sorted.plot(kind='bar', color=colors_sorted, edgecolor='none', width=0.9, ax=ax)

for patch, tick in zip(ax.patches, ax.get_xticklabels()):
    if tick.get_text().lower() == 'human':  # adjust case if needed
        # Set no fill (hollow)
        patch.set_facecolor('none')
        # Set dashed edge
        patch.set_linestyle('--')
        patch.set_linewidth(2)
        # Optionally, set a specific edge color. For instance, if you want red:
        patch.set_edgecolor('#D76C82')
# Customize axis labels, title, and tick parameters
# plt.xlabel("Model", fontsize=14, fontweight='bold')
plt.ylabel("Root Mean Square Error", fontsize=14)
plt.yticks(np.arange(0,1.01,0.25),fontsize=12)
plt.xticks(fontsize=12)
plt.tight_layout()
plt.savefig('../../figure/mnread_rmse.pdf', dpi=300)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import math
import numpy as np

def rmse(arr):
    return np.sqrt(np.mean(arr ** 2))
std_values = final_df_fig.loc[final_df_fig['Expected'] < 1.21, models[1:]].sub(final_df_fig['Human'], axis=0).apply(rmse, axis=0)
# Identify the model with the smallest std
min_model = std_values.idxmin()

# Create a color list: red for the model with the smallest std; use a nice color (e.g. 'skyblue') for others.
colors = ['green' if model == min_model else 'gray' for model in std_values.index]

# Optional: sort the values for a prettier display (and sort colors accordingly)
std_values_sorted = std_values.sort_values(ascending=False)
colors_sorted = [colors[std_values.index.get_loc(model)] for model in std_values_sorted.index]
# Create the plot
fig, ax = plt.subplots(figsize=(10, 6))
std_values_sorted.plot(kind='bar', color=colors_sorted, edgecolor='none', width=0.9, ax=ax)


# Customize axis labels, title, and tick parameters
# plt.xlabel("Model", fontsize=14, fontweight='bold')
plt.ylabel("Root Mean Square Error", fontsize=14)
plt.yticks(np.arange(0,1.01,0.25),fontsize=12)
plt.xticks(fontsize=12)
plt.tight_layout()
plt.savefig('../../figure/mnread_rmse_tohuman.pdf', dpi=300)


# BA

In [31]:
# def make_long_ba_df(ba_df_wide):
#     records = []
#     for _, row in ba_df_wide.iterrows():
#         diff = np.asarray(row["Diff"]).ravel()
#         avg  = np.asarray(row["Avg"]).ravel()
#         L = min(len(diff), len(avg))
#         if L == 0:
#             continue
#         # 逐样本展开
#         rec = pd.DataFrame({
#             "Model":    row["Model"],
#             "Cond":     row["Cond"],
#             "Expected_logMAR": row["Expected_logMAR"],
#             "Expected": avg[:L].astype(float),
#             "Bias":     diff[:L].astype(float),
#         })
#         records.append(rec)

#     long_df = pd.concat(records, ignore_index=True).dropna(subset=["Expected","Bias"])

#     # 每 (Model, Cond) 计算 BA 统计并回填
#     stats = (
#         long_df.groupby(["Model","Cond","Expected_logMAR"])["Bias"]
#         .agg(n="size", mean="mean", std=lambda s: np.nanstd(s, ddof=1))
#         .reset_index()
#         .rename(columns={"mean":"mean_bias","std":"std_bias"})
#     )
#     stats["LoA_Low"]  = stats["mean_bias"] - 1.96 * stats["std_bias"]
#     stats["LoA_High"] = stats["mean_bias"] + 1.96 * stats["std_bias"]
#     stats["LoA_Width"] = stats["LoA_High"] - stats["LoA_Low"]
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import math
import numpy as np
def make_long_ba_df(ba_df_wide):
    records = []
    for i, row in ba_df_wide.iterrows():
        diff = np.asarray(row["Diff"]).ravel()
        avg  = np.asarray(row["Avg"]).ravel()
        L = min(len(diff), len(avg))
        if L == 0:
            continue
        mean_avg = avg.mean().astype(float)
        mean_diff = diff.mean().astype(float)
        # 逐样本展开
        rec = pd.DataFrame(data={
            "Model":    row["Model"],
            "Cond":     row["Cond"],
            "Expected_logMAR": row["Expected_logMAR"],
            "Expected": mean_avg,
            "Bias":     mean_diff,
            "LoA_Low":  mean_diff - 1.96 * np.std(diff, ddof=1),
            "LoA_High": mean_diff + 1.96 * np.std(diff, ddof=1),
            "LoA_Width": 2 * 1.96 * np.std(diff, ddof=1),
            "n": L
        },index=[i])
        records.append(rec)

    long_df = pd.concat(records, ignore_index=True).dropna(subset=["Expected","Bias"])

    # 每 (Model, Cond) 计算 BA 统计并回填
    # stats = (
    #     long_df.groupby(["Model","Cond","Expected_logMAR"])["Bias"]
    #     .agg(n="size", mean="mean", std=lambda s: np.nanstd(s, ddof=1))
    #     .reset_index()
    #     .rename(columns={"mean":"mean_bias","std":"std_bias"})
    # )
    # stats["LoA_Low"]  = stats["mean_bias"] - 1.96 * stats["std_bias"]
    # stats["LoA_High"] = stats["mean_bias"] + 1.96 * stats["std_bias"]
    # stats["LoA_Width"] = stats["LoA_High"] - stats["LoA_Low"]
    # out = long_df.merge(stats[["Model","Cond","Expected_logMAR","LoA_Low","LoA_High","LoA_Width"]], on=["Model","Cond","Expected_logMAR"], how="left")
    return long_df
ba_df = pd.concat(ba_li, axis=0)
# ba_df = ba_df.loc[ba_df['Expected'] < 1.21, :]
ba_df['Expected'] = ba_df['Expected'] - (-0.24)
ba_df['Expected_logMAR'] = ba_df['Expected'] - 0.24
ba_df.drop(columns=['Expected'], inplace=True)

ba_df["LoA_Width"] = ba_df["LoA_High"] - ba_df["LoA_Low"]
ba_df['Model'] = ba_df['Model'].map(model_name_dict).fillna(ba_df['Model'])
ba_df = pd.merge(ba_df.groupby(['Model','Cond','Expected_logMAR'])['Diff'].apply(lambda x: np.concatenate(x.dropna().values,axis=0)).reset_index(), 
ba_df.groupby(['Model','Cond','Expected_logMAR'])['Avg'].apply(lambda x: np.concatenate(x.dropna().values,axis=0)).reset_index(), on=['Model','Cond','Expected_logMAR'])
# ========= 展开并计算 LoA =========
ba_df = make_long_ba_df(ba_df)   # 注意：这里覆盖变量名，后续绘图直接用 ba_df


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FormatStrFormatter

# =====================================================
# 0) 将你的 ba_df(含 list) 展开为逐样本，同时计算 LoA
#     输入列:  Model, Cond, Diff(list), Avg(list)
#     输出列:  Model, Cond, Expected, Bias, LoA_Low, LoA_High, LoA_Width
# =====================================================




# =====================================================
# 1) 你的 marker / 颜色 映射（原样保留）
# =====================================================
def get_marker(cond):
    if cond == 'Horizontal':
        return 'o'  # hollow square (注释来自你原代码，这里沿用; 实际是圆点标记)
    elif cond == 'Vertical':
        return '^'  # hollow triangle
    elif cond == 'Original':
        return 'X'  # 
    else:
        return 's'  # hollow circle

# models = ba_df['Model'].unique()

unique_conditions = ba_df['Cond'].unique()
colors = ['black', 'orange', 'red', '#6A9C89']  # 你给定的颜色序列
# 若条件数超过颜色数，做一个稳妥的扩展
if len(unique_conditions) > len(colors):
    extra = sns.color_palette("tab10", len(unique_conditions) - len(colors))
    colors = list(colors) + list(extra)
color_mapping = {cond: color for cond, color in zip(unique_conditions, colors)}

# =====================================================
# 2) 数据准备（与你的代码一致）
# =====================================================
try:
    ba_df["_ExpectedNum_"] = pd.to_numeric(ba_df["Expected"], errors="raise")
    x_col = "_ExpectedNum_"
    x_tick_labels_from = "Expected"
except Exception:
    x_col = "Expected"
    x_tick_labels_from = "Expected"

cond_list = list(ba_df["Cond"].dropna().unique())
palette = color_mapping

sns.set_theme(style="whitegrid", context="talk")
plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False

# =====================================================
# 3) FacetGrid（与你的结构一致；仅把 marker 接入）
# =====================================================
g = sns.FacetGrid(
    ba_df.sort_values(["Model", x_col]),
    col="Model",
    col_wrap=5,
    height=4.0,
    sharey=True,
    margin_titles=True,
)

def plot_ba_by_cond(data, color=None, **kws):
    ax = plt.gca()
    data = data.sort_values([x_col, "Cond"])
    x_min = min(np.nanmin(data[x_col].to_numpy()), -0.24)
    x_max = 1.21
    mu = float(np.nanmean(data.loc[data['Expected_logMAR']<1.21, "Bias"]))
    sd = float(np.nanstd(data.loc[data['Expected_logMAR']<1.21, "Bias"], ddof=1)) if data.loc[data['Expected_logMAR']<1.21, "Bias"].size > 1 else 0.0
    lo, hi = mu - 1.96*sd, mu + 1.96*sd
    ax.hlines([mu], x_min, x_max, colors='#C96868', linestyles="-",  linewidth=1.6)
    ax.hlines([lo, hi], x_min, x_max, colors='#C96868', linestyles=":", linewidth=1.2)
    for cond in cond_list:
        d = data[data["Cond"] == cond]
        if d.empty:
            continue
        c = palette[cond]
        m = get_marker(cond)  # <<<<<< 接入你的 marker 规则
        keep_mask = d["Expected_logMAR"] < 1.21
        # # LoA 阴影带（本实现中 LoA 对应 (Model,Cond) 常数，填充为平行带）
        # ax.fill_between(
        #     d[x_col],
        #     d["LoA_Low"],
        #     d["LoA_High"],
        #     alpha=0.14, color=c, linewidth=0
        # )
        # mu = float(np.nanmean(d["Bias"]))
        # sd = float(np.nanstd(d["Bias"], ddof=1)) if d["Bias"].size > 1 else 0.0
        # lo, hi = mu - 1.96*sd, mu + 1.96*sd
        # if cond != 'Original':
        #     ax.hlines([mu], x_min, x_max, colors=c, linestyles="-",  linewidth=1.6)
        #     ax.hlines([lo, hi], x_min, x_max, colors=c, linestyles=":", linewidth=1.2)
        # Bias 折线 + 散点（marker 按照你的规则）
        ax.scatter(d.loc[keep_mask, x_col], d.loc[keep_mask, "Bias"], marker=m, color=c, label=cond, zorder=3, s=38)
        ax.scatter(d.loc[~keep_mask, x_col], d.loc[~keep_mask, "Bias"], marker=m, facecolors="none", edgecolors=c, linewidths=1.6,
                   label=None, zorder=4, s=70)  # 超出 LOA 的点画成空心加粗边
        # ax.scatter(d[x_col], d["Bias"], s=15, color=c, edgecolor="white", linewidth=1., zorder=4, marker=m)



    # y=0 基线
    ax.hlines([0], x_min, x_max, linestyle="--", linewidth=1, color="0.35", zorder=1)

    # ——统计面板（与你的注释逻辑一致）——
    stats = (data.loc[data['Expected_logMAR']<1.21]
             .groupby("Cond")
             .agg(n=("Bias","size"),
                  mean_bias=("Bias","mean"),
                  std_bias=("Bias","std"),
                  mean_width=("LoA_Width","mean"))
             .reset_index())
    stats["se"] = stats["std_bias"] / np.sqrt(stats["n"]).replace(0, np.nan)
    stats["ci95"] = 1.96 * stats["se"]
    # stats = stats.sort_values("mean_bias", key=lambda s: s.abs(), ascending=False)
    stats = stats.sort_values("Cond", ascending=False)

    y_pos = 0.22
    line_height = 0.08
    for _, r in stats.iterrows():
        if r["n"] <= 1:
            continue
        cond = r["Cond"]
        c = palette.get(cond, "black")
        ci_text = f" ±{r['ci95']:.2f}" if np.isfinite(r["ci95"]) else ""
        line = f"{cond}: μBias={r['mean_bias']:.2f}{ci_text}"
        ax.text(
            0.04, y_pos, line,
            transform=ax.transAxes, va="top", ha="left",
            fontsize=12, color=c, 
        )
        y_pos -= line_height

    # 轴与网格
    ax.set_xlabel("Avg Performance")
    ax.set_ylabel("Bias (Model - Human)")
    ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f"))
    ax.set_ylim(-1.5, 1.)
    ax.grid(alpha=0.25)

    # 若 Expected 是数值列，刻度格式化
    if x_col == "_ExpectedNum_":
        ticks = np.linspace(data[x_col].min(), data[x_col].max()+0.1, num=5)
        ax.set_xticks(ticks)
        ax.set_xticklabels([f"{t:.2f}" for t in ticks], rotation=30, ha="right")

g.map_dataframe(plot_ba_by_cond)

g.set_titles(col_template="{col_name}")

# 统一图例
handles, labels = g.axes.flat[0].get_legend_handles_labels()
uniq = dict(zip(labels, handles))
g.fig.legend(
    uniq.values(), uniq.keys(),
    loc="upper center",
    bbox_to_anchor=(0.5, 1.08),
    ncol=min(len(uniq), 5),
    frameon=True,
)

plt.tight_layout()
sns.despine(trim=True)
# plt.show()
plt.savefig('../../figure/mnread_ba.pdf', dpi=300)

# X-Height

In [20]:
import pandas as pd
final_df_fig = pd.read_csv('../../data/summary/mnread_combined_1021.csv')#final_df.copy()
models = ['human', 'Qwen2.5-VL-3B-Instruct', 'Qwen2.5-VL-7B-Instruct', 'Qwen2.5-VL-32B-Instruct',
          'gpt-4o-mini', 'gpt-4o', 'gpt-5', 'gpt-5-mini', 
          'gemini-2.0-flash', 'gemini-2.5-flash', 'gemini-2.5-pro', 'claude-3-7-sonnet', 'claude-3-5-haiku', 'google','maerec','spts','cogvlm','seeingai',
            'DeepSeek-OCR_Tiny', 'DeepSeek-OCR_Small', 'DeepSeek-OCR_Base', 'DeepSeek-OCR_Large','DeepSeek-OCR_Gundam',
       ]
model_name_dict = {'human':'Human','SeeingAI':'SeeingAI', 'gpt4o':'GPT4o', 'gpt4o_mini':'GPT4o Mini', 'gpt-4o-mini':'GPT4o Mini', 'gpt-4o':'GPT4o', 'gpt-5-mini':'GPT5 Mini', 'gpt-5':'GPT5',
                   'gemini_15_flash':'Gemini-1.5 Flash','gemini_15_pro':'Gemini-1.5 Pro', 'gemini_2_flash':'Gemini-2.0 Flash', 'gemini-2.0-flash':'Gemini-2.0 Flash',
                   'gemini-1.5-pro':'Gemini-1.5 Pro', 'gemini-1.5-flash':'Gemini-1.5 Flash', 'gemini-2.5-pro':'Gemini-2.5 Pro', 'gemini-2.5-flash':'Gemini-2.5 Flash',
                   'claude3_7_sonnet':'Claude3.7 Sonnet', 'claude3_5_haiku':'Claude3.5 Haiku','cogvlm':'CogAgent', 'google': 'Google Vision',
                   'claude-3-7-sonnet':'Claude3.7 Sonnet', 'claude-3-5-haiku':'Claude3.5 Haiku', 'maerec':'DBNet++ & MaeRec',
                   'Qwen2.5-VL-3B-Instruct':'Qwen2.5-VL-3B', 'Qwen2.5-VL-7B-Instruct':'Qwen2.5-VL-7B', 'Qwen2.5-VL-32B-Instruct':'Qwen2.5-VL-32B',
                   'db_maerecB':'DBNet & MAERec-B', 'db_maerecS':'DBNet & MAERec-S', 'dbpp_maerecB':'DBNet++ & MaeRec-B','dbpp_maerecS':'DBNet++ & MaeRec-S',
                   'spts':'SPTS v2', 'ppocr':'PPOCR', 'azure':'Azure', 'gemini-1.5-flash':'Gemini-1.5 Flash', 'google_vision':'Google Vision', 'seeingai':'SeeingAI',
                   'DeepSeek-OCR_Tiny':'DeepSeek-OCR Tiny', 'DeepSeek-OCR_Small':'DeepSeek-OCR Small', 'DeepSeek-OCR_Base':'DeepSeek-OCR Base', 'DeepSeek-OCR_Large':'DeepSeek-OCR Large', 'DeepSeek-OCR_Gundam':'DeepSeek-OCR Gundam',
                   'Qwen2.5-VL-3B-Instruct_text_desc':'Qwen2.5-VL-3B-Instruct (Text)', 'Qwen2.5-VL-7B-Instruct_text_desc':'Qwen2.5-VL-7B-Instruct (Text)', 'Qwen2.5-VL-32B-Instruct_text_desc':'Qwen2.5-VL-32B-Instruct (Text)',
                   'gemini-2.5-flash_text_desc':'Gemini-2.5 Flash (Text)'}
final_df_fig.rename(columns=model_name_dict, inplace=True)
models = [model_name_dict[i] for i in models if i in model_name_dict]
final_df_fig.loc[final_df_fig['Cond']=='Original', models] = final_df_fig.loc[final_df_fig['Cond']=='Original', models].apply(lambda x: 68*10**(x-.9)).round(3)
final_df_fig.loc[final_df_fig['Cond']=='Original', models]

Unnamed: 0,Human,Qwen2.5-VL-3B,Qwen2.5-VL-7B,Qwen2.5-VL-32B,GPT4o Mini,GPT4o,GPT5,GPT5 Mini,Gemini-2.0 Flash,Gemini-2.5 Flash,...,Google Vision,DBNet++ & MaeRec,SPTS v2,CogAgent,SeeingAI,DeepSeek-OCR Tiny,DeepSeek-OCR Small,DeepSeek-OCR Base,DeepSeek-OCR Large,DeepSeek-OCR Gundam
15,5.023,3.758,3.569,3.737,3.487,3.467,3.467,3.913,3.528,3.487,...,5.559,9.226,8.512,7.674,3.883,3.589,3.589,3.631,3.631,3.631


In [11]:
pd.read_csv('../../data/summary/etdrs_combined_1021.csv').columns

Index(['a', 'b', 'VA', 'CS', 'Cond', 'Expected', 'human', 'gpt4o',
       'gpt4o_mini', 'gemini_15_flash', 'gemini_15_pro', 'gemini_2_flash',
       'claude3_7_sonnet', 'claude3_5_haiku', 'cogvlm',
       'Qwen2.5-VL-3B-Instruct', 'Qwen2.5-VL-7B-Instruct',
       'Qwen2.5-VL-32B-Instruct', 'qwen', 'maerec', 'spts', 'ppocr', 'azure',
       'google', 'gemini-2.5-flash', 'gemini-2.5-pro', 'gpt-5-mini', 'gpt-5',
       'DeepSeek-OCR_Small', 'DeepSeek-OCR_Base', 'DeepSeek-OCR_Tiny',
       'DeepSeek-OCR_Large', 'DeepSeek-OCR_Gundam', 'SeeingAI'],
      dtype='object')

In [19]:
etdrs =  pd.read_csv('../../data/summary/etdrs_combined_1021.csv')
models = ['human', 'Qwen2.5-VL-3B-Instruct', 'Qwen2.5-VL-7B-Instruct', 'Qwen2.5-VL-32B-Instruct',
          'gpt-4o-mini', 'gpt-4o', 'gpt-5', 'gpt-5-mini', 
          'gemini-2.0-flash', 'gemini-2.5-flash', 'gemini-2.5-pro', 'claude-3-7-sonnet', 'claude-3-5-haiku', 'google','maerec','spts','cogvlm','seeingai',
            'DeepSeek-OCR_Tiny', 'DeepSeek-OCR_Small', 'DeepSeek-OCR_Base', 'DeepSeek-OCR_Large','DeepSeek-OCR_Gundam',
       ]
model_name_dict = {'human':'Human','SeeingAI':'SeeingAI', 'gpt4o':'GPT4o', 'gpt4o_mini':'GPT4o Mini', 'gpt-4o-mini':'GPT4o Mini', 'gpt-4o':'GPT4o', 'gpt-5-mini':'GPT5 Mini', 'gpt-5':'GPT5',
                   'gemini_15_flash':'Gemini-1.5 Flash','gemini_15_pro':'Gemini-1.5 Pro', 'gemini_2_flash':'Gemini-2.0 Flash', 'gemini-2.0-flash':'Gemini-2.0 Flash',
                   'gemini-1.5-pro':'Gemini-1.5 Pro', 'gemini-1.5-flash':'Gemini-1.5 Flash', 'gemini-2.5-pro':'Gemini-2.5 Pro', 'gemini-2.5-flash':'Gemini-2.5 Flash',
                   'claude3_7_sonnet':'Claude3.7 Sonnet', 'claude3_5_haiku':'Claude3.5 Haiku','cogvlm':'CogAgent', 'google': 'Google Vision',
                   'claude-3-7-sonnet':'Claude3.7 Sonnet', 'claude-3-5-haiku':'Claude3.5 Haiku', 'maerec':'DBNet++ & MaeRec',
                   'Qwen2.5-VL-3B-Instruct':'Qwen2.5-VL-3B', 'Qwen2.5-VL-7B-Instruct':'Qwen2.5-VL-7B', 'Qwen2.5-VL-32B-Instruct':'Qwen2.5-VL-32B',
                   'db_maerecB':'DBNet & MAERec-B', 'db_maerecS':'DBNet & MAERec-S', 'dbpp_maerecB':'DBNet++ & MaeRec-B','dbpp_maerecS':'DBNet++ & MaeRec-S',
                   'spts':'SPTS v2', 'ppocr':'PPOCR', 'azure':'Azure', 'gemini-1.5-flash':'Gemini-1.5 Flash', 'google_vision':'Google Vision', 'seeingai':'SeeingAI',
                   'DeepSeek-OCR_Tiny':'DeepSeek-OCR Tiny', 'DeepSeek-OCR_Small':'DeepSeek-OCR Small', 'DeepSeek-OCR_Base':'DeepSeek-OCR Base', 'DeepSeek-OCR_Large':'DeepSeek-OCR Large', 'DeepSeek-OCR_Gundam':'DeepSeek-OCR Gundam',
                   'Qwen2.5-VL-3B-Instruct_text_desc':'Qwen2.5-VL-3B-Instruct (Text)', 'Qwen2.5-VL-7B-Instruct_text_desc':'Qwen2.5-VL-7B-Instruct (Text)', 'Qwen2.5-VL-32B-Instruct_text_desc':'Qwen2.5-VL-32B-Instruct (Text)',
                   'gemini-2.5-flash_text_desc':'Gemini-2.5 Flash (Text)'}
etdrs.rename(columns=model_name_dict, inplace=True)
models = [model_name_dict[i] for i in models if i in model_name_dict]
etdrs.loc[etdrs['Cond']=='Original', models] = etdrs.loc[etdrs['Cond']=='Original', models].apply(lambda x: 248*10**(x -1.7)).round(3)
etdrs.loc[etdrs['Cond']=='Original', models]


Unnamed: 0,Human,Qwen2.5-VL-3B,Qwen2.5-VL-7B,Qwen2.5-VL-32B,GPT4o Mini,GPT4o,GPT5,GPT5 Mini,Gemini-2.0 Flash,Gemini-2.5 Flash,...,Google Vision,DBNet++ & MaeRec,SPTS v2,CogAgent,SeeingAI,DeepSeek-OCR Tiny,DeepSeek-OCR Small,DeepSeek-OCR Base,DeepSeek-OCR Large,DeepSeek-OCR Gundam
15,4.116,5.103,4.583,4.799,4.244,4.376,4.053,4.244,4.654,4.583,...,5.552,4.513,82.121,7.152,4.116,17.692,26.778,7.375,8.087,8.087


In [18]:
total =  pd.read_csv('../../data/summary/totaltext_combined_251007.csv')
models = ['human', 'Qwen2.5-VL-3B-Instruct', 'Qwen2.5-VL-7B-Instruct', 'Qwen2.5-VL-32B-Instruct',
          'gpt-4o-mini', 'gpt-4o', 'gpt-5', 'gpt-5-mini', 
          'gemini-2.0-flash', 'gemini-2.5-flash', 'gemini-2.5-pro', 'claude-3-7-sonnet', 'claude-3-5-haiku', 'google_vision','maerec','spts','cogvlm','seeingai',
            'DeepSeek-OCR_Tiny', 'DeepSeek-OCR_Small', 'DeepSeek-OCR_Base', 'DeepSeek-OCR_Large','DeepSeek-OCR_Gundam',
       ]
model_name_dict = {'human':'Human','SeeingAI':'SeeingAI', 'gpt4o':'GPT4o', 'gpt4o_mini':'GPT4o Mini', 'gpt-4o-mini':'GPT4o Mini', 'gpt-4o':'GPT4o', 'gpt-5-mini':'GPT5 Mini', 'gpt-5':'GPT5',
                   'gemini_15_flash':'Gemini-1.5 Flash','gemini_15_pro':'Gemini-1.5 Pro', 'gemini_2_flash':'Gemini-2.0 Flash', 'gemini-2.0-flash':'Gemini-2.0 Flash',
                   'gemini-1.5-pro':'Gemini-1.5 Pro', 'gemini-1.5-flash':'Gemini-1.5 Flash', 'gemini-2.5-pro':'Gemini-2.5 Pro', 'gemini-2.5-flash':'Gemini-2.5 Flash',
                   'claude3_7_sonnet':'Claude3.7 Sonnet', 'claude3_5_haiku':'Claude3.5 Haiku','cogvlm':'CogAgent',
                   'claude-3-7-sonnet':'Claude3.7 Sonnet', 'claude-3-5-haiku':'Claude3.5 Haiku', 'maerec':'DBNet++ & MaeRec',
                   'Qwen2.5-VL-3B-Instruct':'Qwen2.5-VL-3B', 'Qwen2.5-VL-7B-Instruct':'Qwen2.5-VL-7B', 'Qwen2.5-VL-32B-Instruct':'Qwen2.5-VL-32B',
                   'db_maerecB':'DBNet & MAERec-B', 'db_maerecS':'DBNet & MAERec-S', 'dbpp_maerecB':'DBNet++ & MaeRec-B','dbpp_maerecS':'DBNet++ & MaeRec-S',
                   'spts':'SPTS v2', 'ppocr':'PPOCR', 'azure':'Azure', 'gemini-1.5-flash':'Gemini-1.5 Flash', 'google_vision':'Google Vision', 'seeingai':'SeeingAI',
                   'DeepSeek-OCR_Tiny':'DeepSeek-OCR Tiny', 'DeepSeek-OCR_Small':'DeepSeek-OCR Small', 'DeepSeek-OCR_Base':'DeepSeek-OCR Base', 'DeepSeek-OCR_Large':'DeepSeek-OCR Large', 'DeepSeek-OCR_Gundam':'DeepSeek-OCR Gundam',
                   'Qwen2.5-VL-3B-Instruct_text_desc':'Qwen2.5-VL-3B-Instruct (Text)', 'Qwen2.5-VL-7B-Instruct_text_desc':'Qwen2.5-VL-7B-Instruct (Text)', 'Qwen2.5-VL-32B-Instruct_text_desc':'Qwen2.5-VL-32B-Instruct (Text)',
                   'gemini-2.5-flash_text_desc':'Gemini-2.5 Flash (Text)'}
total.rename(columns=model_name_dict, inplace=True)
models = [model_name_dict[i] for i in models if i in model_name_dict]
total.loc[total['Cond']=='Original', models]

Unnamed: 0,Human,Qwen2.5-VL-3B,Qwen2.5-VL-7B,Qwen2.5-VL-32B,GPT4o Mini,GPT4o,GPT5,GPT5 Mini,Gemini-2.0 Flash,Gemini-2.5 Flash,...,Google Vision,DBNet++ & MaeRec,SPTS v2,CogAgent,SeeingAI,DeepSeek-OCR Tiny,DeepSeek-OCR Small,DeepSeek-OCR Base,DeepSeek-OCR Large,DeepSeek-OCR Gundam
15,0.934101,0.915296,0.935455,0.953409,0.941156,0.934877,0.953333,0.916905,0.920358,0.824762,...,0.927681,0.655564,0.900532,0.681925,0.951891,0.784236,0.876908,0.918588,0.902996,0.927857


In [17]:
df = pd.concat([etdrs.loc[etdrs['Cond']=='Original', models],
                final_df_fig.loc[final_df_fig['Cond']=='Original', models],
                total.loc[total['Cond']=='Original', models]], axis=0).T
df.columns = ['ETDRS','MNREAD', 'TOTALTEXT']
df.sort_values(by=['MNREAD'], ascending=True, inplace=True)
## only keep 3 digits
df.round(3)

Unnamed: 0,ETDRS,MNREAD,TOTALTEXT
GPT5,4.053,3.467,0.953
GPT4o,4.376,3.467,0.935
GPT4o Mini,4.244,3.487,0.941
Gemini-2.5 Flash,4.583,3.487,0.825
Gemini-2.5 Pro,6.624,3.487,0.889
Gemini-2.0 Flash,4.654,3.528,0.92
Qwen2.5-VL-7B,4.583,3.569,0.935
DeepSeek-OCR Small,26.778,3.589,0.877
DeepSeek-OCR Tiny,17.692,3.589,0.784
DeepSeek-OCR Large,8.087,3.631,0.903
