# Plotting the results

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Base directory for every data path below: the parent of the current working directory.
path = os.path.dirname(os.getcwd())

# Perline & Whole Files (Simple prompt LLMs vs. OCR/HTR)

In [None]:
# Whole-scan score tables: BLEU (n=4 files) and CER, one CSV per text-normalization variant.
bleu_whole_normalized = pd.read_csv(f'{path}/results/scores_comparisons/bleu_whole-scan_all_n4_normalized.csv')
bleu_whole_onlystripped = pd.read_csv(f'{path}/results/scores_comparisons/bleu_whole-scan_all_n4_onlystripped.csv')
bleu_whole_noheader = pd.read_csv(f'{path}/results/scores_comparisons/bleu_whole-scan_all_n4_normalized_noheader.csv')

cer_whole_normalized = pd.read_csv(f'{path}/results/scores_comparisons/cer_whole-scan_all_normalized.csv')
cer_whole_onlystripped = pd.read_csv(f'{path}/results/scores_comparisons/cer_whole-scan_all_onlystripped.csv')
cer_whole_noheader = pd.read_csv(f'{path}/results/scores_comparisons/cer_whole-scan_all_normalized_noheader.csv')

In [None]:
# Per-line BLEU tables: normalized text for n-gram files n4/n3/n2, stripped-only text for n3/n4.
bleu_n4 = pd.read_csv(f'{path}/results/scores_comparisons/bleu_perline_all_n4_normalized.csv')
bleu_n3 = pd.read_csv(f'{path}/results/scores_comparisons/bleu_perline_all_n3_normalized.csv')
bleu_n2 = pd.read_csv(f'{path}/results/scores_comparisons/bleu_perline_all_n2_normalized.csv')

bleu_n3_onlystripped = pd.read_csv(f'{path}/results/scores_comparisons/bleu_perline_all_n3_onlystripped.csv')
bleu_n4_onlystripped = pd.read_csv(f'{path}/results/scores_comparisons/bleu_perline_all_n4_onlystripped.csv')

In [5]:
# Per-line CER on normalized text.
# NOTE(review): the original note read "both lower and uncapitalized"; presumably
# "normalized" means lowered + unidecoded — TODO confirm.
cer_normalized = pd.read_csv(f'{path}/results/scores_comparisons/cer_perline_all_normalized.csv')

# Per-line tables from the "noheader" files (BLEU n3 and CER).
bleu_n3_noheader = pd.read_csv(f'{path}/results/scores_comparisons/bleu_perline_all_n3_normalized_noheader.csv')
cer_noheader = pd.read_csv(f'{path}/results/scores_comparisons/cer_perline_all_normalized_noheader.csv')

In [None]:
# Box plot of one score table: one box per model, ordered LLM prompts | OCR | fine-tuned OCR.
# Choose the table and the matching score column by (un)commenting the lines below.
# data = bleu_whole_normalized
# data = cer_whole_normalized
data = cer_normalized
# score = 'bleu'
score = 'cer'

y_axis_name = f'{score.upper()} Score'

# Display order of the models on the x-axis.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR']
ocr_ft_order = ['TrOCR20', 'TrOCR50']

# One colour per model family.
gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
ocr_ft_color = px.colors.qualitative.Set2[3]


fig = go.Figure()

# Pair every model with its family colour; any LLM entry without 'gpt' in its
# name is coloured as Claude, exactly as before.
model_colors = [(model, gpt_color if 'gpt' in model else claude_color) for model in llm_order]
model_colors += [(model, ocr_color) for model in ocr_order]
model_colors += [(model, ocr_ft_color) for model in ocr_ft_order]

for model, color in model_colors:
    model_rows = data[data['model'] == model]
    fig.add_trace(go.Box(
        x=model_rows['model'],
        y=model_rows[score],
        name=model,
        boxmean=True,
        marker=dict(color=color)
    ))


ocr_start_index = len(llm_order)
ocr_ft_start_index = ocr_start_index + len(ocr_order)
# Categories sit at integer positions 0..n-1 on the x-axis, so a group of k
# boxes that starts at position s is centred at s + (k - 1) / 2. The previous
# k / 2 placed both group labels half a category too far to the right.
llm_midpoint = (len(llm_order) - 1) / 2
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2


fig.update_layout(
    # title='BLEU Scores (N=3)',
    # xaxis_title='Model',
    yaxis_title=y_axis_name,
    # margin=dict(l=10, r=10, t=10, b=10),
    plot_bgcolor='rgba(0,0,0,0)',
    # paper_bgcolor='rgba(0,0,0,0)',
    font_color='black',
    xaxis=dict(
        categoryorder='array',
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    # NOTE(review): this fixed zoom is applied whatever `score` is; it looks
    # tuned for CER — confirm before plotting BLEU with it.
    yaxis=dict(range=[-0.5, 2]),
    showlegend=False,
    # legend=dict(
    #             orientation="h",
    #             entrywidth=70,
    #             yanchor="bottom",
    #             y=1.02,
    #             xanchor="right",
    #             x=1),
    shapes=[
        # A vertical line between LLM and OCR models (halfway between the last
        # LLM category and the first OCR category).
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="black", width=1)
        )
    ],
    annotations=[
        # Group labels placed above the plot area (y is in paper coordinates).
        dict(
            x=llm_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<LLMs>',
            showarrow=False,
            font=dict(size=14)
        ),
        dict(
            x=ocr_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<OCRs>',
            showarrow=False,
            font=dict(size=14)
        )
    ]
)
# fig.update_xaxes(rangeselector_font_size=10)

fig.show()



In [None]:
# Overlay, per model, the BLEU distributions of the N2, N3 and N4 tables.
bleu_n2['source'] = 'N2'
bleu_n3['source'] = 'N3'
bleu_n4['source'] = 'N4'
# bleu_n3_noheader['source'] = 'N3 No Header'


combined_bleu = pd.concat([bleu_n2, bleu_n3, bleu_n4])
# combined_bleu = pd.concat([bleu_n3, bleu_n3_noheader])

n2_color = "rgba(143,206,0, 0.5)"  # green
n3_color = "rgba(67, 162, 202, 0.5)"  # semi-transparent blue
df_color = "rgba(250, 159, 181, 0.3)"  # lighter pink with more transparency

# Source label -> box colour, in drawing order.
source_palette = [('N2', n2_color), ('N3', n3_color), ('N4', df_color)]
# source_palette = [('N3', n3_color), ('N3 No Header', df_color)]


fig = go.Figure()

for model in llm_order + ocr_order + ocr_ft_order:
    model_rows = combined_bleu[combined_bleu['model'] == model]
    for source, color in source_palette:
        source_rows = model_rows[model_rows['source'] == source]
        fig.add_trace(go.Box(
            x=source_rows['model'],
            y=source_rows['bleu'],
            # name=f"{model} ({source})",
            boxmean=True,
            marker=dict(color=color),
            showlegend=False
        ))

# Data-less placeholder traces, one per source, carrying the source name and
# colour. They stay hidden from the legend as before; the first one used to
# read `showlegend=Falase`, which raised NameError and stopped the cell.
for source, color in source_palette:
    fig.add_trace(go.Box(
        y=[None],
        name=source,
        marker=dict(color=color),
        showlegend=False
    ))

fig.update_layout(
    # title='Comparison of BLEU Scores (N=2 vs. N=3 vs. N=4, All normalized)',
    xaxis_title='Model',
    yaxis_title='BLEU Score',
    font_color='black',
    plot_bgcolor='rgba(0,0,0,0)',
    # paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    shapes=[
        # Vertical divider between the LLM categories and the OCR categories.
        dict(
            type="line",
            x0=len(llm_order) - 0.5,
            x1=len(llm_order) - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="black", width=2)
        )
    ]
)

fig.show()


# Impact of different normalizations for perline

In [None]:
# bleu_n3 = pd.read_csv(path + '/results/scores_comparisons/bleu_perline_all_n3_onlystripped.csv')
# bleu_n3_lower = pd.read_csv(path + '/results/scores_comparisons/bleu_perline_all_n3_lowered.csv')
# bleu_n3_unidecoded = pd.read_csv(path + '/results/scores_comparisons/bleu_perline_all_n3_unidecoded.csv')
# bleu_n3_normalized = pd.read_csv(path + '/results/scores_comparisons/bleu_perline_all_n3_normalized.csv') #both lower and uncapitalized

# cer_all = pd.read_csv(path + '/results/scores_comparisons/cer_perline_all_onlystripped.csv')
# cer_lower = pd.read_csv(path + '/results/scores_comparisons/cer_perline_all_lowered.csv')
# cer_unidecoded = pd.read_csv(path + '/results/scores_comparisons/cer_perline_all_unidecoded.csv')
# cer_normalized = pd.read_csv(path + '/results/scores_comparisons/cer_perline_all_normalized.csv') #both lower and uncapitalized

In [20]:
# Whole-scan score tables for each text-normalization variant.
# NOTE(review): the names say "n3" but the files read here are the whole-scan n4 BLEU tables.
# NOTE(review): `bleu_n3` and `cer_normalized` are also assigned from the per-line CSVs in
# earlier cells; running this cell rebinds them, so any cell executed afterwards that uses
# those names (the aggregation, t-test and Perline-vs-Whole cells) sees whole-scan data.
bleu_n3 = pd.read_csv(path + '/results/scores_comparisons/bleu_whole-scan_all_n4_onlystripped.csv')
bleu_n3_lower = pd.read_csv(path + '/results/scores_comparisons/bleu_whole-scan_all_n4_lowered.csv')
bleu_n3_unidecoded = pd.read_csv(path + '/results/scores_comparisons/bleu_whole-scan_all_n4_unidecoded.csv')
bleu_n3_normalized = pd.read_csv(path + '/results/scores_comparisons/bleu_whole-scan_all_n4_normalized.csv') #both lower and uncapitalized

cer_all = pd.read_csv(path + '/results/scores_comparisons/cer_whole-scan_all_onlystripped.csv')
cer_lower = pd.read_csv(path + '/results/scores_comparisons/cer_whole-scan_all_lowered.csv')
cer_unidecoded = pd.read_csv(path + '/results/scores_comparisons/cer_whole-scan_all_unidecoded.csv')
cer_normalized = pd.read_csv(path + '/results/scores_comparisons/cer_whole-scan_all_normalized.csv') #both lower and uncapitalized

In [None]:
# Tag each normalization variant of the BLEU table, then stack them into one long table.
for frame, label in (
    (bleu_n3, 'Only Stripped'),
    (bleu_n3_lower, 'Lowered'),
    (bleu_n3_unidecoded, 'Unidecoded'),
    (bleu_n3_normalized, 'Normalized'),
):
    frame['source'] = label

combined_bleu = pd.concat([bleu_n3, bleu_n3_lower, bleu_n3_unidecoded, bleu_n3_normalized])


# Display order on the x-axis: LLM prompts, then OCR models, then fine-tuned OCR.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR']
ocr_ft_order = ['TrOCR20', 'TrOCR50']
model_order = llm_order + ocr_order + ocr_ft_order


n3_color = "rgba(0,0,0, 0.5)"  # Gray
lower_color = "rgba(143,206,0, 0.5)"  # Green
unide_color = "rgba(67, 162, 202, 0.5)"  # Blue
norm_color = "rgba(250, 159, 181, 0.3)"  # Pink


# Variant label -> box colour; the insertion order is also the drawing order.
source_colors = {
    'Only Stripped': n3_color,
    'Lowered': lower_color,
    'Unidecoded': unide_color,
    'Normalized': norm_color
}


fig = go.Figure()

x_positions = []
tick_labels = []

# Every (model, variant) pair gets its own integer x slot, so the four
# variants of a model are drawn side by side.
x_index = 0
for model in model_order:
    model_rows = combined_bleu[combined_bleu['model'] == model]
    for source, color in source_colors.items():
        variant_scores = model_rows[model_rows['source'] == source]['bleu']
        fig.add_trace(go.Box(
            x=[x_index] * len(variant_scores),
            y=variant_scores,
            name=f"{model} ({source})",
            boxmean=True,
            marker=dict(color=color),
            showlegend=False
        ))
        x_positions.append(x_index)
        x_index += 1

    tick_labels.append(model)

# Data-less placeholder traces named after each variant (hidden from the legend).
for source, color in source_colors.items():
    fig.add_trace(go.Box(
        y=[None],
        name=source,
        marker=dict(color=color),
        showlegend=False
    ))

# One tick per model, centred under its four slots (start + 1.5).
model_tick_positions = [slot_start + 1.5 for slot_start in range(0, 4 * len(model_order), 4)]
# Divider halfway between the last LLM slot and the first OCR slot.
llm_ocr_divider = len(llm_order) * 4 - 0.5

fig.update_layout(
    yaxis_title='BLEU Score',
    plot_bgcolor='rgba(0,0,0,0)',
    font_color='black',
    xaxis=dict(
        tickvals=model_tick_positions,
        ticktext=tick_labels,
        tickangle=45
    ),
    shapes=[
        dict(
            type="line",
            x0=llm_ocr_divider,
            x1=llm_ocr_divider,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="black", width=1)
        )
    ]
)

fig.show()


In [None]:
# Tag each normalization variant of the CER table, then stack them into one long table.
for frame, label in (
    (cer_all, 'Only Stripped'),
    (cer_lower, 'Lowered'),
    (cer_unidecoded, 'Unidecoded'),
    (cer_normalized, 'Normalized'),
):
    frame['source'] = label

combined_cer = pd.concat([cer_all, cer_lower, cer_unidecoded, cer_normalized])

# Display order on the x-axis: LLM prompts, then OCR models, then fine-tuned OCR.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR']
ocr_ft_order = ['TrOCR20', 'TrOCR50']
model_order = llm_order + ocr_order + ocr_ft_order

n3_color = "rgba(0,0,0, 0.5)"  # Gray
lower_color = "rgba(143,206,0, 0.5)"  # Green
unide_color = "rgba(67, 162, 202, 0.5)"  # Blue
norm_color = "rgba(250, 159, 181, 0.3)"  # Pink

# Variant label -> box colour; the insertion order is also the drawing order.
source_colors = {
    'Only Stripped': n3_color,
    'Lowered': lower_color,
    'Unidecoded': unide_color,
    'Normalized': norm_color
}


fig = go.Figure()

x_positions = []
tick_labels = []

# Every (model, variant) pair gets its own integer x slot, so the four
# variants of a model are drawn side by side.
x_index = 0
for model in model_order:
    model_rows = combined_cer[combined_cer['model'] == model]
    for source, color in source_colors.items():
        variant_scores = model_rows[model_rows['source'] == source]['cer']
        fig.add_trace(go.Box(
            x=[x_index] * len(variant_scores),
            y=variant_scores,
            name=f"{model} ({source})",
            boxmean=True,
            marker=dict(color=color),
            showlegend=False
        ))
        x_positions.append(x_index)
        x_index += 1

    tick_labels.append(model)

# Data-less placeholder traces named after each variant (hidden from the legend).
for source, color in source_colors.items():
    fig.add_trace(go.Box(
        y=[None],
        name=source,
        marker=dict(color=color),
        showlegend=False
    ))


# One tick per model, centred under its four slots (start + 1.5).
model_tick_positions = [slot_start + 1.5 for slot_start in range(0, 4 * len(model_order), 4)]
# Divider halfway between the last LLM slot and the first OCR slot.
llm_ocr_divider = len(llm_order) * 4 - 0.5

fig.update_layout(
    yaxis_title='CER Score',
    # yaxis = dict(range=[-0.5, 2]),
    plot_bgcolor='rgba(0,0,0,0)',
    font_color='black',
    xaxis=dict(
        tickvals=model_tick_positions,
        ticktext=tick_labels,
        tickangle=45
    ),
    shapes=[
        dict(
            type="line",
            x0=llm_ocr_divider,
            x1=llm_ocr_divider,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="black", width=1)
        )
    ]
)

fig.show()


# Header-NoHeader (Whole)

In [None]:

# Compare the whole-scan "normalized" table against its "noheader" counterpart
# for the one-/two-example prompts. Swap in the BLEU tables (and score = 'bleu')
# to plot BLEU instead.
score = 'cer'
d1, d2 = cer_whole_normalized, cer_whole_noheader
# d1, d2 = bleu_whole_normalized, bleu_whole_noheader
d1['source'] = 'Header'
d2['source'] = 'No Header'

# The name is historical: this holds whichever metric `score` selects.
combined_bleu = pd.concat([d1, d2])

n2_color = "rgba(143,206,0, 0.5)"  # green
n3_color = "rgba(67, 162, 202, 0.5)"  # semi-transparent blue
df_color = "rgba(250, 159, 181, 0.3)"  # lighter pink with more transparency

# Source label -> box colour, in drawing order.
header_palette = [('Header', n3_color), ('No Header', df_color)]
example_models = ['gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example']


fig = go.Figure()

for model in example_models:
    model_rows = combined_bleu[combined_bleu['model'] == model]
    for source, color in header_palette:
        source_rows = model_rows[model_rows['source'] == source]
        fig.add_trace(go.Box(
            x=source_rows['model'],
            y=source_rows[score],
            boxmean=True,
            marker=dict(color=color),
            showlegend=False
        ))


# Data-less placeholder traces named after each source (hidden from the legend).
for source, color in header_palette:
    fig.add_trace(go.Box(
        y=[None],
        name=source,
        marker=dict(color=color),
        showlegend=False
    ))

fig.update_layout(
    xaxis_title='Model',
    yaxis_title=f'{score.upper()} Score',
    plot_bgcolor='rgba(0,0,0,0)',
    font_size=20,
    font_color='black',
)

fig.show()


# Aggregated Scores for Perline Files

In [8]:
# Per-model mean and standard deviation of BLEU, leaving out the text-example prompt variants.
text_example_models = ['gpt_one_text_example', 'gpt_two_text_example', 'claude_one_text_example', 'claude_two_text_example']
is_text_example = bleu_n3['model'].isin(text_example_models)
bleu_n3_filtered = bleu_n3[~is_text_example]
agg_bleu = bleu_n3_filtered.groupby('model')['bleu'].agg(['mean', 'std']).reset_index()
agg_bleu.columns = ['model', 'mean', 'std']

In [9]:
# Rank models from highest to lowest mean BLEU and renumber the rows.
agg_bleu = agg_bleu.sort_values(by='mean', ascending=False)
agg_bleu = agg_bleu.reset_index(drop=True)
agg_bleu


Unnamed: 0,model,mean,std
0,gpt_two_example,0.524193,0.378255
1,gpt_one_example,0.517386,0.385146
2,gpt_complex,0.463117,0.367535
3,gpt_refine_complex,0.459769,0.360898
4,claude_complex,0.395144,0.390113
5,claude_refine_complex,0.351223,0.367406
6,claude_two_example,0.348464,0.355716
7,gpt_simple,0.30711,0.295711
8,claude_one_example,0.251642,0.309429
9,claude_simple,0.184442,0.238628


In [11]:
# Per-model mean and standard deviation of CER, leaving out the text-example prompt variants.
text_example_models = ['gpt_one_text_example', 'gpt_two_text_example', 'claude_one_text_example', 'claude_two_text_example']
is_text_example = cer_normalized['model'].isin(text_example_models)
cer_df_filtered = cer_normalized[~is_text_example]
agg_cer = cer_df_filtered.groupby('model')['cer'].agg(['mean', 'std']).reset_index()
agg_cer.columns = ['model', 'mean', 'std']
# Ascending: a lower CER means a better transcription.
agg_cer = agg_cer.sort_values(by='mean', ascending=True)
agg_cer = agg_cer.reset_index(drop=True)
agg_cer

Unnamed: 0,model,mean,std
0,gpt_one_example,0.207454,0.195313
1,gpt_refine_complex,0.228099,0.18898
2,gpt_complex,0.229064,0.189635
3,claude_complex,0.27885,0.22169
4,gpt_two_example,0.391542,2.644353
5,claude_one_example,0.412111,0.677461
6,gpt_simple,0.413126,0.490681
7,claude_simple,0.498566,0.445572
8,claude_refine_complex,0.575715,3.80459
9,TrOCR50,0.610648,0.829771


# T-test (Comparisons between the models)

In [17]:
import scipy.stats as stats
import itertools

In [24]:
def ttest(score_df, models, score = 'bleu', ind = 'id'):

    paired_t = []

    for model1, model2 in itertools.pairwise(models): #if you want all pairs with respect to the order
    # for model1, model2 in itertools.combinations(models, 2): #if you want all pairs
        t1 = score_df[score_df['model'] == model1].set_index(ind)[score]
        t2 = score_df[score_df['model'] == model2].set_index(ind)[score]

        # Because the outputs of the 1 or 2 example prompts have different lengths as the others.
        paired_scores = t1.align(t2, join='inner')
        
        t_stat, p_value = stats.ttest_rel(paired_scores[0], paired_scores[1])
        paired_t.append({'Model 1': model1, 'Model 2': model2, 't-statistic': t_stat, 'p-value': p_value})

        paired_t_df = pd.DataFrame(paired_t)
        paired_t_df['significance'] = paired_t_df['p-value'].apply(lambda x: 'Significant' if x < 0.05 else 'Not Significant')

    return paired_t_df

In [19]:
# Model names in ranking order; `ttest` compares each model with the next one in this order.
models_bleu = agg_bleu['model'].unique() # Already sorted by mean
models_cer = agg_cer['model'].unique() # Already sorted by mean

In [25]:
# Paired t-tests between rank-adjacent models, with observations paired on the 'id' column.
paired_t_bleu = ttest(bleu_n3, models_bleu, score = 'bleu', ind='id')
paired_t_cer = ttest(cer_normalized, models_cer, score = 'cer', ind='id')

In [26]:
paired_t_bleu

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,gpt_two_example,gpt_one_example,1.290513,0.1979373,Not Significant
1,gpt_one_example,gpt_complex,3.841105,0.0001514299,Significant
2,gpt_complex,gpt_refine_complex,0.664388,0.5069849,Not Significant
3,gpt_refine_complex,claude_complex,4.221433,3.277621e-05,Significant
4,claude_complex,claude_refine_complex,5.089663,6.562659e-07,Significant
5,claude_refine_complex,claude_two_example,0.074633,0.9405597,Not Significant
6,claude_two_example,gpt_simple,2.139563,0.03325459,Significant
7,gpt_simple,claude_one_example,2.487401,0.01344822,Significant
8,claude_one_example,claude_simple,4.092143,5.589564e-05,Significant
9,claude_simple,TrOCR50,3.189761,0.001784786,Significant


In [27]:
paired_t_cer

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,gpt_one_example,gpt_refine_complex,-3.582563,0.0004008886,Significant
1,gpt_refine_complex,gpt_complex,-0.384524,0.70088,Not Significant
2,gpt_complex,claude_complex,-6.542774,2.836021e-10,Significant
3,claude_complex,gpt_two_example,-0.710113,0.4782251,Not Significant
4,gpt_two_example,claude_one_example,-0.173735,0.8621991,Not Significant
5,claude_one_example,gpt_simple,-0.038572,0.9692593,Not Significant
6,gpt_simple,claude_simple,-3.969554,9.143419e-05,Significant
7,claude_simple,claude_refine_complex,-0.338702,0.7350861,Not Significant
8,claude_refine_complex,TrOCR50,-3.371011,0.0009865257,Significant
9,TrOCR50,claude_two_example,-0.034621,0.9724353,Not Significant


In [72]:
# sig = paired_t_df[paired_t_df['p-value'] < 0.05].sort_values('p-value') 
# The paired t-test is computed on the per-item differences Model 1 - Model 2.
# The sign of t follows the sign of the mean difference; the p-value depends only on |t|.

In [73]:
# pd.crosstab(sig['Model 1'], sig['Model 2'])

Model 2,TrOCR,TrOCR20,claude_refine,claude_two_example
Model 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claude_complex,0,0,0,1
claude_simple,0,1,0,0
gpt_simple,0,0,1,0
pytesseractOCR,1,0,0,0


## Whole Scans

In [16]:
# Per-model mean and standard deviation of whole-scan BLEU, leaving out the text-example prompt variants.
text_example_models = ['gpt_one_text_example', 'gpt_two_text_example', 'claude_one_text_example', 'claude_two_text_example']
is_text_example = bleu_whole_normalized['model'].isin(text_example_models)
bleu_df_whole_filtered = bleu_whole_normalized[~is_text_example]
agg_bleu_whole = bleu_df_whole_filtered.groupby('model')['bleu'].agg(['mean', 'std']).reset_index()
agg_bleu_whole.columns = ['model', 'mean', 'std']
# Descending: a higher BLEU means a better transcription.
agg_bleu_whole = agg_bleu_whole.sort_values(by='mean', ascending=False)
agg_bleu_whole = agg_bleu_whole.reset_index(drop=True)
agg_bleu_whole

Unnamed: 0,model,mean,std
0,gpt_complex,0.254013,0.096907
1,claude_complex,0.253398,0.062715
2,gpt_refine_complex,0.25034,0.120085
3,gpt_two_example,0.222694,0.066262
4,gpt_one_example,0.209717,0.099903
5,claude_refine_complex,0.118911,0.104991
6,Pytesseract,0.070828,0.017068
7,claude_one_example,0.045468,0.03665
8,claude_two_example,0.038366,0.024823
9,claude_simple,0.0293,0.025115


In [17]:
# Per-model mean and standard deviation of whole-scan CER, leaving out the text-example prompt variants.
text_example_models = ['gpt_one_text_example', 'gpt_two_text_example', 'claude_one_text_example', 'claude_two_text_example']
is_text_example = cer_whole_normalized['model'].isin(text_example_models)
cer_df_whole_filtered = cer_whole_normalized[~is_text_example]
agg_cer_whole = cer_df_whole_filtered.groupby('model')['cer'].agg(['mean', 'std']).reset_index()
agg_cer_whole.columns = ['model', 'mean', 'std']
# Ascending: a lower CER means a better transcription.
agg_cer_whole = agg_cer_whole.sort_values(by='mean', ascending=True)
agg_cer_whole = agg_cer_whole.reset_index(drop=True)
agg_cer_whole

Unnamed: 0,model,mean,std
0,claude_complex,0.614534,0.033108
1,gpt_refine_complex,0.689521,0.092484
2,gpt_complex,0.69252,0.09113
3,gpt_one_example,0.709355,0.045403
4,claude_simple,0.714512,0.031391
5,claude_one_example,0.716474,0.038162
6,Pytesseract,0.71978,0.019373
7,gpt_two_example,0.726813,0.145369
8,claude_two_example,0.727934,0.021076
9,claude_refine_complex,0.736931,0.170093


In [55]:
# Whole-scan counterpart of the per-line t-tests: take the models in ranking order,
# then compare each model with the next one, pairing observations on the 'id' column.
models_bleu_whole = agg_bleu_whole['model'].unique() # Already sorted by mean
models_cer_whole = agg_cer_whole['model'].unique() # Already sorted by mean
paired_t_bleu_whole = ttest(bleu_whole_normalized, models_bleu_whole, score = 'bleu', ind='id')
paired_t_cer_whole = ttest(cer_whole_normalized, models_cer_whole, score = 'cer', ind='id')

In [56]:
paired_t_bleu_whole

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,gpt_complex,claude_complex,0.02265,0.982178,Not Significant
1,claude_complex,gpt_refine_complex,0.097869,0.923118,Not Significant
2,gpt_refine_complex,gpt_two_example,0.708038,0.489109,Not Significant
3,gpt_two_example,gpt_one_example,0.676515,0.508376,Not Significant
4,gpt_one_example,claude_refine_complex,3.203083,0.005213,Significant
5,claude_refine_complex,Pytesseract,1.923254,0.070413,Not Significant
6,Pytesseract,claude_one_example,2.854551,0.010968,Significant
7,claude_one_example,claude_two_example,1.320977,0.205091,Not Significant
8,claude_two_example,claude_simple,1.016916,0.324325,Not Significant
9,claude_simple,EasyOCR,1.142991,0.268017,Not Significant


In [57]:
paired_t_cer_whole

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,claude_complex,gpt_refine_complex,-3.311491,0.003882,Significant
1,gpt_refine_complex,gpt_complex,-0.09701,0.92379,Not Significant
2,gpt_complex,gpt_one_example,-0.835826,0.414847,Not Significant
3,gpt_one_example,claude_simple,-0.373086,0.713695,Not Significant
4,claude_simple,claude_one_example,-0.108966,0.914505,Not Significant
5,claude_one_example,Pytesseract,-0.274473,0.787029,Not Significant
6,Pytesseract,gpt_two_example,-0.234591,0.817502,Not Significant
7,gpt_two_example,claude_two_example,-0.031813,0.975015,Not Significant
8,claude_two_example,claude_refine_complex,-0.369294,0.71675,Not Significant
9,claude_refine_complex,TrOCR20,-0.299094,0.769262,Not Significant


# Difference between Perline and Whole

In [None]:
# Overlay, per model, the per-line and whole-scan distributions of one score.
# NOTE(review): `cer_normalized` (and `bleu_n3`) are also reassigned from whole-scan CSVs
# in the normalization-comparison cell; make sure they hold the per-line tables when
# this cell runs, otherwise both sources are whole-scan data.
# d1 = bleu_n3
# d2 = bleu_whole_normalized
d1 = cer_normalized
d2 = cer_whole_normalized

d1['source'] = 'Perline'
d2['source'] = 'Whole'

score = 'cer'  # Change depending on the dataset

# The name is historical: this holds whichever metric `score` selects.
combined_bleu = pd.concat([d1, d2])

# perline_color = "rgba(143,206,0, 0.5)"  # green
whole_color = "rgba(67, 162, 202, 0.5)"  # semi-transparent blue
perline_color = "rgba(250, 159, 181, 0.3)"  # lighter pink with more transparency

# Source label -> box colour, in drawing order.
source_palette = [('Perline', perline_color), ('Whole', whole_color)]


fig = go.Figure()

for model in llm_order + ocr_order + ocr_ft_order:
    model_rows = combined_bleu[combined_bleu['model'] == model]
    for source, color in source_palette:
        source_rows = model_rows[model_rows['source'] == source]
        fig.add_trace(go.Box(
            x=source_rows['model'],
            y=source_rows[score],
            boxmean=True,
            marker=dict(color=color),
            showlegend=False
        ))


ocr_start_index = len(llm_order)
ocr_ft_start_index = ocr_start_index + len(ocr_order)
# Categories sit at integer positions 0..n-1 on the x-axis, so a group of k
# boxes that starts at position s is centred at s + (k - 1) / 2. The previous
# k / 2 placed both group labels half a category too far to the right.
llm_midpoint = (len(llm_order) - 1) / 2
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2

fig.update_layout(
    # title='BLEU Scores (Perline vs. Whole)',
    # xaxis_title='Model',
    yaxis_title=f'{score.upper()} Score',
    # margin=dict(l=10, r=10, t=10, b=10),
    plot_bgcolor='rgba(0,0,0,0)',
    # paper_bgcolor='rgba(0,0,0,0)',
    font_color='black',
    xaxis=dict(
        categoryorder='array',
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    yaxis=dict(range=[-0.5, 2]),  # Zoom only for CER scores
    showlegend=False,
    # legend=dict(
    #             orientation="h",
    #             entrywidth=70,
    #             yanchor="bottom",
    #             y=1.02,
    #             xanchor="right",
    #             x=1),
    shapes=[
        # Vertical divider halfway between the last LLM category and the first OCR category.
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="black", width=1)
        )
    ],
    annotations=[
        # Group labels placed above the plot area (y is in paper coordinates).
        dict(
            x=llm_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<LLMs>',
            showarrow=False,
            font=dict(size=14)
        ),
        dict(
            x=ocr_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<OCRs>',
            showarrow=False,
            font=dict(size=14)
        )
    ]
)

fig.show()


# vs. Human Evaluation

In [38]:
# Human ranking workbook for the whole-scan experiments; only the 'Averages' sheet is read.
human = pd.read_excel(path + '/doc/Ranking transcription (Whole-scan Experiments).xlsx', sheet_name='Averages')

In [64]:
agg_bleu_whole

Unnamed: 0,model,mean,std
0,gpt_complex,0.254013,0.096907
1,claude_complex,0.253398,0.062715
2,gpt_refine_complex,0.25034,0.120085
3,gpt_two_example,0.222694,0.066262
4,gpt_one_example,0.209717,0.099903
5,claude_refine_complex,0.118911,0.104991
6,Pytesseract,0.070828,0.017068
7,claude_one_example,0.045468,0.03665
8,claude_two_example,0.038366,0.024823
9,claude_simple,0.0293,0.025115


In [69]:
# Join the human average score with the mean whole-scan BLEU of the same model
# ('Method' in the spreadsheet matches 'model' in the score table).
human_scores = human[['Method', 'Average Score']]
bleu_means = agg_bleu_whole[['model', 'mean']]
human_bleu = human_scores.merge(bleu_means, how='inner', left_on='Method', right_on='model')
human_bleu = human_bleu.drop(columns='Method').rename(columns={'Average Score': 'Human', 'mean': 'BLEU'})

In [74]:
# Add the mean whole-scan CER of each model next to its human and BLEU scores.
human_bleu_cer = (
    human_bleu
    .merge(agg_cer_whole[['model', 'mean']], on='model', how='inner')
    .rename(columns={'mean': 'CER'})
)

In [None]:
# Flip CER (1 - CER) so that a larger value means a better transcription, like BLEU.
human_bleu_cer['cer_inverted'] = 1 - human_bleu_cer['CER']

In [None]:
from plotly.subplots import make_subplots
import statsmodels.api as sm


# Scatter the human score against BLEU (left axis) and inverted CER (right
# axis), each with a dashed LOWESS trend line.
human_scores = human_bleu_cer['Human']

# Column 0 of each LOWESS result is used as x and column 1 as the smoothed y.
lowess_bleu = sm.nonparametric.lowess(human_bleu_cer['BLEU'], human_scores, frac=0.3)
lowess_cer = sm.nonparametric.lowess(human_bleu_cer['cer_inverted'], human_scores, frac=0.3)


fig = make_subplots(specs=[[{"secondary_y": True}]])

# (score column, marker trace name, trend trace name, LOWESS fit, use right-hand axis)
metric_series = [
    ('BLEU', 'BLEU Score', 'BLEU LOWESS', lowess_bleu, False),
    ('cer_inverted', 'Inverted CER Score', 'Inverted CER LOWESS', lowess_cer, True),
]

for column, points_name, trend_name, trend, on_secondary in metric_series:
    # Raw per-model points.
    fig.add_trace(
        go.Scatter(
            x=human_scores,
            y=human_bleu_cer[column],
            mode='markers',
            name=points_name
        ),
        secondary_y=on_secondary,
    )
    # Smoothed trend for the same metric.
    fig.add_trace(
        go.Scatter(
            x=trend[:, 0],
            y=trend[:, 1],
            mode='lines',
            name=trend_name,
            line=dict(dash='dash')
        ),
        secondary_y=on_secondary,
    )

fig.update_layout(
    title="Human Score vs BLEU and Inverted CER Scores with LOWESS",
    xaxis_title="Human Score",
)

fig.update_yaxes(title_text="BLEU Score", secondary_y=False)
fig.update_yaxes(title_text="Inverted CER Score (1-CER)", secondary_y=True)

fig.show()
