# Plotting the results

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Parent of the current working directory; the cells below read the score
# files from beneath path + '/results/...'.
path = os.path.dirname(os.getcwd())

# Per-line evaluation (simple-prompt LLMs vs. OCR/HTR)

In [3]:
# Read every per-line score CSV and store it under its metric, keyed by the
# file stem with the leading 'bleu_' / 'cer_' prefix cut off.
perline_dir = path + '/results/scores_comparisons/eval_perline'
bleu_scores = {}
cer_scores = {}
for file in os.listdir(perline_dir):
    if not file.endswith('.csv'):
        continue
    # The first CSV column is used as the index.
    read_file = pd.read_csv(perline_dir + '/' + file, index_col=0)
    name = file.split('.')[0]
    if file.startswith('bleu'):
        name = name[len('bleu_'):]
        bleu_scores[name] = read_file
    elif file.startswith('cer'):
        name = name[len('cer_'):]
        cer_scores[name] = read_file

In [4]:
# Stack the per-line BLEU tables of all models into one long frame with the
# columns 'bleu', 'id' and 'model'.
bleu_frames = []
for key, scores in bleu_scores.items():
    # Some tables carry 'id' as a regular column; for the others it is expected
    # to be the index (TODO confirm against the CSVs). Testing for the column
    # directly avoids a KeyError when 'id' only exists as the index.
    if 'id' not in scores.columns:
        scores = scores.reset_index(drop=False)
    temp = scores[['bleu', 'id']].reset_index(drop=True)
    # The model name is the part of the key before '_perline'.
    temp['model'] = key.split('_perline')[0]
    bleu_frames.append(temp)

# Concatenate once instead of growing the frame inside the loop.
bleu_df = pd.concat(bleu_frames, axis=0, ignore_index=True) if bleu_frames else pd.DataFrame()


In [5]:
# Stack the per-line CER tables of all models into one long frame with the
# columns 'cer', 'id' and 'model'.
cer_frames = []
for key, scores in cer_scores.items():
    # Some tables carry 'id' as a regular column; for the others it is expected
    # to be the index (TODO confirm against the CSVs). Only promote the index
    # when 'id' is not already a column.
    if 'id' not in scores.columns:
        scores = scores.reset_index(drop=False)
    temp = scores[['cer', 'id']].reset_index(drop=True)
    # The model name is the part of the key before '_perline'.
    temp['model'] = key.split('_perline')[0]
    cer_frames.append(temp)

# Concatenate once instead of growing the frame inside the loop.
cer_df = pd.concat(cer_frames, axis=0, ignore_index=True) if cer_frames else pd.DataFrame()

In [6]:
# The scan number is the part of the line id before the first '_'
# (e.g. '20_9' -> 20); store it as an integer column.
cer_df['file'] = cer_df['id'].astype(str).str.split('_').str[0].astype(int)
cer_df

Unnamed: 0,cer,id,model,file
0,0.804428,1_0,Pytesseract,1
1,0.853659,1_1,Pytesseract,1
2,1.000000,1_2,Pytesseract,1
3,0.762963,1_3,Pytesseract,1
4,0.873563,1_4,Pytesseract,1
...,...,...,...,...
5984,0.276423,20_9,claude_refine_complex,20
5985,0.310345,20_10,claude_refine_complex,20
5986,0.025641,20_11,claude_refine_complex,20
5987,0.027778,20_12,claude_refine_complex,20


In [7]:
# Distinct model names present in the per-line CER table.
cer_df['model'].unique()

array(['Pytesseract', 'claude_two_text_example', 'gpt_two_text_example',
       'gpt_complex', 'claude_one_example', 'gpt_two_example', 'KerasOCR',
       'claude_refine', 'gpt_refine', 'claude_simple', 'gpt_simple',
       'TrOCR20', 'TrOCR50', 'claude_complex', 'TrOCR',
       'claude_two_example', 'gpt_one_example', 'claude_one_text_example',
       'EasyOCR', 'gpt_refine_complex', 'gpt_one_text_example',
       'claude_refine_complex'], dtype=object)

In [78]:
# Box plot of the per-line BLEU scores: one box per model, LLMs first, then
# the OCR models, then the fine-tuned OCR models.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example', 
            #  'gpt_one_text_example', 'claude_one_text_example', 'gpt_two_text_example', 'claude_two_text_example', 
            #  'gpt_refine', 'claude_refine', 
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR'] 
ocr_ft_order = ['TrOCR20', 'TrOCR50']

gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
ocr_ft_color = px.colors.qualitative.Set2[3]

fig = go.Figure()

# One colour per model, in plotting order. Among the LLMs, names containing
# 'gpt' get the GPT colour and every other name gets the Claude colour.
model_colors = {model: (gpt_color if 'gpt' in model else claude_color) for model in llm_order}
model_colors.update(dict.fromkeys(ocr_order, ocr_color))
model_colors.update(dict.fromkeys(ocr_ft_order, ocr_ft_color))

for model, color in model_colors.items():
    model_scores = bleu_df[bleu_df['model'] == model]
    fig.add_trace(go.Box(
        x=model_scores['model'],
        y=model_scores['bleu'],
        name=model,
        boxmean=True,
        marker=dict(color=color)
    ))

# On a category axis the i-th category sits at x = i (counting from zero), so
# a group of n boxes starting at position s is centred on s + (n - 1) / 2.
ocr_start_index = len(llm_order)  # The first position of ocr_order
ocr_ft_start_index = ocr_start_index + len(ocr_order)  # Start of ocr_ft_order
llm_midpoint = (len(llm_order) - 1) / 2  # Centre of the LLM boxes
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2  # Centre of the OCR boxes

# Custom category ordering, a separator line between LLMs and OCRs, and one
# label above each of the two groups.
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='BLEU Score',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    showlegend=False,
    shapes=[
        # Vertical line half-way between the last LLM and the first OCR box.
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # Stretch the line across the plot's full height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # Group label above the LLM boxes (y is in paper coordinates, above the plot).
        dict(
            x=llm_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<LLMs>',
            showarrow=False,
            font=dict(size=24)
        ),
        # Group label above the OCR boxes.
        dict(
            x=ocr_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<OCRs>',
            showarrow=False,
            font=dict(size=24)
        )
    ]
)

fig.show()



In [79]:
# fig.write_image(path + '/results/plots/bleu_scores_perline.pdf')

In [76]:
# Box plot of the per-line CER scores: one box per model, LLMs first, then
# the OCR models, then the fine-tuned OCR models.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example', 
            #  'gpt_one_text_example', 'claude_one_text_example', 'gpt_two_text_example', 'claude_two_text_example', 
            #  'gpt_refine', 'claude_refine', 
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR'] 
ocr_ft_order = ['TrOCR20', 'TrOCR50']

gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
ocr_ft_color = px.colors.qualitative.Set2[3]

fig = go.Figure()

# One colour per model, in plotting order. Among the LLMs, names containing
# 'gpt' get the GPT colour and every other name gets the Claude colour.
model_colors = {model: (gpt_color if 'gpt' in model else claude_color) for model in llm_order}
model_colors.update(dict.fromkeys(ocr_order, ocr_color))
model_colors.update(dict.fromkeys(ocr_ft_order, ocr_ft_color))

for model, color in model_colors.items():
    model_scores = cer_df[cer_df['model'] == model]
    fig.add_trace(go.Box(
        x=model_scores['model'],
        y=model_scores['cer'],
        name=model,
        boxmean=True,
        marker=dict(color=color)
    ))

# On a category axis the i-th category sits at x = i (counting from zero), so
# a group of n boxes starting at position s is centred on s + (n - 1) / 2.
ocr_start_index = len(llm_order)  # The first position of ocr_order
ocr_ft_start_index = ocr_start_index + len(ocr_order)  # Start of ocr_ft_order
llm_midpoint = (len(llm_order) - 1) / 2  # Centre of the LLM boxes
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2  # Centre of the OCR boxes

# Custom category ordering, a separator line between LLMs and OCRs, and one
# label above each of the two groups.
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='CER Score',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',
        categoryarray=llm_order + ocr_order + ocr_ft_order,
    ),
    # Fixed y range: anything outside [-0.5, 2] is not visible in this view.
    yaxis=dict(range=[-0.5, 2]),
    showlegend=False,
    shapes=[
        # Vertical line half-way between the last LLM and the first OCR box.
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # Stretch the line across the plot's full height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # Group label above the LLM boxes (y is in paper coordinates, above the plot).
        dict(
            x=llm_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<LLMs>',
            showarrow=False,
            font=dict(size=14)
        ),
        # Group label above the OCR boxes.
        dict(
            x=ocr_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<OCRs>',
            showarrow=False,
            font=dict(size=14)
        )
    ]
)

#### Don't ADD this when exporting (too crowded)
# for model in llm_order + ocr_order + ocr_ft_order:
#     model_data = cer_df[cer_df['model'] == model]['cer'].dropna()  
#     max_value = model_data.max()  # Calculate max
#     med_value = model_data.median()  # Calculate median
#     variance = model_data.var()  # Calculate variance
    
#     fig.add_annotation(
#         x=model,  # Model name on x-axis
#         y=1,  # Max value on y-axis
#         text=f'{max_value:.2f} <br> ({variance:.2f})',  # Format the text
#         showarrow=False,
#         yshift=10,  # Position the text above the max value
#         font=dict(size=10)
#     )

fig.show()



In [77]:
# fig.write_image(path + '/results/plots/cer_scores_perline_zoomed.pdf')

# Whole Scans

In [21]:
# Load the whole-scan BLEU tables, keyed by the part of the file stem that
# follows 'scores_'.
bleu_whole_dir = path + '/results/scores_comparisons/eval_whole/BLEU'
bleu_scores_whole = {}
for file in os.listdir(bleu_whole_dir):
    if not file.endswith('.csv'):
        continue
    read_file = pd.read_csv(bleu_whole_dir + '/' + file)
    stem = file.split('.')[0]
    bleu_scores_whole[stem.split('scores_')[1]] = read_file

In [23]:
# Stack the whole-scan BLEU tables into one long frame with the columns
# 'file', 'bleu' and 'model'.
bleu_whole_frames = []
for key, scores in bleu_scores_whole.items():
    temp = scores.reset_index(drop=True)
    # Columns are renamed by position, so each table must have exactly two
    # columns (pandas raises on a length mismatch).
    temp.columns = ['file', 'bleu']
    temp['model'] = key
    bleu_whole_frames.append(temp)

# Concatenate once instead of growing the frame inside the loop.
bleu_df_whole = (
    pd.concat(bleu_whole_frames, axis=0, ignore_index=True)
    if bleu_whole_frames else pd.DataFrame()
)

In [25]:
# Display the stacked whole-scan BLEU table ('file', 'bleu', 'model').
bleu_df_whole

Unnamed: 0,file,bleu,model
0,0,0.286144,gpt_one_example
1,1,0.058323,gpt_one_example
2,2,0.425044,gpt_one_example
3,3,0.258140,gpt_one_example
4,4,0.000000,gpt_one_example
...,...,...,...
355,14,0.000000,KerasOCR
356,15,0.000000,KerasOCR
357,16,0.000000,KerasOCR
358,17,0.000000,KerasOCR


In [26]:
# Load the whole-scan CER tables, keyed by the part of the file stem that
# follows 'scores_'.
cer_whole_dir = path + '/results/scores_comparisons/eval_whole/CER'
cer_scores_whole = {}
for file in os.listdir(cer_whole_dir):
    if file.endswith('.csv'):
        read_file = pd.read_csv(cer_whole_dir + '/' + file)
        name = file.split('.')[0]
        name = name.split('scores_')[1]
        cer_scores_whole[name] = read_file

# Stack the tables into one long frame with the columns 'file', 'cer' and
# 'model'.
cer_whole_frames = []
for key, scores in cer_scores_whole.items():
    temp = scores.reset_index(drop=True)
    # Columns are renamed by position, so each table must have exactly two
    # columns (pandas raises on a length mismatch).
    temp.columns = ['file', 'cer']
    temp['model'] = key
    cer_whole_frames.append(temp)

# Concatenate once instead of growing the frame inside the loop.
cer_df_whole = (
    pd.concat(cer_whole_frames, axis=0, ignore_index=True)
    if cer_whole_frames else pd.DataFrame()
)

In [38]:
# Distinct model names present in the whole-scan CER table.
cer_df_whole['model'].unique()

array(['claude_simple', 'gpt_two_example', 'gpt_two_text_example',
       'gpt_one_example', 'claude_one_text_example', 'EasyOCR',
       'gpt_complex', 'claude_one_example', 'claude_refine_complex',
       'claude_complex', 'claude_two_example', 'Pytesseract',
       'gpt_simple', 'claude_two_text_example', 'gpt_one_text_example',
       'TrOCR', 'gpt_refine_complex', 'KerasOCR', 'TrOCR20'], dtype=object)

In [74]:
# Box plot of the whole-scan BLEU scores: one box per model, LLMs first, then
# the OCR models, then the fine-tuned OCR model.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
            #  'gpt_one_text_example', 'claude_one_text_example', 'gpt_two_text_example', 'claude_two_text_example', 
             'gpt_refine_complex', 'claude_refine_complex']

ocr_order = ['EasyOCR', 'Pytesseract', 'TrOCR', 'KerasOCR'] 
ocr_ft_order = ['TrOCR20']

gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
# Defined here because the fine-tuned traces below use it; the cell must not
# rely on a value left over from another cell.
ocr_ft_color = px.colors.qualitative.Set2[3]

fig = go.Figure()

# One colour per model, in plotting order. LLM names that contain neither
# 'gpt' nor 'claude' get no trace.
model_colors = {}
for model in llm_order:
    if 'gpt' in model:
        model_colors[model] = gpt_color
    elif 'claude' in model:
        model_colors[model] = claude_color
model_colors.update(dict.fromkeys(ocr_order, ocr_color))
model_colors.update(dict.fromkeys(ocr_ft_order, ocr_ft_color))

for model, color in model_colors.items():
    model_scores = bleu_df_whole[bleu_df_whole['model'] == model]
    fig.add_trace(go.Box(
        x=model_scores['model'],
        y=model_scores['bleu'],
        name=model,
        boxmean=True,
        marker=dict(color=color)
    ))

# On a category axis the i-th category sits at x = i (counting from zero), so
# a group of n boxes starting at position s is centred on s + (n - 1) / 2.
ocr_start_index = len(llm_order)  # The first position of ocr_order
llm_midpoint = (len(llm_order) - 1) / 2  # Centre of the LLM boxes
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2  # Centre of the OCR boxes

# Custom category ordering, a separator line between LLMs and OCRs, and one
# label above each of the two groups.
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='BLEU Score',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',
        # The fine-tuned models are listed explicitly; plotly would otherwise
        # append them after the listed categories anyway.
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    showlegend=False,
    shapes=[
        # Vertical line half-way between the last LLM and the first OCR box.
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # Stretch the line across the plot's full height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # Group label above the LLM boxes (y is in paper coordinates, above the plot).
        dict(
            x=llm_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<LLMs>',
            showarrow=False,
            font=dict(size=14)
        ),
        # Group label above the OCR boxes.
        dict(
            x=ocr_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<OCRs>',
            showarrow=False,
            font=dict(size=14)
        )
    ]
)

fig.show()



In [75]:
# fig.write_image(path + '/results/plots/bleu_scores_whole.pdf')

In [70]:
# Box plot of the whole-scan CER scores: one box per model, LLMs first, then
# the OCR models, then the fine-tuned OCR model.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
            #  'gpt_one_text_example', 'claude_one_text_example', 'gpt_two_text_example', 'claude_two_text_example', 
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'TrOCR', 'KerasOCR'] 
ocr_ft_order = ['TrOCR20']

gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
# Defined here because the fine-tuned traces below use it; the cell must not
# rely on a value left over from another cell.
ocr_ft_color = px.colors.qualitative.Set2[3]

fig = go.Figure()

# One colour per model, in plotting order. Among the LLMs, names containing
# 'gpt' get the GPT colour and every other name gets the Claude colour.
model_colors = {model: (gpt_color if 'gpt' in model else claude_color) for model in llm_order}
model_colors.update(dict.fromkeys(ocr_order, ocr_color))
model_colors.update(dict.fromkeys(ocr_ft_order, ocr_ft_color))

for model, color in model_colors.items():
    model_scores = cer_df_whole[cer_df_whole['model'] == model]
    fig.add_trace(go.Box(
        x=model_scores['model'],
        y=model_scores['cer'],
        name=model,
        boxmean=True,
        marker=dict(color=color)
    ))

# On a category axis the i-th category sits at x = i (counting from zero), so
# a group of n boxes starting at position s is centred on s + (n - 1) / 2.
ocr_start_index = len(llm_order)  # The first position of ocr_order
llm_midpoint = (len(llm_order) - 1) / 2  # Centre of the LLM boxes
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2  # Centre of the OCR boxes

# Custom category ordering, a separator line between LLMs and OCRs, and one
# label above each of the two groups.
fig.update_layout(
    xaxis_title='Model',
    yaxis_title='CER Score',
    # Fixed y range: anything outside [-0.5, 2] is not visible in this view.
    yaxis=dict(range=[-0.5, 2]),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',
        # The fine-tuned models are listed explicitly; plotly would otherwise
        # append them after the listed categories anyway.
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    showlegend=False,
    shapes=[
        # Vertical line half-way between the last LLM and the first OCR box.
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # Stretch the line across the plot's full height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # Group label above the LLM boxes (y is in paper coordinates, above the plot).
        dict(
            x=llm_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<LLMs>',
            showarrow=False,
            font=dict(size=14)
        ),
        # Group label above the OCR boxes.
        dict(
            x=ocr_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text='<OCRs>',
            showarrow=False,
            font=dict(size=14)
        )
    ]
)

fig.show()



In [73]:
# fig.write_image(path + '/results/plots/cer_scores_zoomed_whole.pdf')

# T-test (Comparisons between the models)

In [22]:
# Per-model mean and standard deviation of the per-line BLEU scores, as a flat
# table with the columns 'model', 'mean' and 'std'.
agg_bleu = bleu_df.groupby('model')['bleu'].agg(['mean', 'std']).reset_index()

In [23]:
# Order the models from highest to lowest mean BLEU and renumber the rows.
agg_bleu = agg_bleu.sort_values(by='mean', ascending=False, ignore_index=True)
agg_bleu


Unnamed: 0,model,mean,std
0,gpt_one_text_example,0.287856,0.343911
1,gpt_two_text_example,0.280586,0.311878
2,gpt_complex,0.277885,0.349838
3,claude_complex,0.253559,0.369379
4,gpt_refine_complex,0.241407,0.291929
5,claude_two_example,0.198892,0.351342
6,claude_two_text_example,0.198502,0.333492
7,claude_refine_complex,0.194788,0.297527
8,gpt_refine,0.159854,0.234093
9,claude_one_text_example,0.159015,0.300077


In [24]:
# Per-model mean and standard deviation of the per-line CER scores, ordered
# from lowest to highest mean (a lower CER means a better transcription).
agg_cer = (
    cer_df.groupby('model')['cer']
    .agg(['mean', 'std'])
    .reset_index()
    .sort_values(by='mean', ascending=True, ignore_index=True)
)
agg_cer

Unnamed: 0,model,mean,std
0,gpt_complex,0.242133,0.191534
1,gpt_refine_complex,0.252526,0.189149
2,gpt_two_text_example,0.256696,0.218048
3,gpt_one_text_example,0.258974,0.220631
4,claude_complex,0.288068,0.221569
5,claude_two_text_example,0.309579,0.222295
6,claude_one_text_example,0.356029,0.272368
7,gpt_refine,0.429612,0.479808
8,gpt_simple,0.452401,0.496205
9,claude_refine,0.557395,0.584668


In [43]:
import scipy.stats as stats
import itertools

In [41]:
def ttest(score_df, models, score = 'bleu', ind = 'id', all_pairs = False, alpha = 0.05):
    """Run paired t-tests between models on the samples they have in common.

    Parameters
    ----------
    score_df : pandas.DataFrame
        Long table with a 'model' column, the score column `score` and the
        sample identifier column `ind`.
    models : iterable of str
        Model names, in the order in which they should be compared.
    score : str
        Name of the column holding the scores.
    ind : str
        Name of the column identifying a sample; scores of two models are
        paired on equal values of this column.
    all_pairs : bool
        If False (default), each model is compared only with its successor in
        `models`. If True, every unordered pair of models is compared.
    alpha : float
        Threshold below which a p-value is labelled 'Significant'.

    Returns
    -------
    pandas.DataFrame
        One row per comparison with the columns 'Model 1', 'Model 2',
        't-statistic', 'p-value' and 'significance'. The frame is empty (but
        has these columns) when fewer than two models are given.
    """
    columns = ['Model 1', 'Model 2', 't-statistic', 'p-value', 'significance']

    model_list = list(models)
    if all_pairs:
        pairs = itertools.combinations(model_list, 2)
    else:
        pairs = zip(model_list, model_list[1:])

    paired_t = []
    for model1, model2 in pairs:
        t1 = score_df[score_df['model'] == model1].set_index(ind)[score]
        t2 = score_df[score_df['model'] == model2].set_index(ind)[score]

        # Models do not all have the same set of samples, so keep only the
        # identifiers present for both before pairing.
        paired_1, paired_2 = t1.align(t2, join='inner')

        # The test is run on the differences model1 - model2.
        t_stat, p_value = stats.ttest_rel(paired_1, paired_2)
        paired_t.append({
            'Model 1': model1,
            'Model 2': model2,
            't-statistic': t_stat,
            'p-value': p_value,
            'significance': 'Significant' if p_value < alpha else 'Not Significant',
        })

    # Build the table once, after the loop, so it also exists when there was
    # nothing to compare.
    return pd.DataFrame(paired_t, columns=columns)

In [27]:
# Model names in ranking order; the aggregate tables are already sorted by
# their mean score.
models_bleu = pd.unique(agg_bleu['model'])
models_cer = pd.unique(agg_cer['model'])

In [28]:
# Paired t-tests between models that are neighbours in the mean-score ranking,
# with scores matched on the line identifier 'id'.
paired_t_bleu = ttest(bleu_df, models_bleu, score = 'bleu', ind='id')
paired_t_cer = ttest(cer_df, models_cer, score = 'cer', ind='id')

In [29]:
# Display the paired t-test results for the per-line BLEU scores.
paired_t_bleu

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,gpt_one_text_example,gpt_two_text_example,0.676586,0.499227,Not Significant
1,gpt_two_text_example,gpt_complex,0.036004,0.9713051,Not Significant
2,gpt_complex,claude_complex,1.30724,0.1921961,Not Significant
3,claude_complex,gpt_refine_complex,0.657448,0.5114293,Not Significant
4,gpt_refine_complex,claude_two_example,1.961233,0.05084277,Not Significant
5,claude_two_example,claude_two_text_example,0.19404,0.8462855,Not Significant
6,claude_two_text_example,claude_refine_complex,0.280755,0.7791055,Not Significant
7,claude_refine_complex,gpt_refine,1.916198,0.05635067,Not Significant
8,gpt_refine,claude_one_text_example,0.072173,0.9425158,Not Significant
9,claude_one_text_example,gpt_simple,0.184546,0.8537182,Not Significant


In [30]:
# Display the paired t-test results for the per-line CER scores.
paired_t_cer

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,gpt_complex,gpt_refine_complex,-3.300705,0.00108852,Significant
1,gpt_refine_complex,gpt_two_text_example,-0.58356,0.5599865,Not Significant
2,gpt_two_text_example,gpt_one_text_example,-0.384228,0.7011013,Not Significant
3,gpt_one_text_example,claude_complex,-4.128505,4.818176e-05,Significant
4,claude_complex,claude_two_text_example,-2.766154,0.006049534,Significant
5,claude_two_text_example,claude_one_text_example,-4.453484,1.222009e-05,Significant
6,claude_one_text_example,gpt_refine,-2.579046,0.0104161,Significant
7,gpt_refine,gpt_simple,-3.054528,0.002469705,Significant
8,gpt_simple,claude_refine,-6.285823,1.235277e-09,Significant
9,claude_refine,claude_two_example,-0.127236,0.8988447,Not Significant


In [72]:
# sig = paired_t_df[paired_t_df['p-value'] < 0.05].sort_values('p-value') 
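# The paired t-test is computed on the per-sample differences Model 1 - Model 2.
# A positive t-statistic means Model 1 scored higher on average; the p-value is small when
# the mean difference is large relative to its variability, in either direction.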

In [73]:
# pd.crosstab(sig['Model 1'], sig['Model 2'])

Model 2,TrOCR,TrOCR20,claude_refine,claude_two_example
Model 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claude_complex,0,0,0,1
claude_simple,0,1,0,0
gpt_simple,0,0,1,0
pytesseractOCR,1,0,0,0


In [90]:
# paired_t_cer.to_csv(path + '/results/scores_comparisons/paired_t-test_cer_allcombi_perline.csv', index=False)

## Whole Scans

In [36]:
# Per-model mean and standard deviation of the whole-scan BLEU scores, ordered
# from highest to lowest mean (descending, unlike the CER tables).
agg_bleu_whole = (
    bleu_df_whole.groupby('model')['bleu']
    .agg(['mean', 'std'])
    .reset_index()
    .sort_values(by='mean', ascending=False, ignore_index=True)
)
agg_bleu_whole

Unnamed: 0,model,mean,std
0,gpt_two_example,0.296086,0.055355
1,claude_two_example,0.266913,0.050648
2,claude_complex,0.253886,0.063031
3,gpt_one_example,0.244748,0.130801
4,claude_one_example,0.240243,0.057789
5,claude_two_text_example,0.203905,0.086475
6,claude_one_text_example,0.165723,0.074418
7,gpt_refine_complex,0.163442,0.071983
8,gpt_complex,0.157995,0.078886
9,claude_refine_complex,0.151734,0.10701


In [39]:
# Per-model mean and standard deviation of the whole-scan CER scores, ordered
# from lowest to highest mean (a lower CER means a better transcription).
agg_cer_whole = (
    cer_df_whole.groupby('model')['cer']
    .agg(['mean', 'std'])
    .reset_index()
    .sort_values(by='mean', ascending=True, ignore_index=True)
)
agg_cer_whole

Unnamed: 0,model,mean,std
0,claude_two_example,0.43403,0.032723
1,claude_one_example,0.494389,0.112166
2,claude_two_text_example,0.584439,0.074666
3,gpt_two_example,0.641059,0.313478
4,gpt_one_example,0.653518,0.177353
5,claude_complex,0.683409,0.111397
6,claude_one_text_example,0.695984,0.428716
7,gpt_two_text_example,0.699361,0.118269
8,claude_refine_complex,0.757687,0.245322
9,TrOCR20,0.760872,0.010756


In [44]:
# Whole-scan counterpart: take the model order from the sorted aggregate
# tables, then run the paired t-tests with scores matched on the scan
# identifier 'file'.
models_bleu_whole = pd.unique(agg_bleu_whole['model'])
models_cer_whole = pd.unique(agg_cer_whole['model'])
paired_t_bleu_whole = ttest(bleu_df_whole, models_bleu_whole, score='bleu', ind='file')
paired_t_cer_whole = ttest(cer_df_whole, models_cer_whole, score='cer', ind='file')

In [45]:
# Display the paired t-test results for the whole-scan BLEU scores.
paired_t_bleu_whole

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,gpt_two_example,claude_two_example,2.407916,0.027679,Significant
1,claude_two_example,claude_complex,1.503527,0.151053,Not Significant
2,claude_complex,gpt_one_example,0.175291,0.862808,Not Significant
3,gpt_one_example,claude_one_example,0.138469,0.891407,Not Significant
4,claude_one_example,claude_two_text_example,1.406545,0.17758,Not Significant
5,claude_two_text_example,claude_one_text_example,1.780235,0.092917,Not Significant
6,claude_one_text_example,gpt_refine_complex,0.341631,0.736585,Not Significant
7,gpt_refine_complex,gpt_complex,0.315442,0.755865,Not Significant
8,gpt_complex,claude_refine_complex,0.228799,0.82147,Not Significant
9,claude_refine_complex,claude_simple,3.513893,0.002321,Significant


In [46]:
# Display the paired t-test results for the whole-scan CER scores.
paired_t_cer_whole

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,claude_two_example,claude_one_example,-2.157244,0.04559072,Significant
1,claude_one_example,claude_two_text_example,-2.579722,0.01947933,Significant
2,claude_two_text_example,gpt_two_example,-0.781143,0.4454611,Not Significant
3,gpt_two_example,gpt_one_example,-0.096708,0.9240894,Not Significant
4,gpt_one_example,claude_complex,-0.664736,0.5146463,Not Significant
5,claude_complex,claude_one_text_example,-0.063615,0.9499781,Not Significant
6,claude_one_text_example,gpt_two_text_example,-0.010098,0.9920605,Not Significant
7,gpt_two_text_example,claude_refine_complex,-1.314819,0.2060326,Not Significant
8,claude_refine_complex,TrOCR20,-0.16986,0.86755,Not Significant
9,TrOCR20,KerasOCR,-5.842647,5.754732e-05,Significant
