# Plotting the results

In [2]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go

In [3]:
# Project root used to build every results path below: the parent of the current working directory.
path = os.path.split(os.getcwd())[0]

# Perline (Simple prompt LLMs vs. OCR/HTR)

In [3]:
# Read every per-line score CSV and file it under BLEU or CER,
# keyed by the file stem with its metric prefix cut off.
bleu_scores, cer_scores = {}, {}
for csv_name in os.listdir(path + '/results/scores_comparisons/eval_perline'):
    if not csv_name.endswith('.csv'):
        continue
    scores = pd.read_csv(path + '/results/scores_comparisons/eval_perline/' + csv_name, index_col=0)
    stem = csv_name.split('.')[0]
    if csv_name.startswith('bleu'):
        # Drop the first 5 characters of the stem (the 'bleu' prefix plus one separator character)
        bleu_scores[stem[5:]] = scores
    elif csv_name.startswith('cer'):
        # Drop the first 4 characters of the stem (the 'cer' prefix plus one separator character)
        cer_scores[stem[4:]] = scores

In [4]:
# Build one long (bleu, id, model) table from the per-model BLEU frames.
# Frames are collected in a list and concatenated once, instead of re-copying
# the growing table on every iteration.
bleu_frames = []
for key, scores in bleu_scores.items():
    # Some files carry the line id as a regular column, others as the index;
    # only move the index into the columns when 'id' is not already one.
    if 'id' not in scores.columns:
        scores = scores.reset_index(drop=False)
    tidy = scores[['bleu', 'id']].reset_index(drop=True)
    # The model name is whatever precedes '_perline' in the dict key
    tidy['model'] = key.split('_perline')[0]
    bleu_frames.append(tidy)

# pd.concat raises on an empty list, so fall back to an empty frame
bleu_df = pd.concat(bleu_frames, axis=0, ignore_index=True) if bleu_frames else pd.DataFrame()


In [5]:
# Build one long (cer, id, model) table from the per-model CER frames.
# The previous column check looked for 'bleu' (copied from the BLEU cell), so it
# could never match a CER frame; test directly for the 'id' column instead.
cer_frames = []
for key, scores in cer_scores.items():
    # Some files carry the line id as a regular column, others as the index;
    # only move the index into the columns when 'id' is not already one.
    if 'id' not in scores.columns:
        scores = scores.reset_index(drop=False)
    tidy = scores[['cer', 'id']].reset_index(drop=True)
    # The model name is whatever precedes '_perline' in the dict key
    tidy['model'] = key.split('_perline')[0]
    cer_frames.append(tidy)

# pd.concat raises on an empty list, so fall back to an empty frame
cer_df = pd.concat(cer_frames, axis=0, ignore_index=True) if cer_frames else pd.DataFrame()

In [6]:
# The part of the line id before the first underscore becomes an integer 'file' column
# (ids in the displayed output look like '1_0', '20_12').
cer_df['file'] = cer_df['id'].astype(str).str.split('_').str[0].astype(int)
cer_df

Unnamed: 0,cer,id,model,file
0,0.804428,1_0,Pytesseract,1
1,0.853659,1_1,Pytesseract,1
2,1.000000,1_2,Pytesseract,1
3,0.762963,1_3,Pytesseract,1
4,0.873563,1_4,Pytesseract,1
...,...,...,...,...
5984,0.276423,20_9,claude_refine_complex,20
5985,0.310345,20_10,claude_refine_complex,20
5986,0.025641,20_11,claude_refine_complex,20
5987,0.027778,20_12,claude_refine_complex,20


In [7]:
# Inspect the BLEU rows of the TrOCR50 model only
bleu_df.loc[bleu_df['model'] == 'TrOCR50']

Unnamed: 0,bleu,id,model
3315,0.000000,1_4,TrOCR50
3316,0.488923,1_8,TrOCR50
3317,0.000000,1_9,TrOCR50
3318,0.000000,1_10,TrOCR50
3319,0.000000,1_11,TrOCR50
...,...,...,...
3441,0.000000,19_12,TrOCR50
3442,0.000000,20_5,TrOCR50
3443,0.000000,20_8,TrOCR50
3444,0.000000,20_9,TrOCR50


## Plots

In [8]:
# Box plot of per-line BLEU scores, laid out left to right as
# LLM prompts | OCR engines | fine-tuned OCR models.
# The *_text_example and plain *_refine prompt variants are not part of this figure.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR']
ocr_ft_order = ['TrOCR20', 'TrOCR50']

# One Set2 colour per model family
gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
ocr_ft_color = px.colors.qualitative.Set2[3]

# (model, colour) pairs in the order the boxes appear on the x-axis
model_colors = (
    [(m, gpt_color if 'gpt' in m else claude_color) for m in llm_order]
    + [(m, ocr_color) for m in ocr_order]
    + [(m, ocr_ft_color) for m in ocr_ft_order]
)

fig = go.Figure()
for model, color in model_colors:
    model_rows = bleu_df[bleu_df['model'] == model]
    fig.add_trace(go.Box(
        x=model_rows['model'],
        y=model_rows['bleu'],
        name=model,
        boxmean=True,  # also draw the mean
        marker=dict(color=color)
    ))

# Categories sit at integer x positions 0..n-1, so a group of k boxes that starts
# at position s is centred at s + (k - 1) / 2 (s + k / 2 would be half a slot too far right).
ocr_start_index = len(llm_order)  # first position of ocr_order
ocr_ft_start_index = ocr_start_index + len(ocr_order)  # first position of ocr_ft_order
llm_midpoint = (ocr_start_index - 1) / 2
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2

fig.update_layout(
    xaxis_title='Model',
    yaxis_title='BLEU Score',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',  # custom ordering instead of order of appearance
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    showlegend=False,
    shapes=[
        # Vertical separator halfway between the last LLM box and the first OCR box
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # paper coordinates: span the full plot height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # Group captions placed above the plotting area (y > 1 in paper coordinates)
        dict(
            x=x_center,
            y=1.2,
            xref='x',
            yref='paper',
            text=caption,
            showarrow=False,
            font=dict(size=24)
        )
        for x_center, caption in [(llm_midpoint, '<LLMs>'), (ocr_midpoint, '<OCRs>')]
    ]
)

fig.show()



In [79]:
# fig.write_image(path + '/results/plots/bleu_scores_perline.pdf')

In [76]:
# Box plot of per-line CER with a fixed y-range, laid out left to right as
# LLM prompts | OCR engines | fine-tuned OCR models.
# The *_text_example and plain *_refine prompt variants are not part of this figure.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR']
ocr_ft_order = ['TrOCR20', 'TrOCR50']

# One Set2 colour per model family
gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
ocr_ft_color = px.colors.qualitative.Set2[3]

# (model, colour) pairs in the order the boxes appear on the x-axis
model_colors = (
    [(m, gpt_color if 'gpt' in m else claude_color) for m in llm_order]
    + [(m, ocr_color) for m in ocr_order]
    + [(m, ocr_ft_color) for m in ocr_ft_order]
)

fig = go.Figure()
for model, color in model_colors:
    model_rows = cer_df[cer_df['model'] == model]
    fig.add_trace(go.Box(
        x=model_rows['model'],
        y=model_rows['cer'],
        name=model,
        boxmean=True,  # also draw the mean
        marker=dict(color=color)
    ))

# Categories sit at integer x positions 0..n-1, so a group of k boxes that starts
# at position s is centred at s + (k - 1) / 2 (s + k / 2 would be half a slot too far right).
ocr_start_index = len(llm_order)  # first position of ocr_order
ocr_ft_start_index = ocr_start_index + len(ocr_order)  # first position of ocr_ft_order
llm_midpoint = (ocr_start_index - 1) / 2
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2

fig.update_layout(
    xaxis_title='Model',
    yaxis_title='CER Score',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',  # custom ordering instead of order of appearance
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    yaxis=dict(range=[-0.5, 2]),  # fixed window; values above 2 fall outside the visible area
    showlegend=False,
    shapes=[
        # Vertical separator halfway between the last LLM box and the first OCR box
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # paper coordinates: span the full plot height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # Group captions placed above the plotting area (y > 1 in paper coordinates)
        dict(
            x=x_center,
            y=1.2,
            xref='x',
            yref='paper',
            text=caption,
            showarrow=False,
            font=dict(size=14)
        )
        for x_center, caption in [(llm_midpoint, '<LLMs>'), (ocr_midpoint, '<OCRs>')]
    ]
)

#### Don't ADD this when exporting (too crowded)
# for model in llm_order + ocr_order + ocr_ft_order:
#     model_data = cer_df[cer_df['model'] == model]['cer'].dropna()
#     max_value = model_data.max()  # Calculate max
#     med_value = model_data.median()  # Calculate median
#     variance = model_data.var()  # Calculate variance
#
#     fig.add_annotation(
#         x=model,  # Model name on x-axis
#         y=1,  # Max value on y-axis
#         text=f'{max_value:.2f} <br> ({variance:.2f})',  # Format the text
#         showarrow=False,
#         yshift=10,  # Position the text above the max value
#         font=dict(size=10)
#     )

fig.show()



In [77]:
# fig.write_image(path + '/results/plots/cer_scores_perline_zoomed.pdf')

## BLEU with N=2, N=3 and N=4 for perline

In [17]:
# Normalized per-line BLEU tables, one per file-name suffix n2 / n3 / n4
bleu_n2, bleu_n3, bleu_n4 = [
    pd.read_csv(path + csv_path)
    for csv_path in (
        '/results/scores_comparisons/bleu_perline_all_n2_normalized.csv',
        '/results/scores_comparisons/bleu_perline_all_n3_normalized.csv',
        '/results/scores_comparisons/bleu_perline_all_n4_normalized.csv',
    )
]

In [20]:
# Per-line CER on normalized text (original note: "both lower and uncapitalized")
cer_normalized_csv = path + '/results/scores_comparisons/cer_perline_all_normalized.csv'
cer_normalized = pd.read_csv(cer_normalized_csv)

In [29]:
# Box plot of one per-line score table, selected through `data` / `score`
# (the frame to plot and the name of its score column).
data = bleu_n3
score = 'bleu'

# The *_text_example and plain *_refine prompt variants are not part of this figure.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR']
ocr_ft_order = ['TrOCR20', 'TrOCR50']

# One Set2 colour per model family
gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
ocr_ft_color = px.colors.qualitative.Set2[3]

# (model, colour) pairs in the order the boxes appear on the x-axis
model_colors = (
    [(m, gpt_color if 'gpt' in m else claude_color) for m in llm_order]
    + [(m, ocr_color) for m in ocr_order]
    + [(m, ocr_ft_color) for m in ocr_ft_order]
)

fig = go.Figure()
for model, color in model_colors:
    model_rows = data[data['model'] == model]
    fig.add_trace(go.Box(
        x=model_rows['model'],
        y=model_rows[score],
        name=model,
        boxmean=True,  # also draw the mean
        marker=dict(color=color)
    ))

# Categories sit at integer x positions 0..n-1, so a group of k boxes that starts
# at position s is centred at s + (k - 1) / 2 (s + k / 2 would be half a slot too far right).
ocr_start_index = len(llm_order)  # first position of ocr_order
ocr_ft_start_index = ocr_start_index + len(ocr_order)  # first position of ocr_ft_order
llm_midpoint = (ocr_start_index - 1) / 2
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2

fig.update_layout(
    # NOTE(review): the y-axis title is hard-coded and does not follow `score`
    yaxis_title='BLEU Score',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',  # custom ordering instead of order of appearance
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    # yaxis = dict(range=[-0.5, 2]),
    showlegend=False,
    shapes=[
        # Vertical separator halfway between the last LLM box and the first OCR box
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # paper coordinates: span the full plot height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # Group captions placed above the plotting area (y > 1 in paper coordinates)
        dict(
            x=x_center,
            y=1.2,
            xref='x',
            yref='paper',
            text=caption,
            showarrow=False,
            font=dict(size=14)
        )
        for x_center, caption in [(llm_midpoint, '<LLMs>'), (ocr_midpoint, '<OCRs>')]
    ]
)

fig.show()



In [27]:
# Name the export after the metric selected by `score` in the plotting cell above.
# With score = 'cer' this is the original 'cer_scores_perline_normalized.eps'; with
# score = 'bleu' the BLEU figure is no longer written under a CER file name.
fig.write_image(path + f'/results/plots/{score}_scores_perline_normalized.eps')

In [9]:

# Tag each BLEU table with the n-gram setting it belongs to, then stack them
for frame, label in ((bleu_n2, 'N2'), (bleu_n3, 'N3'), (bleu_n4, 'N4')):
    frame['source'] = label
combined_bleu = pd.concat([bleu_n2, bleu_n3, bleu_n4])

# Semi-transparent fill per source
n2_color = "rgba(143,206,0, 0.5)"  # green
n3_color = "rgba(67, 162, 202, 0.5)"  # blue
df_color = "rgba(250, 159, 181, 0.3)"  # pink, lower alpha than the other two
source_palette = {'N2': n2_color, 'N3': n3_color, 'N4': df_color}

fig = go.Figure()

# One unnamed, legend-hidden box per (model, source) pair; all boxes of a model share its x category
for model in llm_order + ocr_order + ocr_ft_order:
    for source, color in source_palette.items():
        subset = combined_bleu[(combined_bleu['model'] == model) & (combined_bleu['source'] == source)]
        fig.add_trace(go.Box(
            x=subset['model'],
            y=subset['bleu'],
            boxmean=True,
            marker=dict(color=color),
            showlegend=False
        ))

# Data-less traces whose only job is to provide one legend entry per source
for source, color in source_palette.items():
    fig.add_trace(go.Box(
        y=[None],
        name=source,
        marker=dict(color=color),
        showlegend=True
    ))

separator_x = len(llm_order) - 0.5  # halfway between the last LLM and the first OCR category
fig.update_layout(
    title='Comparison of BLEU Scores (N=2 vs. N=3 vs. N=4, All normalized)',
    xaxis_title='Model',
    yaxis_title='BLEU Score',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    shapes=[
        dict(
            type="line",
            x0=separator_x,
            x1=separator_x,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="black", width=2)
        )
    ]
)

fig.show()


## Impact of different normalizations for perline

In [13]:
# Per-line BLEU (n3 files) under each string-preprocessing variant.
# Original note on the "normalized" files: "both lower and uncapitalized".
bleu_n3, bleu_n3_lower, bleu_n3_unidecoded, bleu_n3_normalized = [
    pd.read_csv(path + csv_path)
    for csv_path in (
        '/results/scores_comparisons/bleu_perline_all_n3_onlystripped.csv',
        '/results/scores_comparisons/bleu_perline_all_n3_lowered.csv',
        '/results/scores_comparisons/bleu_perline_all_n3_unidecoded.csv',
        '/results/scores_comparisons/bleu_perline_all_n3_normalized.csv',
    )
]

# Per-line CER under the same four variants
cer_all, cer_lower, cer_unidecoded, cer_normalized = [
    pd.read_csv(path + csv_path)
    for csv_path in (
        '/results/scores_comparisons/cer_perline_all_onlystripped.csv',
        '/results/scores_comparisons/cer_perline_all_lowered.csv',
        '/results/scores_comparisons/cer_perline_all_unidecoded.csv',
        '/results/scores_comparisons/cer_perline_all_normalized.csv',
    )
]

In [16]:
import plotly.graph_objects as go
import pandas as pd

# Tag each BLEU table with its preprocessing variant, then stack them
for frame, label in ((bleu_n3, 'Only Stripped'), (bleu_n3_lower, 'Lowered'),
                     (bleu_n3_unidecoded, 'Unidecoded'), (bleu_n3_normalized, 'Normalized')):
    frame['source'] = label
combined_bleu = pd.concat([bleu_n3, bleu_n3_lower, bleu_n3_unidecoded, bleu_n3_normalized])

# Left-to-right model order: LLM prompts, OCR engines, fine-tuned OCR
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR']
ocr_ft_order = ['TrOCR20', 'TrOCR50']
model_order = llm_order + ocr_order + ocr_ft_order

# Fill colour per preprocessing variant
n3_color = "rgba(0,0,0, 0.5)"  # Gray
lower_color = "rgba(143,206,0, 0.5)"  # Green
unide_color = "rgba(67, 162, 202, 0.5)"  # Blue
norm_color = "rgba(250, 159, 181, 0.3)"  # Pink
source_colors = {
    'Only Stripped': n3_color,
    'Lowered': lower_color,
    'Unidecoded': unide_color,
    'Normalized': norm_color
}

fig = go.Figure()

x_positions = []
tick_labels = []

# Every (model, source) pair gets its own integer x slot, so each model occupies
# four consecutive slots in the order of `source_colors`.
x_index = 0
for model in model_order:
    for source, color in source_colors.items():
        subset = combined_bleu.loc[
            (combined_bleu['model'] == model) & (combined_bleu['source'] == source), 'bleu'
        ]
        fig.add_trace(go.Box(
            x=[x_index] * len(subset),
            y=subset,
            name=f"{model} ({source})",
            boxmean=True,
            marker=dict(color=color),
            showlegend=False
        ))
        x_positions.append(x_index)
        x_index += 1

    # One tick label per model
    tick_labels.append(model)

# Data-less traces whose only job is to provide one legend entry per source
for source, color in source_colors.items():
    fig.add_trace(go.Box(
        y=[None],
        name=source,
        marker=dict(color=color),
        showlegend=True
    ))

separator_x = len(llm_order) * 4 - 0.5  # just past the last LLM slot
fig.update_layout(
    title='Comparison of BLEU Scores for Different Models (Lowered vs. Unidecoded vs. Normalized)',
    xaxis_title='Model',
    yaxis_title='BLEU Score',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        # Slots 4i..4i+3 belong to model i; 4i + 1.5 is the middle of that group
        tickvals=[group_start + 1.5 for group_start in range(0, 4 * len(model_order), 4)],
        ticktext=tick_labels,
        tickangle=45
    ),
    shapes=[
        dict(
            type="line",
            x0=separator_x,
            x1=separator_x,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="black", width=2)
        )
    ]
)

fig.show()


In [None]:
# Write the current `fig` to an EPS file under <path>/results/plots.
fig.write_image(path + '/results/plots/bleu_scores_perline_stringpreprocessing_comparisons.eps')

In [76]:
import plotly.graph_objects as go
import pandas as pd

# Tag each CER table with its preprocessing variant, then stack them
for frame, label in ((cer_all, 'CER'), (cer_lower, 'Lowered'),
                     (cer_unidecoded, 'Unidecoded'), (cer_normalized, 'Normalized')):
    frame['source'] = label
combined_cer = pd.concat([cer_all, cer_lower, cer_unidecoded, cer_normalized])

# Left-to-right model order: LLM prompts, OCR engines, fine-tuned OCR
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'KerasOCR', 'TrOCR']
ocr_ft_order = ['TrOCR20', 'TrOCR50']
model_order = llm_order + ocr_order + ocr_ft_order

# Fill colour per preprocessing variant
n3_color = "rgba(0,0,0, 0.5)"  # Gray
lower_color = "rgba(143,206,0, 0.5)"  # Green
unide_color = "rgba(67, 162, 202, 0.5)"  # Blue
norm_color = "rgba(250, 159, 181, 0.3)"  # Pink
source_colors = {
    'CER': n3_color,
    'Lowered': lower_color,
    'Unidecoded': unide_color,
    'Normalized': norm_color
}

fig = go.Figure()

x_positions = []
tick_labels = []

# Every (model, source) pair gets its own integer x slot, so each model occupies
# four consecutive slots in the order of `source_colors`.
x_index = 0
for model in model_order:
    for source, color in source_colors.items():
        subset = combined_cer.loc[
            (combined_cer['model'] == model) & (combined_cer['source'] == source), 'cer'
        ]
        fig.add_trace(go.Box(
            x=[x_index] * len(subset),
            y=subset,
            name=f"{model} ({source})",
            boxmean=True,
            marker=dict(color=color),
            showlegend=False
        ))
        x_positions.append(x_index)
        x_index += 1

    # One tick label per model
    tick_labels.append(model)

# Data-less traces whose only job is to provide one legend entry per source
for source, color in source_colors.items():
    fig.add_trace(go.Box(
        y=[None],
        name=source,
        marker=dict(color=color),
        showlegend=True
    ))

separator_x = len(llm_order) * 4 - 0.5  # just past the last LLM slot
fig.update_layout(
    title='Comparison of CER Scores for Different Models (Lowered vs. Unidecoded vs. Normalized)',
    xaxis_title='Model',
    yaxis_title='CER Score',
    yaxis=dict(range=[-0.5, 2]),  # fixed window; values above 2 fall outside the visible area
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        # Slots 4i..4i+3 belong to model i; 4i + 1.5 is the middle of that group
        tickvals=[group_start + 1.5 for group_start in range(0, 4 * len(model_order), 4)],
        ticktext=tick_labels,
        tickangle=45
    ),
    shapes=[
        dict(
            type="line",
            x0=separator_x,
            x1=separator_x,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",
            line=dict(color="black", width=2)
        )
    ]
)

fig.show()


In [None]:
# TODO: Check whether the differences between preprocessing variants are significant for the BLEU and CER scores (which metric is affected more by the preprocessing?)

# Whole Scans

In [34]:
# Whole-scan BLEU tables keyed by model name: the part of the file stem after 'scores_'
bleu_scores_whole = {
    csv_name.split('.')[0].split('scores_')[1]:
        pd.read_csv(path + '/results/scores_comparisons/eval_whole/BLEU/' + csv_name)
    for csv_name in os.listdir(path + '/results/scores_comparisons/eval_whole/BLEU')
    if csv_name.endswith('.csv')
}

In [35]:
# Stack the whole-scan BLEU tables into one long (file, bleu, model) frame.
# Frames are collected first and concatenated once, rather than re-copying the
# growing table on every iteration.
whole_bleu_frames = []
for key, scores in bleu_scores_whole.items():
    print(key + ' is being processed')
    tidy = scores.reset_index(drop=True)
    # Assumes each CSV has exactly two columns (scan/file id, score) — the assignment raises otherwise
    tidy.columns = ['file', 'bleu']
    tidy['model'] = key
    whole_bleu_frames.append(tidy)

# pd.concat raises on an empty list, so fall back to an empty frame
bleu_df_whole = (
    pd.concat(whole_bleu_frames, axis=0, ignore_index=True) if whole_bleu_frames else pd.DataFrame()
)

gpt_one_example is being processed
TrOCR20 is being processed
claude_simple is being processed
gpt_refine_complex is being processed
claude_refine_complex is being processed
gpt_two_example is being processed
TrOCR is being processed
gpt_simple is being processed
gpt_one_text_example is being processed
claude_two_text_example is being processed
gpt_complex is being processed
claude_complex is being processed
claude_one_example is being processed
claude_one_text_example is being processed
gpt_two_text_example is being processed
EasyOCR is being processed
Pytesseract is being processed
claude_two_example is being processed
KerasOCR is being processed


In [36]:
# Load the whole-scan CER tables, keyed by the part of the file stem after 'scores_'
cer_scores_whole = {}
for file in os.listdir(path + '/results/scores_comparisons/eval_whole/CER'):
    if file.endswith('.csv'):
        read_file = pd.read_csv(path + '/results/scores_comparisons/eval_whole/CER/' + file)
        name = file.split('.')[0]
        name = name.split('scores_')[1]
        cer_scores_whole[name] = read_file

# Stack them into one long (file, cer, model) frame with a single concat,
# rather than re-copying the growing table on every iteration.
whole_cer_frames = []
for key, scores in cer_scores_whole.items():
    tidy = scores.reset_index(drop=True)
    # Assumes each CSV has exactly two columns (scan/file id, score) — the assignment raises otherwise
    tidy.columns = ['file', 'cer']
    tidy['model'] = key
    whole_cer_frames.append(tidy)

# pd.concat raises on an empty list, so fall back to an empty frame
cer_df_whole = (
    pd.concat(whole_cer_frames, axis=0, ignore_index=True) if whole_cer_frames else pd.DataFrame()
)

## Plots

In [37]:
# Box plot of whole-scan BLEU scores, laid out left to right as
# LLM prompts | OCR engines | fine-tuned OCR model.
# The *_text_example prompt variants are not part of this figure.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'TrOCR', 'KerasOCR']
ocr_ft_order = ['TrOCR20']

# One Set2 colour per model family. ocr_ft_color is defined here because the
# TrOCR20 trace below uses it; otherwise this cell only runs if an earlier
# plotting cell happened to leave the name in the kernel.
gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
ocr_ft_color = px.colors.qualitative.Set2[3]

# (model, colour) pairs in the order the traces are added
model_colors = (
    [(m, gpt_color if 'gpt' in m else claude_color) for m in llm_order]
    + [(m, ocr_color) for m in ocr_order]
    + [(m, ocr_ft_color) for m in ocr_ft_order]
)

fig = go.Figure()
for model, color in model_colors:
    model_rows = bleu_df_whole[bleu_df_whole['model'] == model]
    fig.add_trace(go.Box(
        x=model_rows['model'],
        y=model_rows['bleu'],
        name=model,
        boxmean=True,  # also draw the mean
        marker=dict(color=color)
    ))

# Categories sit at integer x positions 0..n-1, so a group of k boxes that starts
# at position s is centred at s + (k - 1) / 2 (s + k / 2 would be half a slot too far right).
ocr_start_index = len(llm_order)  # first position of ocr_order
llm_midpoint = (ocr_start_index - 1) / 2
ocr_midpoint = ocr_start_index + (len(ocr_order) - 1) / 2

fig.update_layout(
    xaxis_title='Model',
    yaxis_title='BLEU Score',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',  # custom ordering instead of order of appearance
        # ocr_ft_order is not listed; per the plotly axis reference, unlisted
        # categories are placed after the listed ones, so TrOCR20 comes last.
        categoryarray=llm_order + ocr_order
    ),
    showlegend=False,
    shapes=[
        # Vertical separator halfway between the last LLM box and the first OCR box
        dict(
            type="line",
            x0=ocr_start_index - 0.5,
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # paper coordinates: span the full plot height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # Group captions placed above the plotting area (y > 1 in paper coordinates)
        dict(
            x=x_center,
            y=1.2,
            xref='x',
            yref='paper',
            text=caption,
            showarrow=False,
            font=dict(size=14)
        )
        for x_center, caption in [(llm_midpoint, '<LLMs>'), (ocr_midpoint, '<OCRs>')]
    ]
)

fig.show()



In [None]:
# fig.write_image(path + '/results/plots/bleu_scores_whole.eps')

In [38]:
# Box plot of the CER values in `cer_df_whole`, one box per model:
# LLM prompt variants on the left, OCR models on the right, separated by a vertical line.
llm_order = ['gpt_simple', 'claude_simple', 'gpt_complex', 'claude_complex',
             'gpt_one_example', 'claude_one_example', 'gpt_two_example', 'claude_two_example',
            #  'gpt_one_text_example', 'claude_one_text_example', 'gpt_two_text_example', 'claude_two_text_example', 
             'gpt_refine_complex', 'claude_refine_complex']
ocr_order = ['EasyOCR', 'Pytesseract', 'TrOCR', 'KerasOCR']
ocr_ft_order = ['TrOCR20']

gpt_color = px.colors.qualitative.Set2[0]
claude_color = px.colors.qualitative.Set2[1]
ocr_color = px.colors.qualitative.Set2[2]
# Defined in this cell (instead of relying on a value left over from an earlier cell)
# so the fine-tuned OCR traces below do not raise NameError on a fresh kernel.
ocr_ft_color = px.colors.qualitative.Set2[3]

# Map every plotted model to its colour, in plotting order.
# Any LLM whose name does not contain 'gpt' is drawn with the Claude colour.
model_colors = {model: (gpt_color if 'gpt' in model else claude_color) for model in llm_order}
model_colors.update({model: ocr_color for model in ocr_order})
model_colors.update({model: ocr_ft_color for model in ocr_ft_order})

fig = go.Figure()

# One box trace per model; boxmean=True additionally marks the mean inside each box.
for model, color in model_colors.items():
    model_rows = cer_df_whole[cer_df_whole['model'] == model]
    fig.add_trace(go.Box(
        x=model_rows['model'],
        y=model_rows['cer'],
        name=model,
        boxmean=True,
        marker=dict(color=color)
    ))

# Categories on a categorical axis sit at the integer positions 0..n-1,
# so a group of n categories starting at position s is centred at s + (n - 1) / 2.
ocr_start_index = len(llm_order)  # The first position of ocr_order
ocr_group_size = len(ocr_order) + len(ocr_ft_order)  # Everything drawn right of the separator
llm_midpoint = (ocr_start_index - 1) / 2  # Centre of the LLM group for placing the text
ocr_midpoint = ocr_start_index + (ocr_group_size - 1) / 2  # Centre of the OCR group for placing the text

# Update the layout with custom category ordering, vertical line, and annotations
fig.update_layout(
    # title='cer Scores',
    xaxis_title='Model',
    yaxis_title='CER Score',
    yaxis=dict(range=[-0.5, 2]),  # Fixed y-range: a zoomed view, values outside it are not visible
    # margin=dict(l=10, r=10, t=10, b=10),
    plot_bgcolor='rgba(0,0,0,0)',
    # paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(
        categoryorder='array',  # Set ordering to be custom
        # List the fine-tuned models explicitly: Plotly would append unlisted categories
        # after the listed ones anyway, this just makes the resulting order visible here.
        categoryarray=llm_order + ocr_order + ocr_ft_order
    ),
    showlegend=False,
    shapes=[
        # Add a vertical line between LLM and OCR models
        dict(
            type="line",
            x0=ocr_start_index - 0.5,  # Place the line between the two groups
            x1=ocr_start_index - 0.5,
            y0=0,
            y1=1,
            xref="x",
            yref="paper",  # Stretch the line across the plot's full height
            line=dict(color="black", width=2)
        )
    ],
    annotations=[
        # One group label above each block of models (y is in paper coordinates, above the plot area)
        dict(
            x=group_midpoint,
            y=1.2,
            xref='x',
            yref='paper',
            text=group_label,
            showarrow=False,
            font=dict(size=14)
        )
        for group_midpoint, group_label in [(llm_midpoint, '<LLMs>'), (ocr_midpoint, '<OCRs>')]
    ]
)

fig.show()



In [73]:
# fig.write_image(path + '/results/plots/cer_scores_zoomed_whole.pdf')

# T-test (Comparisons between the models)

In [22]:
# Drop the four *_text_example prompt variants, then summarise the BLEU score
# of each remaining model by its mean and standard deviation.
bleu_df_filtered = bleu_df.loc[
    ~bleu_df['model'].isin([
        'gpt_one_text_example', 'gpt_two_text_example',
        'claude_one_text_example', 'claude_two_text_example',
    ])
]
agg_bleu = bleu_df_filtered.groupby('model')['bleu'].agg(['mean', 'std']).reset_index()
agg_bleu.columns = ['model', 'mean', 'std']

In [23]:
# Rank the models from the highest to the lowest mean BLEU and renumber the rows 0..n-1.
agg_bleu = agg_bleu.sort_values(by='mean', ascending=False, ignore_index=True)
agg_bleu


Unnamed: 0,model,mean,std
0,gpt_complex,0.277885,0.349838
1,claude_complex,0.253559,0.369379
2,gpt_refine_complex,0.241407,0.291929
3,claude_two_example,0.198892,0.351342
4,claude_refine_complex,0.194788,0.297527
5,gpt_refine,0.159854,0.234093
6,gpt_simple,0.154906,0.229088
7,claude_refine,0.118437,0.220408
8,claude_one_example,0.114732,0.272312
9,gpt_two_example,0.112699,0.099706


In [25]:
# Drop the four *_text_example prompt variants, summarise the CER of each remaining
# model by mean and standard deviation, and rank the models by mean CER.
cer_df_filtered = cer_df.loc[
    ~cer_df['model'].isin([
        'gpt_one_text_example', 'gpt_two_text_example',
        'claude_one_text_example', 'claude_two_text_example',
    ])
]
agg_cer = (
    cer_df_filtered
    .groupby('model')['cer']
    .agg(['mean', 'std'])
    .reset_index()
    .sort_values(by='mean', ascending=True)  # ascending: a lower CER means a better transcription
    .reset_index(drop=True)
)
agg_cer

Unnamed: 0,model,mean,std
0,gpt_complex,0.242133,0.191534
1,gpt_refine_complex,0.252526,0.189149
2,claude_complex,0.288068,0.221569
3,gpt_refine,0.429612,0.479808
4,gpt_simple,0.452401,0.496205
5,claude_refine,0.557395,0.584668
6,claude_two_example,0.571744,1.713213
7,claude_one_example,0.58514,1.022375
8,claude_refine_complex,0.602674,3.81754
9,TrOCR50,0.616124,0.830115


In [26]:
import scipy.stats as stats
import itertools

In [27]:
def ttest(score_df, models, score = 'bleu', ind = 'id'):
    """Run a paired t-test on `score` for every pair of models.

    Parameters
    ----------
    score_df : pandas.DataFrame
        Long-format scores with a 'model' column, the `score` column and the
        `ind` column identifying the sample each score belongs to.
    models : iterable of str
        Model names to compare. Every unordered pair is tested once, in the
        order given (``itertools.combinations``).
    score : str, default 'bleu'
        Name of the column holding the values to compare.
    ind : str, default 'id'
        Name of the column used to pair the observations of two models.

    Returns
    -------
    pandas.DataFrame
        One row per pair with the columns 'Model 1', 'Model 2', 't-statistic',
        'p-value' and 'significance'. The frame is empty (but has these
        columns) when fewer than two models are given.

    Notes
    -----
    The test is ``scipy.stats.ttest_rel(model1_scores, model2_scores)``.
    Pairs sharing fewer than two samples get NaN for both statistics; a NaN
    p-value is labelled 'Not Significant' because it is not below 0.05.
    """
    result_columns = ['Model 1', 'Model 2', 't-statistic', 'p-value']
    paired_t = []

    # for model1, model2 in itertools.pairwise(models): # alternative: only neighbouring pairs, in order
    for model1, model2 in itertools.combinations(models, 2):  # all pairs
        t1 = score_df[score_df['model'] == model1].set_index(ind)[score]
        t2 = score_df[score_df['model'] == model2].set_index(ind)[score]

        # Models were not all scored on the same samples (e.g. the one/two-example prompts),
        # so keep only the samples that both models of the pair have in common.
        paired_1, paired_2 = t1.align(t2, join='inner')

        if len(paired_1) < 2:
            # A paired t-test is undefined for fewer than two paired observations.
            t_stat, p_value = float('nan'), float('nan')
        else:
            t_stat, p_value = stats.ttest_rel(paired_1, paired_2)
        paired_t.append({'Model 1': model1, 'Model 2': model2, 't-statistic': t_stat, 'p-value': p_value})

    # Build the table once, after the loop. Passing the columns explicitly keeps the
    # layout stable even when no pair was tested (previously `paired_t_df` was unbound then).
    paired_t_df = pd.DataFrame(paired_t, columns=result_columns)
    paired_t_df['significance'] = [
        'Significant' if p_value < 0.05 else 'Not Significant'
        for p_value in paired_t_df['p-value']
    ]

    return paired_t_df

In [28]:
# Distinct model names, in the row order of the aggregate tables (sorted by mean score).
models_bleu = pd.unique(agg_bleu['model'])
models_cer = pd.unique(agg_cer['model'])

In [29]:
# Pairwise paired t-tests between all models, observations paired on the 'id' column.
paired_t_bleu = ttest(score_df=bleu_df, models=models_bleu, score='bleu', ind='id')
paired_t_cer = ttest(score_df=cer_df, models=models_cer, score='cer', ind='id')

In [30]:
# Display the pairwise paired t-test table computed on the BLEU scores.
# NOTE(review): rows with NaN statistics are labelled 'Not Significant' by ttest().
paired_t_bleu

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,gpt_complex,claude_complex,1.307240,1.921961e-01,Not Significant
1,gpt_complex,gpt_refine_complex,5.127276,5.469902e-07,Significant
2,gpt_complex,claude_two_example,3.376621,8.377496e-04,Significant
3,gpt_complex,claude_refine_complex,4.501119,9.898492e-06,Significant
4,gpt_complex,gpt_refine,6.914553,3.140045e-11,Significant
...,...,...,...,...,...
148,KerasOCR,TrOCR,,,Not Significant
149,KerasOCR,EasyOCR,,,Not Significant
150,gpt_one_example,TrOCR,,,Not Significant
151,gpt_one_example,EasyOCR,,,Not Significant


In [31]:
# Display the pairwise paired t-test table computed on the CER scores.
paired_t_cer

Unnamed: 0,Model 1,Model 2,t-statistic,p-value,significance
0,gpt_complex,gpt_refine_complex,-3.300705,1.088520e-03,Significant
1,gpt_complex,claude_complex,-6.109194,3.315094e-09,Significant
2,gpt_complex,gpt_refine,-6.405995,6.239164e-10,Significant
3,gpt_complex,gpt_simple,-6.935243,2.771370e-11,Significant
4,gpt_complex,claude_refine,-8.794849,1.451432e-16,Significant
...,...,...,...,...,...
148,Pytesseract,gpt_two_example,-4.287111,2.493251e-05,Significant
149,Pytesseract,claude_simple,-1.992363,4.729516e-02,Significant
150,TrOCR,gpt_two_example,-1.143563,2.537815e-01,Not Significant
151,TrOCR,claude_simple,-1.602616,1.101390e-01,Not Significant


In [72]:
# sig = paired_t_df[paired_t_df['p-value'] < 0.05].sort_values('p-value') 
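# The paired t-test is computed on the per-sample differences Model 1 - Model 2.
# The sign of the t-statistic follows the mean difference: t > 0 means Model 1 scored higher on average, t < 0 means Model 2 did.
# The p-value is two-sided, so it depends only on the magnitude |t|: the larger |t|, the smaller the p-value.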

In [73]:
# pd.crosstab(sig['Model 1'], sig['Model 2'])

Model 2,TrOCR,TrOCR20,claude_refine,claude_two_example
Model 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claude_complex,0,0,0,1
claude_simple,0,1,0,0
gpt_simple,0,0,1,0
pytesseractOCR,1,0,0,0


In [33]:
# paired_t_bleu.to_csv(path + '/results/scores_comparisons/paired_t-test_bleu_allcombi_perline.csv', index=False)

## Whole Scans

In [40]:
# Drop the four *_text_example prompt variants, summarise the BLEU of each remaining
# model by mean and standard deviation, and rank the models by mean BLEU.
bleu_df_whole_filtered = bleu_df_whole.loc[
    ~bleu_df_whole['model'].isin([
        'gpt_one_text_example', 'gpt_two_text_example',
        'claude_one_text_example', 'claude_two_text_example',
    ])
]
agg_bleu_whole = (
    bleu_df_whole_filtered
    .groupby('model')['bleu']
    .agg(['mean', 'std'])
    .reset_index()
    .sort_values(by='mean', ascending=False)  # descending: the highest mean BLEU comes first
    .reset_index(drop=True)
)
agg_bleu_whole

Unnamed: 0,model,mean,std
0,gpt_two_example,0.296086,0.055355
1,claude_two_example,0.266913,0.050648
2,claude_complex,0.253886,0.063031
3,gpt_one_example,0.244748,0.130801
4,claude_one_example,0.240243,0.057789
5,gpt_refine_complex,0.163442,0.071983
6,gpt_complex,0.157995,0.078886
7,claude_refine_complex,0.151734,0.10701
8,claude_simple,0.069342,0.036418
9,gpt_simple,0.00296,0.010921


In [41]:
# Drop the four *_text_example prompt variants, summarise the CER of each remaining
# model by mean and standard deviation, and rank the models by mean CER.
cer_df_whole_filtered = cer_df_whole.loc[
    ~cer_df_whole['model'].isin([
        'gpt_one_text_example', 'gpt_two_text_example',
        'claude_one_text_example', 'claude_two_text_example',
    ])
]
agg_cer_whole = (
    cer_df_whole_filtered
    .groupby('model')['cer']
    .agg(['mean', 'std'])
    .reset_index()
    .sort_values(by='mean', ascending=True)  # ascending: a lower CER means a better transcription
    .reset_index(drop=True)
)
agg_cer_whole

Unnamed: 0,model,mean,std
0,claude_two_example,0.43403,0.032723
1,claude_one_example,0.494389,0.112166
2,gpt_two_example,0.641059,0.313478
3,gpt_one_example,0.653518,0.177353
4,claude_complex,0.683409,0.111397
5,claude_refine_complex,0.757687,0.245322
6,TrOCR20,0.760872,0.010756
7,KerasOCR,0.782537,0.007306
8,Pytesseract,0.887165,0.056133
9,TrOCR,0.964751,0.016594


In [42]:
# Distinct model names, in the row order of the aggregate tables (sorted by mean score).
models_bleu_whole = pd.unique(agg_bleu_whole['model'])
models_cer_whole = pd.unique(agg_cer_whole['model'])

# Pairwise paired t-tests between all models, observations paired on the 'file' column.
paired_t_bleu_whole = ttest(score_df=bleu_df_whole, models=models_bleu_whole, score='bleu', ind='file')
paired_t_cer_whole = ttest(score_df=cer_df_whole, models=models_cer_whole, score='cer', ind='file')

In [45]:
# Write the CER t-test table (all model pairs) to disk, without the row index.
paired_t_cer_whole.to_csv(
    path_or_buf=path + '/results/scores_comparisons/paired_t-test_cer_allcombi_whole.csv',
    index=False,
)