In [1]:
import os
import re
import pandas as pd
import requests
import base64
import subprocess
from IPython.display import display, Image
from PIL import Image as PILImage

In [2]:
import keras_ocr

In [None]:
# Project root is taken to be the parent of the current working directory;
# the line images are read from <root>/data/lines.
path = os.path.dirname(os.getcwd())
image_folder = f'{path}/data/lines'

# Encode Images

In [None]:
def encode_image(image_path):
    """Read the file at `image_path` and return its bytes as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        str: The base64 encoding of the file's raw bytes, decoded as UTF-8 text.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

In [None]:
# Collect every .jpg in the image folder (order is whatever os.listdir returns).
images = [entry for entry in os.listdir(image_folder) if entry.endswith('.jpg')]

# File names are parsed as "<prefix>example<file>_<line>.<ext>": the text after
# 'example' in the first '_'-separated part is the file id, the second part is the line id.
rows = []
for image in images:
    stem = image.split('.')[0]
    stem_parts = stem.split('_')
    file_name = stem_parts[0].split('example')[1]
    line_name = stem_parts[1]
    encoded_value = encode_image(f'{image_folder}/{image}')
    rows.append(dict(file=file_name, line=line_name, encoded=encoded_value))

# One row per image: file id, line id and the base64-encoded image content.
images_encoded = pd.DataFrame(rows)

In [None]:
# Cast the ids to integers so the sort is numeric rather than lexicographic,
# then rebuild the "<file>_<line>" identifier from the integer values.
images_encoded = (
    images_encoded
    .astype({'file': 'int', 'line': 'int'})
    .sort_values(by=['file', 'line'])
    .reset_index(drop=True)
)
file_part = images_encoded['file'].astype(str)
line_part = images_encoded['line'].astype(str)
images_encoded['id'] = file_part + '_' + line_part
images_encoded.head(30)

# Keras
* **TODO (review):** please verify that the Keras-OCR code below is correct.

In [None]:
def kerasOCR(image_path, pipeline=None):
    """Run keras-ocr on a single image and return the recognized words as one string.

    Args:
        image_path: Image location, passed unchanged to `keras_ocr.tools.read`.
        pipeline: Optional pre-built `keras_ocr.pipeline.Pipeline`. When omitted,
            a pipeline is created on the first call and cached on the function,
            so the models are not re-instantiated for every image.

    Returns:
        str: The recognized words separated by single spaces, or '' when the
        pipeline returns no (non-empty) words.
    """
    if pipeline is None:
        # Building a Pipeline is expensive; reuse one instance across calls.
        pipeline = getattr(kerasOCR, '_pipeline', None)
        if pipeline is None:
            pipeline = keras_ocr.pipeline.Pipeline()
            kerasOCR._pipeline = pipeline

    image = keras_ocr.tools.read(image_path)
    prediction_groups = pipeline.recognize([image])

    # `recognize` returns one list per input image, each holding (text, box)
    # tuples. Unpack the tuple and keep the whole text: indexing `text[0]`
    # would keep only the first character of each word.
    # NOTE(review): words are kept in the order the pipeline returns them;
    # confirm that this matches reading order for these line images.
    words = [
        text
        for text, _box in prediction_groups[0]
        if isinstance(text, str) and text
    ]
    return ' '.join(words)

In [None]:
# Run the OCR on every .jpg in the image folder. Results are keyed by the part
# of the file name that follows 'example' (extension removed).
kerasOCR_output = {}
jpg_names = (entry for entry in os.listdir(image_folder) if entry.endswith('.jpg'))
for jpg_name in jpg_names:
    recognized = kerasOCR(f'{image_folder}/{jpg_name}')
    stem = jpg_name.split('.')[0]
    kerasOCR_output[stem.split('example')[1]] = recognized

In [None]:
# One row per OCR result; 'file' holds the dictionary key ("<file>_<line>").
ocr_records = list(kerasOCR_output.items())
kerasOCR_output_df = pd.DataFrame(ocr_records, columns=['file', 'text'])

# Split the key into integer file / line numbers and sort numerically on them.
key_columns = ['file_name', 'line_name']
kerasOCR_output_df[key_columns] = kerasOCR_output_df['file'].str.split('_', expand=True)
kerasOCR_output_df[key_columns] = kerasOCR_output_df[key_columns].astype(int)
kerasOCR_output_df = (
    kerasOCR_output_df
    .sort_values(by=key_columns)
    .reset_index(drop=True)
)

# Replace newlines and tabs in the text with spaces.
kerasOCR_output_df['text'] = kerasOCR_output_df['text'].replace(
    to_replace=['\n', '\t'], value=' ', regex=True
)

# Rebuild the "<file>_<line>" id from the integer columns.
file_part = kerasOCR_output_df['file_name'].astype(str)
line_part = kerasOCR_output_df['line_name'].astype(str)
kerasOCR_output_df['id'] = file_part + '_' + line_part
kerasOCR_output_df

In [None]:
# Persist the per-line OCR output (without the DataFrame index).
kerasOCR_csv_path = path + '/results/postprocessed/kerasOCR_perline_output.csv'
kerasOCR_output_df.to_csv(kerasOCR_csv_path, index=False)

# Calculate Scores

In [None]:
# Reference transcriptions; the scoring cells below read its 'id' and 'text' columns.
transcription_csv_path = path + '/data/transcription_perline_text.csv'
df = pd.read_csv(transcription_csv_path)

In [None]:
from evaluate import load

# Load the `cer` and `bleu` metrics (in that order) from the `evaluate` library.
cer_metric, bleu_metric = [load(metric_name) for metric_name in ("cer", "bleu")]

In [None]:
# Reload the saved OCR output. Note the spelling: `kerasOcr` is this DataFrame,
# while `kerasOCR` is the OCR function defined earlier in the notebook.
kerasOCR_results_path = path + '/results/postprocessed/kerasOCR_perline_output.csv'
kerasOcr = pd.read_csv(kerasOCR_results_path)

In [None]:
bleu_gpt = {}
cer_gpt = {}


def _as_text(value):
    """Return `value` as a string, mapping missing values (None / NaN) to ''."""
    return '' if pd.isna(value) else str(value)


# First prediction / reference per id, so each lookup is a dictionary access
# instead of a boolean mask over the whole frame. The predictions come from the
# `kerasOcr` DataFrame, not from the `kerasOCR` function.
pred_by_id = kerasOcr.drop_duplicates('id').set_index('id')['text'].to_dict()
ref_by_id = df.drop_duplicates('id').set_index('id')['text']

for line_id, raw_ref in ref_by_id.items():
    # Empty CSV fields are read back as NaN (which is truthy), so normalize to
    # real strings before testing for emptiness. An id with no OCR row is
    # treated as an empty prediction.
    ref_text = _as_text(raw_ref)
    pred_text = _as_text(pred_by_id.get(line_id))

    # The metrics take lists of strings. BLEU is only computed when both texts
    # contain something; otherwise it is recorded as 0.0.
    if pred_text.strip() and ref_text.strip():
        bleu_gpt[line_id] = bleu_metric.compute(predictions=[pred_text], references=[ref_text])
    else:
        bleu_gpt[line_id] = {'bleu': 0.0}
    cer_gpt[line_id] = cer_metric.compute(predictions=[pred_text], references=[ref_text])

In [None]:
# Turn the {id: metric-dict} mapping into a frame with one row per id. The
# isinstance guard keeps the cell re-runnable: transposing an already-built
# frame a second time would flip it back to one column per id.
if isinstance(bleu_gpt, dict):
    bleu_gpt = pd.DataFrame(bleu_gpt).T
# After the transpose the columns may be object dtype (the metric dicts mix
# floats, ints and lists); make the score numeric before aggregating it.
bleu_gpt['bleu'] = bleu_gpt['bleu'].astype(float)
print(bleu_gpt['bleu'].mean(), bleu_gpt['bleu'].var())

In [None]:
# Expose the index (the per-line id) as a regular 'id' column.
bleu_gpt = bleu_gpt.assign(id=bleu_gpt.index)
bleu_gpt

In [None]:
# The part of the id before '_' is the file number; aggregate BLEU per file.
bleu_file_part = bleu_gpt['id'].str.split('_').str[0]
bleu_gpt['file'] = bleu_file_part.astype(int)
bleu_by_file = bleu_gpt.groupby('file')['bleu']
bleu_by_file.agg(['mean', 'var'])

In [None]:
# Save the per-line BLEU table (the index is written as well).
bleu_csv_path = path + '/results/scores_comparisons/eval/bleu_kerasOCR_perline.csv'
bleu_gpt.to_csv(bleu_csv_path)

In [None]:
# Turn the {id: cer} mapping into a two-column frame. The isinstance guard
# keeps the cell re-runnable: on a DataFrame, `.items()` iterates over columns,
# so converting a second time would produce a meaningless frame.
if isinstance(cer_gpt, dict):
    cer_gpt = pd.DataFrame(list(cer_gpt.items()), columns=['id', 'cer'])

In [None]:
# Overall mean and variance of the per-line CER.
cer_mean = cer_gpt['cer'].mean()
cer_var = cer_gpt['cer'].var()
print(cer_mean, cer_var)

In [None]:
# Save the per-line CER table (the index is written as well).
cer_csv_path = path + '/results/scores_comparisons/eval/cer_kerasOCR_perline.csv'
cer_gpt.to_csv(cer_csv_path)

In [None]:
# The part of the id before '_' is the file number; aggregate CER per file.
cer_file_part = cer_gpt['id'].str.split('_').str[0]
cer_gpt['file'] = cer_file_part.astype(int)
cer_by_file = cer_gpt.groupby('file')['cer']
cer_by_file.agg(['mean', 'var'])