First the text is extracted from the unethical dataset.

In [1]:
from torchmetrics.text import CharErrorRate, WordErrorRate
import pandas as pd
import time

df = pd.read_csv("memotion_dataset_7k/labels.csv")
image_names = df["image_name"][:500].tolist()
text = df["text_corrected"][:500].tolist()
wer = WordErrorRate()
cer = CharErrorRate()

The results for EasyOCR on the dataset:

In [3]:
import easyocr
import cv2 as cv

start_time = time.time()
reader = easyocr.Reader(["en"])
predictions = []

for image_name in image_names:
    image_path = "memotion_dataset_7k/images/" + image_name
    image = cv.imread(image_path, 0)
    result = " ".join(reader.readtext(image, detail=0))
    predictions.append(result)

predictions = [prediction.lower() for prediction in predictions]
text = [txt.lower() for txt in text]
print(f"WER: {round(wer(predictions, text).item(), 2)}")
print(f"CER: {round(cer(predictions, text).item(), 2)}")
print(f"The execution took {(time.time() - start_time):.2f} seconds")



WER: 0.37
CER: 0.18
The execution took 67.72 seconds


The results for Tesseract OCR on the dataset:

In [2]:
import pytesseract
import  cv2 as cv

start_time = time.time()
predictions = []

for image_name in image_names:
    image_path = "memotion_dataset_7k/images/" + image_name
    image = cv.imread(image_path, 0)
    result = pytesseract.image_to_string(image, config="--oem 3 --psm 11")
    predictions.append(result)

predictions = [prediction.lower() for prediction in predictions]
text = [txt.lower() for txt in text]
print(f"WER: {round(wer(predictions, text).item(), 2)}")
print(f"CER: {round(cer(predictions, text).item(), 2)}")
print(f"The execution took {(time.time() - start_time):.2f} seconds")



WER: 0.64
CER: 0.48
The execution took 103.99 seconds


The results for PaddleOCR on the dataset:

In [2]:
from paddleocr import PaddleOCR

start_time = time.time()
ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False, ocr_version="PP-OCRv4", use_space_char=True, use_dilation=True)
predictions = []

for image_name in image_names:
    image_path = "memotion_dataset_7k/images/" + image_name
    result = ocr.ocr(image_path, cls=False)
    if result[0] != None:
        result = " ".join([line[1][0] for res in result for line in res])
    else:
        result = ""
    predictions.append(result)

predictions = [prediction.lower() for prediction in predictions]
text = [txt.lower() for txt in text]
print(f"WER: {round(wer(predictions, text).item(), 2)}")
print(f"CER: {round(cer(predictions, text).item(), 2)}")
print(f"The execution took {(time.time() - start_time):.2f} seconds")





WER: 0.36
CER: 0.15
The execution took 31.11 seconds


The results for GOT OCR on the dataset:

In [2]:
from transformers import AutoModel, AutoTokenizer
from transformers import logging

start_time = time.time()
logging.set_verbosity_error() 

tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
model = AutoModel.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True, low_cpu_mem_usage=True, device_map="cuda", use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
model = model.eval().cuda()
predictions = []

for image_name in image_names:
    image_path = "memotion_dataset_7k/images/" + image_name
    result = model.chat(tokenizer, image_path, ocr_type="ocr")
    predictions.append(result)

predictions = [prediction.lower() for prediction in predictions]
text = [txt.lower() for txt in text]
print(f"WER: {round(wer(predictions, text).item(), 2)}")
print(f"CER: {round(cer(predictions, text).item(), 2)}")
print(f"The execution took {(time.time() - start_time):.2f} seconds")

WER: 0.37
CER: 0.16
The execution took 728.33 seconds


In [2]:
from google.cloud import vision

start_time = time.time()
client = vision.ImageAnnotatorClient()

predictions = []

for image_name in image_names:
    image_path = "memotion_dataset_7k/images/" + image_name
    with open(image_path, "rb") as image_file:
        content = image_file.read()

        image = vision.Image(content=content)

        response = client.text_detection(image=image)

        if response.error.message:
            raise Exception(
                "{}\nFor more info on error messages, check: "
                "https://cloud.google.com/apis/design/errors".format(response.error.message)
            )
        
        texts = response.text_annotations
        result = texts[0].description.replace("\n", " ")
        predictions.append(result)

predictions = [prediction.lower() for prediction in predictions]
text = [txt.lower() for txt in text]
print(f"WER: {round(wer(predictions, text).item(), 2)}")
print(f"CER: {round(cer(predictions, text).item(), 2)}")
print(f"The execution took {(time.time() - start_time):.2f} seconds")

WER: 0.16
CER: 0.11
The execution took 80.68 seconds
