## Image to Text

**Popular free tools**
- Tesseract
- Easy-OCR
- Keras-OCR


In [None]:
# Using Tesseract
from pytesseract import image_to_string
import numpy as np
import pandas as pd
import re

# OCR the table image with Tesseract (English; engine/segmentation modes are
# chosen through the `--oem` / `--psm` flags) and break the result into lines.
text = image_to_string(
    "data.png",
    lang="eng",
    config=r'--oem 3 --psm 6',
)
lines = text.split("\n")

# The first OCR line is taken as the header; whitespace separates the names.
header = lines[0].split()
# Keep the remaining lines that, once stripped, begin with five digits.
# The lines themselves are kept unstripped.
data = list(filter(lambda row: re.match(r"\d{5}", row.strip()), lines[1:]))

# One table row: leading digits, free text, a letters-only token, a period
# label ("Q" + three digits, or "FY" + two digits) and three DD-DD-DDDD dates.
# Compiled once because clean_data is applied to every OCR line.
_ROW_PATTERN = re.compile(
    r'^(\d+)\s+(.+?)\s+([A-Za-z]+)\s+(Q\d\d{2}|FY\s+\d{2})'
    r'\s+(\d{2}-\d{2}-\d{4})\s+(\d{2}-\d{2}-\d{4})\s+(\d{2}-\d{2}-\d{4})$'
)


def clean_data(string):
    """Split one OCR'd table line into its seven fields.

    Surrounding whitespace is ignored, so a line that only differs from a
    valid row by leading/trailing blanks is still parsed (the pattern is
    anchored at both ends and would otherwise reject it).

    Args:
        string: A single text line.

    Returns:
        A 7-tuple of strings (the regex groups) when the line matches the
        row pattern, otherwise a list of seven ``np.nan`` placeholders.
    """
    result = _ROW_PATTERN.match(string.strip())
    if result is None:
        return [np.nan] * 7
    return result.groups()

# Parse each candidate line, label the columns with the OCR'd header, then
# drop rows containing NaN (lines that clean_data could not parse).
df = pd.DataFrame(
    [clean_data(row) for row in data],
    columns=header,
)
df = df.dropna()
df.head()

In [None]:
# Same Tesseract settings applied to a second image; the raw lines are
# displayed as the cell output for inspection.
text = image_to_string(
    "data5.png",
    lang="eng",
    config=r'--oem 3 --psm 6',
)
lines = text.split("\n")
lines

In [None]:
# Using easy-ocr
from easyocr import Reader
import pandas as pd
import numpy as np

# Read the image with an English EasyOCR reader. The result is sliced and fed
# to np.append below, so it is presumably a flat list of strings (TODO confirm).
reader = Reader(['en'])
text = reader.readtext("data.png", detail=0)

# NOTE(review): every number below (skip 3 leading items, pad with 5 ones,
# 24 x 6 grid, drop the last 2 rows) is hard-coded and looks tuned to this
# particular image -- re-check them if the input changes.
data = np.append(text[3:], np.ones(5))
df = pd.DataFrame(
    data.reshape((24, 6)),
    columns=['A', 'Ticker', "PeriodName", "PeriodEndDate", "FirstFillingDate", 'LatestFillingDate'],
)
df = df.iloc[:-2]
# Append one hand-typed row at the end of the table.
df.loc[len(df)] = ['10688 Meta Platforms, Inc.', 'META', '03 11', '30-09-2011', '15-10-2011', '15-10-2011']
# Split column 'A' (e.g. '10688 Meta Platforms, Inc.') into id and company name.
# The separator is the first whitespace run that follows a digit: the
# look-behind leaves that digit in COID (using r'\d\s' as the separator would
# swallow the id's last digit), and n=1 prevents a later "digit + space" inside
# a company name from producing extra columns.
df[['COID', 'CoName']] = df['A'].str.split(r'(?<=\d)\s+', n=1, regex=True, expand=True)
# Drop the combined 'A' column and put the two new columns first.
df = df[['COID', 'CoName', 'Ticker', 'PeriodName', 'PeriodEndDate', 'FirstFillingDate', 'LatestFillingDate']]
df.head(2)

In [None]:
import cv2
import numpy as np
from easyocr import Reader
from PIL import Image

# Step 1: Load and preprocess image
img = cv2.imread("data5.png")
if img is None:
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising; fail here with a clear message instead of inside cvtColor.
    raise FileNotFoundError("Could not read image 'data5.png'")

# Convert to grayscale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Resize image (improves OCR for small text)
scale_percent = 150  # scale by 150%
width = int(gray.shape[1] * scale_percent / 100)
height = int(gray.shape[0] * scale_percent / 100)
resized = cv2.resize(gray, (width, height), interpolation=cv2.INTER_LINEAR)

# Apply thresholding (binarization); with THRESH_OTSU the threshold argument
# (0) is a placeholder and the returned threshold value is discarded.
_, thresh = cv2.threshold(resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Optional: Denoising
denoised = cv2.fastNlMeansDenoising(thresh, h=30)

# Save temporary preprocessed image
temp_path = "preprocessed.png"
if not cv2.imwrite(temp_path, denoised):
    # Without this check a failed write would go unnoticed and the OCR step
    # could silently read a stale file left over from an earlier run.
    raise OSError(f"Could not write preprocessed image to {temp_path!r}")

# Step 2: OCR using EasyOCR
reader = Reader(['en'], gpu=False)  # set gpu=True if supported
results = reader.readtext(temp_path, detail=0)

results

In [None]:
# keras_ocr
import keras_ocr
import numpy as np
import pandas as pd
print(1)
# Pipeline
pipeline = keras_ocr.pipeline.Pipeline()
print(2)

# Read image
image = keras_ocr.tools.read("data.png")
prediction_groups = pipeline.recognize([image])

# A single image was passed in, so only the first group is used.
data = prediction_groups[0]

# Sort once by rows 0 and 3 of each prediction's second element, flattened
# (presumably two corners of the word's bounding box -- TODO confirm).
# NOTE(review): the first value of row 0 is the primary sort key; if that is
# the x coordinate the order is left-to-right before top-to-bottom -- confirm
# this is the intended reading order.
sorted_predictions = sorted(data, key=lambda val: list(val[1][[0, 3]].flatten()))

# Extract text only (first element of each prediction)
text = [prediction[0] for prediction in sorted_predictions]
print("-".join(text))