In [None]:
# Structured pdf
import pdfplumber
import pandas as pd
import re

# Gather the table rows of every page first and build the DataFrame once.
# Concatenating inside the loop re-copies the accumulated frame on each page
# and starts from an empty frame, which recent pandas versions warn about.
table_rows = []
with pdfplumber.open(r"data_pdf_1.pdf") as pdf:
    for page in pdf.pages:
        page_table = page.extract_table()
        # Skip pages where no table was detected (nothing to append).
        if page_table:
            table_rows.extend(page_table)

# Fail with a readable message instead of an IndexError from df.iloc[0].
if not table_rows:
    raise ValueError("no table rows were extracted from data_pdf_1.pdf")

df = pd.DataFrame(table_rows)

# The first extracted row holds the column titles: promote it to the header
# and drop it from the data rows.
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True)
df.head()

In [None]:
# Unstructured pdf
import re
import pandas as pd
import pdfplumber

# Read the text of every page. The context manager closes the PDF handle
# even if extraction fails part-way through.
lines = []
with pdfplumber.open(r"data_pdf_2.pdf") as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        # A page without extractable text may give None or "" (depends on the
        # pdfplumber version); skip it rather than crash on .split().
        if page_text:
            lines.extend(page_text.split("\n"))

# Fail with a readable message instead of an IndexError from lines[0].
if not lines:
    raise ValueError("no text could be extracted from data_pdf_2.pdf")

# The first text line carries the column titles, one whitespace-separated
# token per column.
header = lines[0].split()

# Keep only the lines that start with at least five digits; everything else
# (repeated headers, blank lines, ...) is dropped.
data = [line for line in lines[1:] if re.match(r"\d{5}", line.strip())]

# Using basic list and str handling
def clean_data(string) :
    """Split one whitespace-separated data line into its seven fields.

    The trailing fields have a fixed token count, so they are located from
    the right-hand end of the line: the last three tokens are the dates, the
    two tokens before them form the period name, and the token before that
    is the ticker. Whatever lies between the leading id and the ticker is
    joined back into the company name, so the name may contain any number of
    words.

    Parameters
    ----------
    string : str
        A single data line.

    Returns
    -------
    list of str
        [id, name, ticker, period, date, date, date]

    Raises
    ------
    ValueError
        If the line has fewer than 8 tokens and therefore cannot contain an
        id, a name and the six trailing tokens.
    """
    parts = string.split()
    # 1 id token + at least 1 name token + 1 ticker + 2 period + 3 dates
    if len(parts) < 8:
        raise ValueError(f"cannot split line into 7 fields: {string!r}")
    return [
        parts[0],
        " ".join(parts[1:-6]),
        parts[-6],
        " ".join(parts[-5:-3]),
        *parts[-3:],
    ]

# Using regular expression
def clean_data_re(string) :
    """Split one data line into its seven fields with a regular expression.

    The pattern captures, in order: a numeric id, the company name (any
    text), an alphabetic ticker, a period name of the form ``Q<digit> NN``
    or ``FY NN``, and three ``DD-MM-YYYY`` dates.

    Parameters
    ----------
    string : str
        A single data line; surrounding whitespace is ignored.

    Returns
    -------
    tuple of str, or list of float
        The seven captured fields when the line matches, otherwise a list of
        seven NaN values so the row still fits a seven-column frame.
    """
    pattern = r'^(\d+)\s+(.+?)\s+([A-Za-z]+)\s+(Q\d\s+\d{2}|FY\s+\d{2})\s+(\d{2}-\d{2}-\d{4})\s+(\d{2}-\d{2}-\d{4})\s+(\d{2}-\d{2}-\d{4})$'
    # Strip first: the pattern is anchored with ^...$ and would otherwise
    # reject lines carrying leading or trailing whitespace.
    result = re.match(pattern, string.strip())
    # float("nan") instead of np.nan: numpy is not imported in this notebook,
    # so the original fallback raised NameError on any non-matching line.
    return result.groups() if result else [float("nan")] * 7

# Parse every data line with the split-based parser and label the columns
# with the header tokens. To use the regular-expression parser instead,
# call clean_data_re in place of clean_data below.
parsed_rows = [clean_data(line) for line in data]
df = pd.DataFrame(parsed_rows, columns=header)
df.head()

In [1]:
# Scanned PDF - Tesseract
import pandas as pd
import re

from pdf2image import convert_from_path
import pytesseract

# Path to Tesseract executable (adjust this)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Convert PDF to images (one image per page, rendered at 300 dpi)
pdf_path = "data_pdf_4.pdf"  # other sample tried: "data_pdf_3.pdf"
images = convert_from_path(pdf_path, dpi=300)


# Extract text from each image. Every page's text is kept in page_texts so
# that it is still available after the loop; `text` alone only ever holds
# the most recently processed page.
page_texts = []
for img in images:
    text = pytesseract.image_to_string(img)
    page_texts.append(text)
    print(text)


COID CoName

10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.
10688 Meta Platforms, Inc.

Ticker PeriodName PeriodEndDate FirstFillingDate LatestFillingDate

META
META
META
META
META
META
META
META
META
META
META
META
META
META
META
META
META
META
META
META
META
META
META

Q107
Q2 07
Q3 07
Q4 07
FY 07
Q1 08
Q2 08
Q3 08
Q4 08
FY 08
Q1 09
Q2 09
Q3 09
Q4 09
FY 09
Q1 10
Q2 10
Q3 10
Q4 10
FY 10
Qiil
Q2 11
0311

31-03-2007
30-06-2007
30-09-2007
31-12-2007
3

In [None]:
# Scanned PDF - Easy-OCR
import pandas as pd
import re

from pdf2image import convert_from_path
from easyocr import Reader

import os
import tempfile

# Convert PDF to images (one image per page, rendered at 300 dpi)
pdf_path = "data_pdf_4.pdf"  # other sample tried: "data_pdf_3.pdf"
images = convert_from_path(pdf_path, dpi=300)
reader = Reader(['en'])

# Extract text from each image. The reader is given a file path, so each page
# is written to a PNG first. The PNG lives in a throw-away directory instead
# of a fixed "temp.png" in the working directory, so no existing file is
# overwritten and nothing is left behind afterwards.
page_texts = []
with tempfile.TemporaryDirectory() as tmp_dir:
    page_png = os.path.join(tmp_dir, "page.png")
    for img in images:
        img.save(page_png, 'PNG')
        # detail=0 is passed through unchanged from the original call.
        text = reader.readtext(page_png, detail=0)
        # Keep every page's result; `text` only holds the latest page.
        page_texts.append(text)
        print(text)


In [3]:
# Scanned PDF - Keras-OCR
import pandas as pd
import re

from pdf2image import convert_from_path
import keras_ocr

import os
import tempfile

# Build the keras-ocr pipeline once, before the per-page loop.
pipeline = keras_ocr.pipeline.Pipeline()

# Convert PDF to images (one image per page, rendered at 300 dpi)
pdf_path = "data_pdf_5.pdf"  # other samples tried: "data_pdf_4.pdf", "data_pdf_3.pdf"
images = convert_from_path(pdf_path, dpi=300)

# Extract text from each image. Each page is written to a PNG that is then
# read back with keras_ocr.tools.read. The PNG lives in a throw-away
# directory instead of a fixed "temp.png" in the working directory, so no
# existing file is overwritten and nothing is left behind afterwards.
page_texts = []
with tempfile.TemporaryDirectory() as tmp_dir:
    page_png = os.path.join(tmp_dir, "page.png")
    for img in images:
        img.save(page_png, 'PNG')
        # Read image
        image = keras_ocr.tools.read(page_png)
        # One image goes in, so the single result group is at index 0.
        prediction_groups = pipeline.recognize([image])

        # Each prediction is a (word, box) pair; only the words are kept.
        text = [word for word, box in prediction_groups[0]]
        # Keep every page's result; `text` only holds the latest page.
        page_texts.append(text)
        print(text)


Looking for C:\Users\vaide\.keras-ocr\craft_mlt_25k.h5
Looking for C:\Users\vaide\.keras-ocr\crnn_kurapan.h5
['amows', 'millionss', '103', '203', '502', '1025', '2025', '104', '2024', '5024', 'so2a', '2024', '1025', '2a2se', '302se', 'sazse', '202se', '2026e', 'share', 'ovlal', 'd', 'ereps', 'eer', 'adhortising', 'rotenue', 'sss', 'saog', 'ssoa', 'ss76', 'slgal', 'sot', 'st11', 'sas', 's9ea', 'sipa', 's11s9', 'sixs', 's1255', 'slaos', 'sagna', 'sq175', 's8l', 'sjaa', 'sje0', '87', 'slaa', 's1bd', 's3', 'slass', '825', '525', 'apes', 'reenue', 's369', '565', 'sts0', 'sliss', 'siet', 'siass', 'sszag', 'seulze', 'tatol', 'rotnus', 'st1s', 'shsa', 's954', 'szhs', 'sldss', 'slded', 'sajos', 'sizos', 'sipss', 'slfos', '156', '5267', 'cost', 'of', 'gaap', '5760', 'szl', 'sloag', '5241', 's700', 's761', 's41s', 'slies6', 's769', 's180', 's185', 'siad', 's6la', 'ss69', 'rerenus', 'non', '5156', 'sig', 'scoz', 'sl6m', 'sto7', 'sboo', 'sloss', 's1555', 'slz1', 'slots', 'slizd', '1165', 'sosta', '