In [1]:
import csv
import os

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

In [2]:
# Setting up the pytesseract executable path.
# The location can be overridden with the TESSERACT_CMD environment variable;
# otherwise the default Windows install location is tried.
TESSERACT_CMD = os.environ.get("TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe')

# Only override pytesseract's setting when the executable really exists there.
# On machines without that file the attribute is left untouched, so whatever
# pytesseract is already configured to use stays in effect.
if os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

In [3]:
# Function for PDF extraction
def extract_pdf(file, dpi=300):
    """Run OCR on every page of a PDF and return the combined text.

    Parameters
    ----------
    file : str
        Path of the PDF, passed straight to ``convert_from_path``.
    dpi : int, optional
        Resolution used when rasterising the pages (default 300).

    Returns
    -------
    str
        The OCR text of all pages, joined with a newline between pages.
    """
    pages = convert_from_path(file, dpi=dpi)  # Convert PDF pages to images
    page_texts = []
    for page in pages:
        # Force three RGB channels, then reorder them to BGR because
        # preprocess_image converts to grayscale with COLOR_BGR2GRAY.
        # Passing the RGB array directly would swap the red/blue weights.
        rgb_pixels = np.array(page.convert("RGB"))
        bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

        # Preprocess image for better OCR results
        preprocessed_image = preprocess_image(bgr_pixels)
        page_texts.append(pytesseract.image_to_string(preprocessed_image))
    return "\n".join(page_texts)

In [4]:
# Function for image extraction (png, jpg, jpeg files)
def extract_img(file):
    """Run OCR on a single image file and return the recognised text.

    Parameters
    ----------
    file : str
        Path of the image, passed to ``cv2.imread``.

    Returns
    -------
    str
        The text returned by ``pytesseract.image_to_string``.

    Raises
    ------
    OSError
        If ``cv2.imread`` could not load the file.
    """
    image = cv2.imread(file)
    # cv2.imread does not raise on failure; it returns None, which would
    # otherwise surface later as an obscure error inside preprocess_image.
    if image is None:
        raise OSError(
            f"Could not read image {file!r}: the file is missing, unreadable, "
            "or not in a format OpenCV can decode."
        )

    # Preprocess image for better OCR results
    preprocessed_image = preprocess_image(image)
    return pytesseract.image_to_string(preprocessed_image)

In [5]:
# Image preprocessing functions
def _deskew_rotation(rect_angle):
    """Turn a ``cv2.minAreaRect`` angle into the rotation that levels the text.

    Older OpenCV releases report the angle in [-90, 0) and newer ones in
    (0, 90]. Both describe the same rectangle modulo 90 degrees, so the angle
    is reduced modulo 90 and the result lies in (-45, 45].
    """
    reduced = rect_angle % 90.0
    if reduced >= 45.0:
        reduced -= 90.0
    return -reduced


def preprocess_image(image):
    """Binarise, smooth and deskew an image before it is passed to OCR.

    Parameters
    ----------
    image : numpy.ndarray
        A 3-channel array (interpreted as BGR, the layout ``cv2.imread``
        produces), a 4-channel BGRA array, or an already-grayscale array of
        shape (h, w) or (h, w, 1).

    Returns
    -------
    numpy.ndarray
        Single-channel image with the same height and width as ``image``.
        Because ``THRESH_BINARY_INV`` is used, originally dark pixels (the
        text) are bright and the background is dark.
    """
    # Convert to grayscale (single-channel input is used as it is)
    if image.ndim == 2:
        gray = image
    elif image.shape[2] == 1:
        gray = image[:, :, 0]
    elif image.shape[2] == 4:
        gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply Otsu thresholding (global, automatically chosen threshold), inverted
    # so that text pixels are non-zero for the skew estimation below
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)

    # Apply dilation (with a 1x1 kernel this leaves the image unchanged;
    # enlarge the kernel to actually thicken strokes)
    kernel = np.ones((1, 1), np.uint8)
    dilated = cv2.dilate(binary, kernel, iterations=1)

    # Apply noise reduction
    blur = cv2.GaussianBlur(dilated, (5, 5), 0)

    # Skew correction: fit a minimum-area rectangle around all non-zero pixels.
    # The points are (row, col) pairs, exactly as np.where returns them.
    coords = np.column_stack(np.where(blur > 0))
    if coords.shape[0] == 0:
        # Blank image: there are no points to fit, so nothing to deskew
        return blur

    angle = _deskew_rotation(cv2.minAreaRect(coords)[-1])

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(blur, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    return rotated

In [6]:
# Depending on the file type, call the appropriate function and print the extracted text
file = "Sample Files/sample3.jpg"

# Match the extension case-insensitively so names such as "scan.JPG" or
# "report.PDF" are accepted as well; the original path is still what gets opened.
file_lower = file.lower()

if file_lower.endswith('.pdf'):
    text = extract_pdf(file)
    print(text)
elif file_lower.endswith(('.png', '.jpg', '.jpeg')):
    text = extract_img(file)
    print(text)
else:
    print("Invalid file format. Please provide a JPEG, PNG or PDF file.")

NS S816 jt 1:
Serve as a dal:

1 be. The wore
—epeating ther

has had a ver.

member, John.
read:

$$ ie, 1te

oak tell
youl
, fd

y c

You

——_— yj il

aa vel

In that instant, | saw the connection between physical vitality
and mental agility, Julian was in picture-perfect health and looked
many years younger than he had when we had first met. He
brimmed with vibrancy and it appeared that his energy, enthus‘asm
and optimism knew no bounds. | could see that he had made many
changes to his former lifestyle, but it was obvious that the starting
point of his magnificent transformation was mental fitness. Success
on the outside indeed begins with success on the inside, and by
changing his thoughts, Julian Mantle had changed his life.

“Exactly how can I develop this positive, serene and inspired
attitude, Julian? After all these years in my routine, I think my
mental muscles have grown a little flabby. Come to think of it, I
have very little control over the thoughts that are floating arou

In [7]:
# Write the OCR result to a one-column CSV: a header row, then the text itself
output_file = "extracted_data.csv"
with open(output_file, mode='w', encoding='utf-8', newline='') as csv_handle:
    csv_writer = csv.writer(csv_handle)
    csv_writer.writerow(['Extracted Text'])  # column header
    csv_writer.writerow([text])              # whole extracted text in a single cell

print(f"Extracted data saved to {output_file} successfully.")

Extracted data saved to extracted_data.csv successfully.
