In [None]:
!pip install pdf2image

In [None]:
%%time
from pdf2image import convert_from_path
import os

# Path to the folder containing PDF files
pdf_folder = "dataset"

# Output folder to save images
output_parent_folder = "data"

# Path to the Poppler executable
poppler_path = r"C:/poppler-24.02.0/Library/bin"


# Create the output parent folder if it doesn't exist
os.makedirs(output_parent_folder, exist_ok=True)

# Iterate over each PDF file in the folder
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        # Create a folder with the PDF file's name
        pdf_name = os.path.splitext(pdf_file)[0]
        output_folder = os.path.join(output_parent_folder, pdf_name)
        os.makedirs(output_folder, exist_ok=True)
        
        # Convert only the first page of the PDF to an image
        images = convert_from_path(
            os.path.join(pdf_folder, pdf_file),
            poppler_path=poppler_path,
            first_page=1, last_page=1  # Extract only the first page
        )
        
        # Save the image in the output folder
        images[0].save(os.path.join(output_folder, f"page_1.png"), "PNG", quality=100)

print("First pages saved successfully.")


In [None]:
from PIL import Image, ImageDraw

def draw_rectangle_on_image(image_path, output_path, start_y, end_y):
    # Open the input image
    img = Image.open(image_path)
    
    # Create a drawing object
    draw = ImageDraw.Draw(img)
    
    # Get image width and height
    width, height = img.size
    
    # Define the coordinates for the rectangle
    # Rectangle spans the full width of the image, but with a custom height
    left = 0
    upper = start_y  # Starting y-coordinate for the rectangle
    right = width
    lower = end_y    # Ending y-coordinate for the rectangle
    
    # Draw the rectangle on the image
    draw.rectangle([left, upper, right, lower], outline="red", width=3)
    
    # Save the modified image with the rectangle drawn
    img.save(output_path)

if __name__ == "__main__":
    # Specify the input image path
    input_path = "page_1.png"
    
    # Specify the output image path
    output_path = "image_with_rectangle.png"
    
    # Define the start and end y-coordinates for the rectangle
    start_y = 800  # Adjust this value based on where you want the rectangle to start
    end_y = 1300   # Adjust this value based on where you want the rectangle to end
    
    # Call the function to draw the rectangle on the image
    draw_rectangle_on_image(input_path, output_path, start_y, end_y)


In [None]:
from PIL import Image, ImageDraw

def crop_and_save_region(image_path, output_path, start_y, end_y):
    # Open the input image
    img = Image.open(image_path)
    
    # Get image width and height
    width, height = img.size
    
    # Define the coordinates for the rectangle
    # Rectangle spans the full width of the image, but with a custom height
    left = 0
    upper = start_y  # Starting y-coordinate for the rectangle
    right = width
    lower = end_y    # Ending y-coordinate for the rectangle
    
    # Crop the region defined by the rectangle coordinates
    cropped_img = img.crop((left, upper, right, lower))
    
    # Save the cropped region as PNG
    cropped_img.save(output_path)

if __name__ == "__main__":
    # Specify the input image path
    input_path = "page_1.png"
    
    # Specify the output image path for the cropped region
    output_path = "cropped_region.png"
    
    # Define the start and end y-coordinates for the rectangle
    start_y = 800  # Adjust this value based on where you want the rectangle to start
    end_y = 1300   # Adjust this value based on where you want the rectangle to end
    
    # Call the function to crop and save the specified region
    crop_and_save_region(input_path, output_path, start_y, end_y)


In [None]:
%%time
import os
from PIL import Image

def crop_and_save_region(image_path, output_path, start_y, end_y):
    # Open the input image
    img = Image.open(image_path)
    
    # Get image width and height
    width, height = img.size
    
    # Define the coordinates for the rectangle
    left = 0
    upper = start_y  # Starting y-coordinate for the rectangle
    right = width
    lower = end_y    # Ending y-coordinate for the rectangle
    
    # Crop the region defined by the rectangle coordinates
    cropped_img = img.crop((left, upper, right, lower))
    
    # Save the cropped region as PNG
    cropped_img.save(output_path)

def process_images_in_folder(input_folder, output_folder, start_y, end_y):
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Recursively process each file and folder in the input folder
    for root, dirs, files in os.walk(input_folder):
        for file in files:
            if file.endswith((".jpg", ".jpeg", ".png", ".gif")):  # Process image files
                input_file_path = os.path.join(root, file)
                
                # Determine output subfolder path within croppedimages folder
                relative_path = os.path.relpath(root, input_folder)
                output_subfolder = os.path.join(output_folder, relative_path)
                os.makedirs(output_subfolder, exist_ok=True)
                
                # Define output file path
                output_file_path = os.path.join(output_subfolder, file)
                
                # Crop and save the image
                crop_and_save_region(input_file_path, output_file_path, start_y, end_y)

if __name__ == "__main__":
    # Specify the input folder containing subfolders with images
    input_folder = "data"
    
    # Specify the output folder for cropped images
    output_folder = "croppedimages"
    
    # Define the start and end y-coordinates for the rectangle
    start_y = 800  # Adjust this value based on where you want the rectangle to start
    end_y = 1300   # Adjust this value based on where you want the rectangle to end
    
    # Process images in the input folder and save cropped images to output folder
    process_images_in_folder(input_folder, output_folder, start_y, end_y)


In [None]:
import os
from PIL import Image

def crop_and_save_region(image_path, output_folder, start_y, end_y, filename_prefix):
    # Open the input image
    img = Image.open(image_path)
    
    # Get image width and height
    width, height = img.size
    
    # Define the coordinates for the rectangle
    left = 0
    upper = start_y  # Starting y-coordinate for the rectangle
    right = width
    lower = end_y    # Ending y-coordinate for the rectangle
    
    # Crop the region defined by the rectangle coordinates
    cropped_img = img.crop((left, upper, right, lower))
    
    # Determine output filename
    _, filename = os.path.split(image_path)
    output_filename = f"{filename_prefix}_{filename}"
    
    # Save the cropped image to the output folder
    output_path = os.path.join(output_folder, output_filename)
    cropped_img.save(output_path)

def process_images_in_folder(input_folder, output_folder, start_y, end_y):
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Iterate over each file in the input folder
    for root, _, files in os.walk(input_folder):
        for file in files:
            if file.endswith((".jpg", ".jpeg", ".png", ".gif")):  # Process image files
                input_file_path = os.path.join(root, file)
                
                # Get the parent folder name as filename prefix
                parent_folder = os.path.basename(root)
                
                # Define a unique filename prefix based on the parent folder name
                filename_prefix = parent_folder.lower().replace(" ", "")  # Convert to lowercase and remove spaces
                
                # Crop and save the image with the unique filename
                crop_and_save_region(input_file_path, output_folder, start_y, end_y, filename_prefix)

if __name__ == "__main__":
    # Specify the input folder containing subfolders with images
    input_folder = "error"
    
    # Specify the output folder for cropped images
    output_folder = "croppedimages"
    
    # Define the start and end y-coordinates for the rectangle
    start_y = 800  # Adjust this value based on where you want the rectangle to start
    end_y = 1730 # Adjust this value based on where you want the rectangle to end
    
    # Process images in the input folder and save cropped images to output folder
    process_images_in_folder(input_folder, output_folder, start_y, end_y)


In [None]:
!pip install ocr-tamil

In [None]:
from ocr_tamil.ocr import OCR

# Specify the path to your image file
image_path = r"ac223001_page_1.png"  # Insert your own image path here

# Initialize the OCR object
ocr = OCR(detect=True)  # Assuming 'detect' is a parameter for text detection

# Use OCR to predict text from the image
texts = ocr.predict(image_path)

# Check if any text was detected and print the results
if texts:
    # Assuming text_list is the variable name, using 'texts' instead
    print("Extracted Text:")
    for text in texts:
        print(text)
else:
    print("No text detected.")


In [70]:
from ocr_tamil.ocr import OCR

# Define a list of image paths
image_paths = [r"okok.png"]  # Insert your own image paths here

# Initialize the OCR object with text detection enabled
ocr = OCR(detect=True)

# Use OCR to predict text from each image in the list
text_list = ocr.predict(image_paths)

# Iterate through the text_list and print the extracted text from each image
for text in text_list:
    print(" ".join(text))


எண் மற்றும் பெயர் முக்கிய நகரம்|கிராமம் : கீழப்பாவூர்% அஞ்சல் அலுவலகம் : கீழப்பாவூர் காவல் நிலையம் : பாவூர்சத்திரம் பஞ்சாயத்து வட்டம் ஆலங்குளம் கோட்டம் தென்காசி மாவட்டம் : தென்காசி அஞ்சல் குறியீட்டு எண் : 627806


In [69]:
from ocr_tamil.ocr import OCR

# Define a list of image paths
image_paths = [r"okok2.png"]  # Insert your own image paths here

# Initialize the OCR object with text detection enabled
ocr = OCR(detect=True)

# Use OCR to predict text from each image in the list
text_list = ocr.predict(image_paths)

# Iterate through the text_list and print the extracted text from each image
for text in text_list:
    print(" ".join(text))


OT மற்றும் பெயர் முக்கிய நகரம்|கிராமம் : கீழப்பாவூர்% அஞ்சல் அலுவலகம் : கீழப்பாவூர் காவல் நிலையம் ஃபாவூர்சத்திரம் பஞ்சாயத்து வட்டம் ஆலங்குளம் கோட்டம் தென்காசி மாவட்டம் தென்காசி அஞ்சல் குறியீட்டு எண் : 627806


In [62]:
from ocr_tamil.ocr import OCR

# Define a list of image paths
image_paths = [r"okok2.png"]  # Insert your own image paths here

# Initialize the OCR object with text detection enabled
ocr = OCR(detect=True)

# Use OCR to predict text from each image in the list
text_list = ocr.predict(image_paths)

# Iterate through the text_list and print the extracted text from each image
for text in text_list:
    # print(" ".join(text))
    data=" ".join(text)


data


'OT மற்றும் பெயர் முக்கிய நகரம்|கிராமம் : கீழப்பாவூர்% அஞ்சல் அலுவலகம் : கீழப்பாவூர் காவல் நிலையம் ஃபாவூர்சத்திரம் பஞ்சாயத்து வட்டம் ஆலங்குளம் கோட்டம் தென்காசி மாவட்டம் தென்காசி அஞ்சல் குறியீட்டு எண் : 627806'

In [60]:
type(data)

str

In [76]:
import re

def extract_first_word_after_keyphrase(text):
    # Define the keyphrase pattern
    pattern = r"முக்கிய நகரம்\|கிராமம்\s*:?\s*(\S+)"
    
    # Search for the pattern
    match = re.search(pattern, text)
    
    # If a match is found, return the first captured group, which is the word after the keyphrase
    if match:
        return match.group(1)
    else:
        return "No match found"

from ocr_tamil.ocr import OCR

# Define a list of image paths
image_paths = [r"okok3.png"]  # Insert your own image paths here

# Initialize the OCR object with text detection enabled
ocr = OCR(detect=True)

# Use OCR to predict text from each image in the list
text_list = ocr.predict(image_paths)

# Iterate through the text_list and print the extracted text from each image
for text in text_list:
    # print(" ".join(text))
    data=" ".join(text)




print(extract_first_word_after_keyphrase(data))


அரசன்குளம்


In [2]:
import re

def extract_first_word_after_postoffice(text):
    # Define the keyphrase pattern to capture the word after "அஞ்சல் அலுவலகம்" or "அஞ்சல் அலுவலகம் :"
    pattern = r"அஞ்சல் அலுவலகம்\s*:?\s*(\S+)"
    
    # Search for the pattern
    match = re.search(pattern, text)
    
    # If a match is found, return the first captured group, which is the word after the keyphrase
    if match:
        return match.group(1)
    else:
        return "No match found"

# Example usage:
text_example = 'OT மற்றும் பெயர் முக்கிய நகரம்|கிராமம் : கீழப்பாவூர்% அஞ்சல் அலுவலகம் : கீழப்பாவூர் காவல் நிலையம் ஃபாவூர்சத்திரம் பஞ்சாயத்து வட்டம் ஆலங்குளம் கோட்டம் தென்காசி மாவட்டம் தென்காசி அஞ்சல் குறியீட்டு எண் : 627806'

print(extract_first_word_after_postoffice(text_example))


கீழப்பாவூர்


In [77]:
import os
import csv
import re
from ocr_tamil.ocr import OCR

def extract_first_word_after_keyphrase(text):
    # Define the keyphrase pattern
    pattern = r"முக்கிய நகரம்\|கிராமம்\s*:?\s*(\S+)"
    
    # Search for the pattern
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return "No match found"

# Initialize OCR object
ocr = OCR(detect=True)

# Define the directory containing images
image_dir = r"C:\Users\haric\OneDrive\Desktop\right_half"

# Prepare to save results in a CSV file
output_file = r"ocr_results.csv"
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Image Name', 'Extracted Text'])

    # Process each image in the directory
    for image_name in os.listdir(image_dir):
        image_path = os.path.join(image_dir, image_name)
        if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):  # check file extension
            # Predict text from the image
            text_list = ocr.predict([image_path])
            for text in text_list:
                extracted_text = " ".join(text)
                first_word = extract_first_word_after_keyphrase(extracted_text)
                writer.writerow([image_name, first_word])

print("OCR processing and CSV creation completed.")


OCR processing and CSV creation completed.


In [78]:
%%time
import os
import re
from ocr_tamil.ocr import OCR

def extract_first_word_after_keyphrase(text):
    # Define the keyphrase pattern
    pattern = r"முக்கிய நகரம்\|கிராமம்\s*:?\s*(\S+)"
    
    # Search for the pattern
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return "No match found"

# Initialize OCR object
ocr = OCR(detect=True)

# Directory containing images
image_dir = r"C:\Users\haric\OneDrive\Desktop\right_half"

# Path to save the results in a text file
output_file = r"ocr_results.txt"
with open(output_file, mode='w', encoding='utf-8') as file:
    # Process each image in the directory
    for image_name in os.listdir(image_dir):
        image_path = os.path.join(image_dir, image_name)
        if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):  # check file extension
            # Predict text from the image
            text_list = ocr.predict([image_path])
            for text in text_list:
                extracted_text = " ".join(text)
                first_word = extract_first_word_after_keyphrase(extracted_text)
                # Write to file
                file.write(f"{image_name}: {first_word}\n")

print("OCR processing and text file creation completed.")


OCR processing and text file creation completed.
CPU times: total: 2h 47min 30s
Wall time: 34min 32s


In [3]:
%%time
import os
import re
from ocr_tamil.ocr import OCR

def extract_first_word_after_postoffice(text):
    # Define the keyphrase pattern to capture the word after "அஞ்சல் அலுவலகம்" or "அஞ்சல் அலுவலகம் :"
    pattern = r"அஞ்சல் அலுவலகம்\s*:?\s*(\S+)"
    
    # Search for the pattern
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return "No match found"

# Initialize the OCR object
ocr = OCR(detect=True)

# Define the directory containing the images
image_dir = r"C:\Users\haric\OneDrive\Desktop\right_half"

# Define the output text file
output_file = r"C:\Users\haric\OneDrive\Desktop\postoffice_results.txt"

# Open the output file for writing
with open(output_file, 'w', encoding='utf-8') as file:
    # Process each image in the directory
    for image_name in os.listdir(image_dir):
        image_path = os.path.join(image_dir, image_name)
        if image_path.lower().endswith(('.png', '.jpg', '.jpeg')):
            # Predict text from the image using OCR
            text_list = ocr.predict([image_path])
            for text in text_list:
                data = " ".join(text)
                extracted_word = extract_first_word_after_postoffice(data)
                # Write the image name and extracted word to the file
                file.write(f"{image_name}: {extracted_word}\n")

print("OCR processing and text file creation completed.")


OCR processing and text file creation completed.
CPU times: total: 2h 35min 48s
Wall time: 33min 9s


In [6]:
import csv

# Read boothmapping data from ocr_results.txt and create a mapping dictionary
mapping = {}
with open("ocr_results.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(": ")
        mapping[parts[0]] = parts[1]

# Read data.csv and map boothnumbers to their corresponding Tamil words
mapped_booth_names = []
with open("data.csv", "r", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        booth_number = row["Bootnumber"]
        mapped_booth_name = mapping.get(booth_number + ".png", "None")
        mapped_booth_names.append(mapped_booth_name)

# Write mapped booth names to a new text file
with open("mapped_booth_names.txt", "w", encoding="utf-8") as f:
    for booth_name in mapped_booth_names:
        f.write(booth_name + "\n")


In [1]:
import csv

# Read boothmapping data from ocr_results.txt and create a mapping dictionary
mapping = {}
with open("ocr_results.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(": ")
        mapping[parts[0]] = parts[1]

# Read data.csv and map boothnumbers to their corresponding Tamil words
mapped_booth_names = []
with open("data.csv", "r", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        booth_number = row["Bootnumber"]
        mapped_booth_name = mapping.get(booth_number + ".png", "None")
        mapped_booth_names.append([mapped_booth_name])

# Write mapped booth names to a new CSV file
with open("mapped_booth_names.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerows(mapped_booth_names)


In [5]:
from collections import defaultdict

# Path to your data file
data_file = 'postoffice_results.txt'

# Dictionary to store PNG files grouped by label
label_to_pngs = defaultdict(list)

# Read the data file line by line
with open(data_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:
            parts = line.split(':')
            if len(parts) == 2:
                png_file = parts[0].strip()
                label = parts[1].strip()
                if label != 'None':  # Ignore entries with 'None' as the label
                    label_to_pngs[label].append(png_file)

# Print unique labels and corresponding PNG file names
for label, pngs in label_to_pngs.items():
    unique_pngs = list(set(pngs))  # Get unique PNG file names
    print(f"Label: {label}")
    for png_file in unique_pngs:
        print(f"- {png_file}")
    print()  # Print an empty line for separation


Label: கீழப்பாவூர்
- ac223009_page_1.png
- ac223046_page_1.png
- ac223039_page_1.png
- ac223021_page_1.png
- ac223023_page_1.png
- ac223005_page_1.png
- ac223036_page_1.png
- ac223028_page_1.png
- ac223037_page_1.png
- ac223044_page_1.png
- ac223052_page_1.png
- ac223030_page_1.png
- ac223024_page_1.png
- ac223031_page_1.png
- ac223049_page_1.png
- ac223008_page_1.png
- ac223050_page_1.png
- ac223001_page_1.png
- ac223006_page_1.png
- ac223032_page_1.png
- ac223043_page_1.png
- ac223027_page_1.png
- ac223033_page_1.png
- ac223035_page_1.png
- ac223029_page_1.png
- ac223048_page_1.png
- ac223019_page_1.png
- ac223015_page_1.png
- ac223010_page_1.png
- ac223014_page_1.png
- ac223026_page_1.png
- ac223012_page_1.png
- ac223016_page_1.png
- ac223018_page_1.png
- ac223045_page_1.png
- ac223051_page_1.png
- ac223047_page_1.png
- ac223011_page_1.png
- ac223013_page_1.png
- ac223022_page_1.png
- ac223053_page_1.png
- ac223002_page_1.png
- ac223017_page_1.png
- ac223004_page_1.png
- ac223003_pa

In [8]:
from collections import defaultdict
import re

# Path to your data file
data_file = 'ocr_results.txt'

# Dictionary to store PNG files grouped by label
label_to_pngs = defaultdict(list)

# Read the data file line by line
with open(data_file, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:
            parts = line.split(':')
            if len(parts) == 2:
                png_file = parts[0].strip()
                label = parts[1].strip()
                if label != 'No match found':  # Adjust based on what non-matching entries might be labeled as
                    label_to_pngs[label].append(png_file)

# Function to sort filenames numerically
def numerical_sort_key(filename):
    numbers = re.findall(r'\d+', filename)
    return [int(num) for num in numbers] if numbers else [filename]

# Print unique labels and corresponding PNG file names in ascending order
for label, pngs in label_to_pngs.items():
    unique_pngs = sorted(list(set(pngs)), key=numerical_sort_key)  # Sort PNG files numerically
    print(f"Label: {label}")
    for png_file in unique_pngs:
        print(f"- {png_file}")
    print()  # Print an empty line for separation


Label: கீழப்பாவூர்-1
- ac223001.png
- ac223002.png
- ac223003.png
- ac223004.png
- ac223005.png
- ac223006.png
- ac223008.png
- ac223010.png
- ac223011.png
- ac223012.png
- ac223013.png
- ac223014.png
- ac223015.png
- ac223016.png
- ac223017.png
- ac223020.png
- ac223021.png

Label: None
- ac223007.png
- ac223009.png
- ac223104.png
- ac223298.png
- ac223299.png
- ac223300.png
- ac223301.png
- ac223302.png
- ac223303.png
- ac223304.png
- ac223305.png
- ac223306.png
- ac223307.png
- ac223308.png
- ac223309.png
- ac223310.png

Label: கீழப்பாவூர்-2
- ac223018.png
- ac223019.png
- ac223022.png
- ac223023.png

Label: பெத்தநாடார்பட்டி-2
- ac223024.png
- ac223025.png

Label: பெத்தநாடார்பட்டி-1
- ac223026.png
- ac223027.png
- ac223028.png
- ac223029.png
- ac223030.png
- ac223031.png
- ac223032.png
- ac223033.png
- ac223050.png

Label: பூலாங்குளம்
- ac223034.png
- ac223035.png
- ac223036.png
- ac223037.png
- ac223038.png
- ac223039.png

Label: ஆண்டிபட்டி
- ac223040.png
- ac223041.png
- ac223042.

In [None]:
# import re
# from ocr_tamil.ocr import OCR

# # Define a list of image paths
# image_paths = [r"data.png"]  # Insert your own image paths here

# # Initialize the OCR object with text detection enabled
# ocr = OCR(detect=True)

# # Use OCR to predict text from each image in the list
# text_list = ocr.predict(image_paths)

# # Iterate through the text_list and print the extracted text from each image
# for text in text_list:
#     text = " ".join(text)
#     # Define the pattern to search for
#     pattern = r'முக்கிய நகரம்\|கிராமம்(?:\s*:\s*)?(.*)'
#     # Search for the pattern in the text
#     match = re.search(pattern, text)
#     if match:
#         # Extract the text following the pattern
#         extracted_text = match.group(1)
#         print(extracted_text)
#     else:
#         print("Pattern not found in the text.")


In [None]:
# import re
# from ocr_tamil.ocr import OCR

# # Define a list of image paths
# image_paths = [r"data.png"]  # Insert your own image paths here

# # Initialize the OCR object with text detection enabled
# ocr = OCR(detect=True)

# # Use OCR to predict text from each image in the list
# text_list = ocr.predict(image_paths)


# # Iterate through the text_list and print the extracted text from each image
# for text in text_list:
#     text = " ".join(text)
#     print(text)
#     # Define the pattern to search for
#     pattern = r'முக்கிய நகரம்\|கிராமம்(?:\s*:\s*)?(.*)'
#     # Search for the pattern in the text
#     match = re.search(pattern, text)
#     if match:
#         # Extract the text following the pattern
#         extracted_text = match.group(1)
#         # Split the extracted text into sentences
#         sentences = re.split(r'[.!?]', extracted_text)
#         # Extract the first sentence and remove leading/trailing spaces
#         first_sentence = sentences[0].strip()
#         print(first_sentence)
#     else:
#         print("Pattern not found in the text.")


In [None]:
# from ocr_tamil.ocr import OCR

# def extract_info(text):
#     important_place_index = text.find("முக்கிய நகரம்|கிராமம் :")
#     post_office_index = text.find("அஞ்சல் அலுவலகம் :")
    
#     important_place = ""
#     post_office = ""
    
#     if important_place_index != -1:
#         important_place = text[important_place_index + len("முக்கிய நகரம்|கிராமம் :"):post_office_index].strip()
#     if post_office_index != -1:
#         post_office = text[post_office_index + len("அஞ்சல் அலுவலகம் :"):].strip()
    
#     return important_place, post_office

# # Define a list of image paths
# image_paths = [r"okok2.png"]  # Insert your own image paths here

# # Initialize the OCR object with text detection enabled
# ocr = OCR(detect=True)

# # Use OCR to predict text from each image in the list
# text_list = ocr.predict(image_paths)

# # Iterate through the text_list and print the extracted text from each image
# for text in text_list:
#     important_place, post_office = extract_info(" ".join(text))
#     print("Important Place:", important_place)
#     print("Post Office:", post_office)


In [None]:
from ocr_tamil.ocr import OCR

def extract_info(text):
    important_place_index = text.find("முக்கிய நகரம்|கிராமம் :")
    post_office_index = text.find("அஞ்சல் அலுவலகம் :")
    
    important_place = ""
    post_office = ""
    
    if important_place_index != -1:
        important_place = text[important_place_index + len("முக்கிய நகரம்|கிராமம் :"):post_office_index].strip()
    if post_office_index != -1:
        post_office_text = text[post_office_index + len("அஞ்சல் அலுவலகம் :"):].strip()
        # Split the post_office_text based on spaces and consider only the first part
        post_office_parts = post_office_text.split()
        post_office = post_office_parts[0] if post_office_parts else ""
    
    return important_place, post_office

# Define a list of image paths
image_paths = [r"okok.png"]  # Insert your own image paths here

# Initialize the OCR object with text detection enabled
ocr = OCR(detect=True)

# Use OCR to predict text from each image in the list
text_list = ocr.predict(image_paths)

# Iterate through the text_list and print the extracted text from each image
for text in text_list:
    important_place, post_office = extract_info(" ".join(text))
    print("Important Place:", important_place)
    print("Post Office:", post_office)


In [3]:
from ocr_tamil.ocr import OCR

# Define a list of image paths
image_paths = [r"data.png"]  # Insert your own image paths here

# Initialize the OCR object with text detection enabled
ocr = OCR(detect=True)

# Use OCR to predict text from each image in the list
text_list = ocr.predict(image_paths)

# Iterate through the text_list and print the extracted text from each image
for text in text_list:
    print(" ".join(text))


1கீழப்பாவூர்  மேலூர் மேலத் தெரு -கீழப்பாவூர்  மேலூர் வடக்குத் தெரு 3கீழப்பாவூர் (Cu), மேலூர் தெற்குத் தெரு: 4கீழப்பாவூர் (Cu), மேலூர் ரோடு 5கீழப்பாவூர்  குருக்கள் மடம், 6கீழப்பாவூர் (Cu), மேட்டுத் தெரு வடக்கு; 7கீழப்பாவூர்  மேட்டுத் தெரு தெற்கு 99-அயல்நாடு வாழ் வாக்காளர்கள்


In [4]:
from ocr_tamil.ocr import OCR

# Define a list of image paths
image_paths = [r"data2.png"]  # Insert your own image paths here

# Initialize the OCR object with text detection enabled
ocr = OCR(detect=True)

# Use OCR to predict text from each image in the list
text_list = ocr.predict(image_paths)

# Iterate through the text_list and print the extracted text from each image
for text in text_list:
    print(" ".join(text))


1கீழப்பாவூர்  மேலூர் மேலத் தெரு: _கீழப்பாவூர் பே} மேலூர் வடக்குத் தெரு 3கீழப்பாவூர் பே} மேலூர் தெற்குத் தெரு: 4கீழப்பாவூர் (Cu), மேலூர் ரோடு, 5கீழப்பாவூர் பே} குருக்கள் மடம் 6கீழப்பாவூர் பே} மேட்டுத் தெரு வடக்கு; 7கீழப்பாவூர் (Cu), மேட்டுத் தெரு தெற்கு 99_அயல்நாடு வாழ் வாக்காளர்கள்


In [1]:
from ocr_tamil.ocr import OCR
import os

# Define a list of image paths
image_dir = r"C:\Users\haric\OneDrive\Desktop\left_half"  # Directory containing images
image_files = os.listdir(image_dir)
image_paths = [os.path.join(image_dir, filename) for filename in image_files]

# Initialize the OCR object with text detection enabled
ocr = OCR(detect=True)

# Use OCR to predict text from each image in the list
text_list = ocr.predict(image_paths)

# Prepare output file path
output_file = "ocr_results.txt"

# Write results to a text file
with open(output_file, "w", encoding="utf-8") as f:
    # Iterate through the text_list and write extracted text along with image names
    for image_path, text in zip(image_paths, text_list):
        image_name = os.path.basename(image_path)
        print(image_name)
        extracted_text = " ".join(text)
        f.write(f"{image_name}\t{extracted_text}\n")

print(f"OCR results saved to {output_file}")


ac223001_page_1.png
ac223002_page_1.png
ac223003_page_1.png
ac223004_page_1.png
ac223005_page_1.png
ac223006_page_1.png
ac223007_page_1.png
ac223008_page_1.png
ac223009_page_1.png
ac223010_page_1.png
ac223011_page_1.png
ac223012_page_1.png
ac223013_page_1.png
ac223014_page_1.png
ac223015_page_1.png
ac223016_page_1.png
ac223017_page_1.png
ac223018_page_1.png
ac223019_page_1.png
ac223020_page_1.png
ac223021_page_1.png
ac223022_page_1.png
ac223023_page_1.png
ac223024_page_1.png
ac223025_page_1.png
ac223026_page_1.png
ac223027_page_1.png
ac223028_page_1.png
ac223029_page_1.png
ac223030_page_1.png
ac223031_page_1.png
ac223032_page_1.png
ac223033_page_1.png
ac223034_page_1.png
ac223035_page_1.png
ac223036_page_1.png
ac223037_page_1.png
ac223038_page_1.png
ac223039_page_1.png
ac223040_page_1.png
ac223041_page_1.png
ac223042_page_1.png
ac223043_page_1.png
ac223044_page_1.png
ac223045_page_1.png
ac223046_page_1.png
ac223047_page_1.png
ac223048_page_1.png
ac223049_page_1.png
ac223050_page_1.png


In [None]:
# from PIL import Image
# import os

# # Path to the directory containing PNG images
# input_folder = r'C:\Users\haric\OneDrive\Desktop\croppedimages'

# # Create output folders for left and right halves
# left_output_folder = os.path.join(input_folder, 'left_half')
# right_output_folder = os.path.join(input_folder, 'right_half')

# # Ensure output folders exist, if not create them
# os.makedirs(left_output_folder, exist_ok=True)
# os.makedirs(right_output_folder, exist_ok=True)

# # Loop through each file in the input folder
# for filename in os.listdir(input_folder):
#     if filename.endswith(".png"):  # Process only PNG files
#         input_path = os.path.join(input_folder, filename)
        
#         # Open the image
#         img = Image.open(input_path)
#         width, height = img.size
        
#         # Calculate the dimensions for left and right halves
#         half_width = width // 2
        
#         # Crop and save the left half
#         left_half = img.crop((0, 0, half_width, height))
#         left_output_path = os.path.join(left_output_folder, filename)
#         left_half.save(left_output_path)
        
#         # Crop and save the right half
#         right_half = img.crop((half_width, 0, width, height))
#         right_output_path = os.path.join(right_output_folder, filename)
#         right_half.save(right_output_path)

#         # Close the image
#         img.close()

# print("Image cropping and saving complete.")


In [None]:
# from PIL import Image
# import os

# # Path to the directory containing PNG images
# input_folder = r'C:\Users\haric\OneDrive\Desktop\croppedimages'

# # Create output folders for left and right halves
# left_output_folder = os.path.join('left_half')
# right_output_folder = os.path.join('right_half')

# # Ensure output folders exist, if not create them
# os.makedirs(left_output_folder, exist_ok=True)
# os.makedirs(right_output_folder, exist_ok=True)

# # Fixed x-coordinate for the center point
# center_x = 730  # Adjust this value based on your desired center point

# # Loop through each file in the input folder
# for filename in os.listdir(input_folder):
#     if filename.endswith(".png"):  # Process only PNG files
#         input_path = os.path.join(input_folder, filename)
        
#         # Open the image
#         img = Image.open(input_path)
#         width, height = img.size
        
#         # Calculate the dimensions for left and right halves
#         half_width = width // 2
        
#         # Calculate the starting x-coordinate for the right half
#         if center_x <= half_width:
#             right_half_start = half_width + (center_x - half_width)
#         else:
#             right_half_start = center_x
        
#         # Crop and save the left half
#         left_half = img.crop((0, 0, center_x, height))
#         left_output_path = os.path.join(left_output_folder, filename)
#         left_half.save(left_output_path)
        
#         # Crop and save the right half
#         right_half = img.crop((right_half_start, 0, width, height))
#         right_output_path = os.path.join(right_output_folder, filename)
#         right_half.save(right_output_path)

#         # Close the image
#         img.close()

# print("Image cropping and saving complete.")


In [None]:
# import os
# import shutil

# # Source folder containing images
# source_folder = r'C:\Users\haric\OneDrive\Desktop\croppedimages'

# # Destination folder to copy images
# destination_folder = r'C:\Users\haric\OneDrive\Desktop\error'

# # List of specific filenames to copy
# filenames_to_copy = [
#     'ac223025_page_1.png', 'ac223026_page_1.png', 'ac223031_page_1.png', 'ac223035_page_1.png',
#     'ac223045_page_1.png', 'ac223053_page_1.png', 'ac223062_page_1.png', 'ac223075_page_1.png',
#     'ac223076_page_1.png', 'ac223077_page_1.png', 'ac223095_page_1.png', 'ac223097_page_1.png',
#     'ac223145_page_1.png', 'ac223146_page_1.png', 'ac223147_page_1.png', 'ac223148_page_1.png',
#     'ac223149_page_1.png', 'ac223150_page_1.png', 'ac223154_page_1.png', 'ac223159_page_1.png',
#     'ac223162_page_1.png', 'ac223165_page_1.png', 'ac223166_page_1.png', 'ac223192_page_1.png',
#     'ac223206_page_1.png', 'ac223217_page_1.png', 'ac223218_page_1.png', 'ac223229_page_1.png',
#     'ac223236_page_1.png', 'ac223245_page_1.png', 'ac223252_page_1.png', 'ac223255_page_1.png',
#     'ac223258_page_1.png', 'ac223260_page_1.png', 'ac223263_page_1.png', 'ac223265_page_1.png',
#     'ac223295_page_1.png', 'ac223297_page_1.png'
# ]

# # Iterate over each filename and copy corresponding images
# for filename in filenames_to_copy:
#     source_path = os.path.join(source_folder, filename)
#     if os.path.exists(source_path):
#         destination_path = os.path.join(destination_folder, filename)
#         shutil.copyfile(source_path, destination_path)
#         print(f"Copied: {filename}")
#     else:
#         print(f"File not found: {filename}")

# print("Copy process completed.")


%%time
from pdf2image import convert_from_path
import os

# Path to the folder containing PDF files
pdf_folder = "dataset"

# Output folder to save images
output_parent_folder = "data"

# Path to the Poppler executable
poppler_path = r"C:/poppler-24.02.0/Library/bin"


# Create the output parent folder if it doesn't exist
os.makedirs(output_parent_folder, exist_ok=True)

# Iterate over each PDF file in the folder
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        # Create a folder with the PDF file's name
        pdf_name = os.path.splitext(pdf_file)[0]
        output_folder = os.path.join(output_parent_folder, pdf_name)
        os.makedirs(output_folder, exist_ok=True)
        
        # Convert only the first page of the PDF to an image
        images = convert_from_path(
            os.path.join(pdf_folder, pdf_file),
            poppler_path=poppler_path,
            first_page=1, last_page=1  # Extract only the first page
        )
        
        # Save the image in the output folder
        images[0].save(os.path.join(output_folder, f"page_1.png"), "PNG", quality=100)

print("First pages saved successfully.")


hrsru;ii'o
pp[
    p[
        iit
    ]
]

In [1]:
%%time 

for i in range(1,8):
    print(i)


1
2
3
4
5
6
7
CPU times: total: 0 ns
Wall time: 0 ns
