In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import cv2
import os
import re
import pytesseract
# Location of the Tesseract executable used by pytesseract.
# Read from the TESSERACT_CMD environment variable when it is set so the notebook
# can run on other machines; otherwise fall back to the original Windows install path.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'D:\Program Files\Tesseract-OCR\tesseract.exe'
)

In [2]:
# Noise Removal
def noise_removal(image):
    """Run the morphological clean-up chain followed by a median blur.

    The steps are, in order: dilate, erode, morphological close (all with the
    same 1x1 all-ones structuring element), then a 3x3 median blur.

    Args:
        image: Image array accepted by the OpenCV morphology functions.

    Returns:
        The filtered image.
    """
    # One shared 1x1 structuring element is used for every morphological step.
    unit_kernel = np.ones((1, 1), np.uint8)

    dilated = cv2.dilate(image, unit_kernel, iterations=1)
    eroded = cv2.erode(dilated, unit_kernel, iterations=1)
    closed = cv2.morphologyEx(eroded, cv2.MORPH_CLOSE, unit_kernel)

    # Aperture size 3 for the final median filter.
    return cv2.medianBlur(closed, 3)

# Removing Borders
def remove_borders(image):
    """Crop the image to the bounding box of its largest external contour.

    Args:
        image: Image array passed directly to ``cv2.findContours``
            (presumably single-channel 8-bit -- confirm against callers).

    Returns:
        The sub-image covering the bounding rectangle of the contour with the
        largest area, or ``image`` unchanged when no contour is found.
    """
    # findContours returns (contours, hierarchy) on OpenCV 4.x and
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list on both.
    contours = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Nothing to crop to (e.g. an all-zero image): return the input as-is
    # instead of failing with an IndexError.
    if len(contours) == 0:
        return image

    # max() over the reversed sequence keeps the previous tie-breaking rule
    # (the last contour among equal areas wins) without sorting everything.
    largest = max(reversed(contours), key=cv2.contourArea)

    x, y, w, h = cv2.boundingRect(largest)
    return image[y:y+h, x:x+w]

In [3]:
# Function to preprocess the image
def preprocess_image(image_path):
    """Load an image from disk and prepare it for OCR.

    Steps: read the file, invert it, convert to grayscale, apply a 5x5 median
    blur, run ``noise_removal`` and finally ``remove_borders``.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The processed image returned by ``remove_borders``.

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing file.
        ValueError: If the file exists but OpenCV could not decode it.
    """
    image = cv2.imread(image_path)

    # cv2.imread signals failure by returning None rather than raising, which
    # would otherwise surface later as an opaque OpenCV argument error.
    if image is None:
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path!r}")
        raise ValueError(f"Could not decode image file: {image_path!r}")

    image = cv2.bitwise_not(image)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.medianBlur(gray, 5)  # Apply some denoising
    no_noise = noise_removal(gray)
    no_borders = remove_borders(no_noise)
    return no_borders

# Function to extract text from image using Tesseract
def extract_text_from_image(image):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        image: Image object handed to ``pytesseract.image_to_string``.

    Returns:
        The value returned by ``pytesseract.image_to_string``.
    """
    # Command-line flags forwarded to Tesseract: OCR engine mode 3, page segmentation mode 6.
    tesseract_flags = r'--oem 3 --psm 6'
    return pytesseract.image_to_string(image, config=tesseract_flags)

# Function to clean the text aggressively
def clean_extracted_text(text):
    """Strip OCR noise from text using three regex substitutions in sequence.

    Args:
        text: Raw text to clean.

    Returns:
        The text after all substitutions have been applied. Leading and
        trailing whitespace is collapsed to a single space, not removed.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'[^\x00-\x7F]+', ' '),   # each run of non-ASCII characters -> one space
        (r'[^\w\s.,:/-]', ''),     # drop anything but word chars, whitespace and . , : / -
        (r'\s+', ' '),             # collapse each whitespace run to a single space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Main function to process each image and extract all details
def process_marksheet(image_path):
    """Run the full OCR pipeline on one marksheet image.

    The image is preprocessed, passed through Tesseract, and the resulting text
    is cleaned. Both the raw and the cleaned text are printed.

    Args:
        image_path: Path of the image file to process.

    Returns:
        The cleaned text, so callers can use the result instead of only
        seeing it printed.
    """
    # Step 1: Preprocess the image
    preprocessed_image = preprocess_image(image_path)

    # Step 2: Extract text from image
    extracted_text = extract_text_from_image(preprocessed_image)
    print(f"Extracted Text from {image_path}:\n", extracted_text)

    # Step 3: Clean the extracted text
    cleaned_text = clean_extracted_text(extracted_text)
    print(f"Cleaned Text from {image_path}:\n", cleaned_text)

    # Previously the result was discarded (implicit None return).
    return cleaned_text

In [4]:
# Example usage
if __name__ == "__main__":
    # Relative path of the marksheet scan to process; replace with the actual image path.
    image_path = 'images/11.jpg'
    
    # Run the whole pipeline on that image; process_marksheet prints the raw
    # and the cleaned OCR text.
    process_marksheet(image_path)

Extracted Text from images/11.jpg:
 Aree ' 273840 on 3 ae o =. am - tiird
Ps | 0943390 ff / SOO Rogn.No. A119/16397/0039 be)
Ae: . eYa'ot a * 1S,
(si wT E)
, CENTRAL BOARD OF SECONDARY EDUCATION i
e) MARKS STATEMENT CUM CERTIFICATE = )
| meats; farerera utter, 2019 &
4 ALLINDIA SECONDARY SCHOOL EXAMINATION, 2019 eA
¥, hs
> ae warftra feat rat % f% This is to certify that KUSHAL MITTAL =)
\ 7) SPRATS Roll No. 1131444 i
(—) A1aT &1 ATH Mother's Name DR SHOBHA MITTAL iE)
©) ftar/azee @ ava Father's / Guardian’s Name ANAND KUMAR MITTAL x
| gai faf& Date of Birth 04/08/2004 4TH AUGUST TWO THOUSAND FOUR ay
» farsa School 16397-BIRLA INT SCHOOL BANDER SINDRI AJMER 4
¥ S tatre weafaai Praga % has achieved Scholastic Achievements as under : ts]
4
[feet aoe | fara te »
5 ee SUBJECT fafera | me a | art (arett #) ‘| POSITIONAL] |
fae | CODE | THEORY | wT pe | TOTAL |TOTAL (IN WoRDS)| GRADE | jf)
5 j 101 ENGLISH COMM. 069 018 087 EIGHTY SEVEN iy
re ; 122 SANSKRIT 06s 020 08s EIGHTY FIVE :
“| 061 | 