In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import cv2
import os
import re
import pytesseract
# Location of the Tesseract executable used by pytesseract. Set the
# TESSERACT_CMD environment variable to point at a different install; the
# path below is only the fallback default.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'D:\Program Files\Tesseract-OCR\tesseract.exe'
)

In [None]:
# Function to preprocess the image
def preprocess_image(image_path):
    """Load an image from disk and return a denoised grayscale copy for OCR.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The image converted from BGR to grayscale and passed through a
        median blur with kernel size 5.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports a missing or undecodable file by returning None
    # rather than raising; without this check cvtColor fails with an opaque
    # OpenCV assertion that never mentions the path.
    if image is None:
        raise FileNotFoundError(f"Could not read image file: {image_path!r}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.medianBlur(gray, 5)  # Apply some denoising

# Function to extract text from image using Tesseract
def extract_text_from_image(image):
    """Run Tesseract OCR on ``image`` and return the recognised text.

    Tesseract is invoked through ``pytesseract.image_to_string`` with the
    fixed options ``--oem 3 --psm 6``.
    """
    return pytesseract.image_to_string(image, config=r'--oem 3 --psm 6')

# Function to clean the text aggressively
def clean_extracted_text(text):
    """Strip OCR noise from ``text`` and collapse it onto a single line.

    Three substitutions run in order: every run of non-ASCII characters
    becomes one space; every character that is not a word character,
    whitespace or one of ``. , : / -`` is deleted; every run of whitespace
    (newlines included) becomes one space. Leading and trailing whitespace
    is collapsed but not removed.
    """
    substitutions = (
        (r'[^\x00-\x7F]+', ' '),  # non-ASCII runs -> one space
        (r'[^\w\s.,:/-]', ''),    # delete remaining special symbols
        (r'\s+', ' '),            # normalize whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Function to extract top part details (S.No., Regn.No., Board, etc.)
def extract_top_details(text):
    """Extract the header fields of a marksheet from OCR text.

    Args:
        text: OCR text to search (raw or already cleaned).

    Returns:
        dict with the keys ``'S.No.'``, ``'Regn.No.'``, ``'Board'`` and
        ``'Year'``. Each value is the matched string, or ``'N/A'`` when the
        field cannot be found.
    """
    def first_group(pattern):
        found = re.search(pattern, text)
        return found.group(1) if found else 'N/A'

    top_details = {}

    # Extract S.No. -- inner spacing, the trailing dot and a colon are all
    # optional so "S.No.", "S. No." and "S.No:" are accepted alike.
    top_details['S.No.'] = first_group(r'\bS\.?\s*No\.?\s*:?\s*(\d+)')

    # Extract Regn.No. -- the space between "Regn." and "No." is optional,
    # matching the "Regn.No." spelling as well as "Regn. No.".
    top_details['Regn.No.'] = first_group(r'Regn\.?\s*No\.?\s*:?\s*([\w/]+)')

    # Extract Board Information (any whitespace may separate the words)
    board_match = re.search(
        r'Central\s+Board\s+of\s+Secondary\s+Education', text, re.IGNORECASE
    )
    top_details['Board'] = "Central Board of Secondary Education" if board_match else 'N/A'

    # Extract Year: a stand-alone 19xx/20xx number. The digit lookarounds
    # stop the pattern from slicing four digits out of a longer number such
    # as the serial, registration or roll number.
    top_details['Year'] = first_group(r'(?<!\d)((?:19|20)\d{2})(?!\d)')

    return top_details

# Lookahead that ends a captured name at the next field label, at the first
# punctuation character, or at the end of the text. It is needed because
# whitespace-normalised OCR text is a single line, so a greedy [\w\s]+ would
# run straight into the following label (e.g. "JOHN DOE Roll No").
_NAME_END = r"(?=\b(?:Roll\s*No|Mother|Father|Guardian|School\s*:?\s*\d)|[^\w\s]|$)"


def _capture_name(label_pattern, text):
    """Return the words that follow ``label_pattern`` in ``text``, or 'N/A'."""
    found = re.search(label_pattern + r"\s*:?\s*([\w\s]+?)\s*" + _NAME_END, text)
    return found.group(1).strip() if found else 'N/A'


# Function to extract student and school details
def extract_student_details(text):
    """Extract the student and school fields of a marksheet from OCR text.

    Works on raw OCR text as well as on text whose apostrophes have been
    removed or turned into spaces by cleaning ("Mothers Name",
    "Mother s Name").

    Args:
        text: OCR text to search.

    Returns:
        dict with the keys ``'Name'``, ``'Roll No.'``, ``"Mother's Name"``,
        ``"Father's Name"``, ``'School Code'`` and ``'School Name'``. Each
        value is the matched string, or ``'N/A'`` when the field is missing.
    """
    student_data = {}

    # Extract Name
    student_data['Name'] = _capture_name(r'This is to certify that', text)

    # Extract Roll No.
    roll_no_match = re.search(r'Roll\s*No\.?\s*:?\s*(\d+)', text)
    student_data['Roll No.'] = roll_no_match.group(1) if roll_no_match else 'N/A'

    # Extract Mother's Name. \W? accepts an apostrophe, the space left by a
    # replaced typographic apostrophe, or nothing at all.
    student_data['Mother\'s Name'] = _capture_name(r"Mother\W?s\s+Name", text)

    # Extract Father's/Guardian's Name (the "/ Guardian's" part is optional)
    student_data['Father\'s Name'] = _capture_name(
        r"Father\W?s(?:\s*/\s*Guardian\W?s)?\s+Name", text
    )

    # Extract School (with Code)
    school_match = re.search(r'School\s*:?\s*(\d+)\s*([\w\s]+)', text)
    student_data['School Code'] = school_match.group(1) if school_match else 'N/A'
    student_data['School Name'] = school_match.group(2).strip() if school_match else 'N/A'

    return student_data

# Number words that may appear in the "total in words" column.
_NUMBER_WORDS = {
    'ZERO': 0, 'ONE': 1, 'TWO': 2, 'THREE': 3, 'FOUR': 4,
    'FIVE': 5, 'SIX': 6, 'SEVEN': 7, 'EIGHT': 8, 'NINE': 9,
    'TEN': 10, 'ELEVEN': 11, 'TWELVE': 12, 'THIRTEEN': 13,
    'FOURTEEN': 14, 'FIFTEEN': 15, 'SIXTEEN': 16, 'SEVENTEEN': 17,
    'EIGHTEEN': 18, 'NINETEEN': 19, 'TWENTY': 20, 'THIRTY': 30,
    'FORTY': 40, 'FIFTY': 50, 'SIXTY': 60, 'SEVENTY': 70,
    'EIGHTY': 80, 'NINETY': 90, 'HUNDRED': 100
}

# Alternation of the number words, longest first so that e.g. SIXTEEN is
# tried before SIX.
_NUMBER_WORD = '(?:' + '|'.join(sorted(_NUMBER_WORDS, key=len, reverse=True)) + ')'

# One subject row: code, name, two or more numeric columns, total in words.
_SUBJECT_ROW = re.compile(
    # 3-digit subject code that is not the tail of a longer token
    r'(?<![\w/.-])(\d{3})\s+'
    # subject name: lazy and digit-free, so it can never swallow the marks
    # columns or spill into the next row of single-line text
    r'([A-Za-z][A-Za-z\s.&/-]*?)\s+'
    # numeric mark columns (at least two)
    r'\d+(?:\s+\d+)+\s+'
    # the whole run of number words, or -- as a fallback -- one other word
    r'(' + _NUMBER_WORD + r'(?:[\s-]+' + _NUMBER_WORD + r')*\b|[A-Za-z]+)',
    re.IGNORECASE,
)


def _number_words_to_int(words):
    """Convert number words such as 'NINETY FIVE' or 'ONE HUNDRED' to an int."""
    total = 0
    for word in re.split(r'[\s-]+', words.upper()):
        value = _NUMBER_WORDS.get(word, 0)  # unrecognised words count as 0
        if value == 100:
            # HUNDRED multiplies what came before it: "ONE HUNDRED" is 100,
            # not 1 + 100. A bare "HUNDRED" is read as 100.
            total = max(total, 1) * 100
        else:
            total += value
    return total


# Function to extract subject details and convert total (in words) to integer
def extract_subject_details(text):
    """Extract one record per subject row found in OCR text.

    A row is a 3-digit subject code, the subject name, two or more numeric
    columns and the total written in words. Rows are recognised in
    single-line (whitespace-normalised) text as well as multi-line text,
    including a row that ends the text.

    Args:
        text: OCR text to search.

    Returns:
        list of dicts with the keys ``'Subject Code'`` (str),
        ``'Subject Name'`` (str) and ``'Total Marks'`` (int). The total is 0
        when the word after the numeric columns is not a number word. The
        list is empty when no row is found.
    """
    return [
        {
            'Subject Code': row.group(1),
            'Subject Name': row.group(2).strip(),
            'Total Marks': _number_words_to_int(row.group(3)),
        }
        for row in _SUBJECT_ROW.finditer(text)
    ]

# Function to extract result (PASS/FAIL) and result publication date
def extract_result_and_date(text):
    """Extract the overall result and the date the result was issued.

    Args:
        text: OCR text to search.

    Returns:
        dict with the keys ``'Result'`` (``'PASS'``, ``'FAIL'`` or ``'N/A'``)
        and ``'Result Date'`` (the date exactly as written in the text, or
        ``'N/A'``).
    """
    result_data = {}

    # Extract Result (PASS/FAIL). A colon or dash may separate the label
    # from the value, since text cleaning keeps both characters.
    result_match = re.search(r'Result\s*[:\-]?\s*(PASS|FAIL)', text, re.IGNORECASE)
    result_data['Result'] = result_match.group(1).upper() if result_match else 'N/A'

    # Extract Date of Result: dd-mm-yyyy, also accepting "/" or "." between
    # the parts and an optional colon or dash after "Dated".
    date_match = re.search(r'Dated\s*[:\-]?\s*(\d{2}[-/.]\d{2}[-/.]\d{4})', text)
    result_data['Result Date'] = date_match.group(1) if date_match else 'N/A'

    return result_data

# Main function to process each image and extract all details
def process_marksheet(image_path):
    """Run the whole OCR pipeline on one marksheet image.

    The image is preprocessed, OCR'd and cleaned, then each extractor is
    applied to the cleaned text. Every intermediate result is printed, and
    the extracted data is also returned so callers can use it rather than
    having to read it off the printed output.

    Args:
        image_path: Path of the marksheet image.

    Returns:
        dict with the keys ``'Top Details'``, ``'Student Details'``,
        ``'Subject Details'`` and ``'Result and Date'``, holding the return
        values of the corresponding extractor functions.
    """
    # Step 1: Preprocess the image
    preprocessed_image = preprocess_image(image_path)

    # Step 2: Extract text from image
    extracted_text = extract_text_from_image(preprocessed_image)
    print(f"Extracted Text from {image_path}:\n", extracted_text)

    # Step 3: Clean the extracted text
    cleaned_text = clean_extracted_text(extracted_text)
    print(f"Cleaned Text from {image_path}:\n", cleaned_text)

    # Step 4: Extract Top Details (S.No., Regn.No., Board, Year)
    top_details = extract_top_details(cleaned_text)
    print(f"Top Details: {top_details}")

    # Step 5: Extract Student Details (Name, Roll No., etc.)
    student_details = extract_student_details(cleaned_text)
    print(f"Student Details: {student_details}")

    # Step 6: Extract Subject Details (Subjects and Total Marks)
    subject_details = extract_subject_details(cleaned_text)
    print(f"Subject Details: {subject_details}")

    # Step 7: Extract Result and Result Publication Date
    result_data = extract_result_and_date(cleaned_text)
    print(f"Result and Date: {result_data}")

    # Previously the function fell off the end and returned None.
    return {
        'Top Details': top_details,
        'Student Details': student_details,
        'Subject Details': subject_details,
        'Result and Date': result_data,
    }

In [None]:
# Example usage
if __name__ == "__main__":
    # Path of the marksheet image to process, relative to the current
    # working directory. Replace it with the path of your own scan.
    image_path = '1.jpg'
    
    # Run the full pipeline on the image; process_marksheet prints the OCR
    # text and every group of extracted fields.
    process_marksheet(image_path)