In [10]:
import os
import re
import cv2
import pytesseract as tess
tess.pytesseract.tesseract_cmd = os.path.join(r'Tesseract-OCR\tesseract.exe')


In [22]:
def extract_text_from_image(image_path):
    # Load the image using OpenCV
    image = cv2.imread(image_path)
    
    # Display the original image
    #cv2.imshow('Original Image', image)
    
    # Convert the image to grayscale
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # Apply some preprocessing (optional, depends on the image)
    gray_image = cv2.medianBlur(gray_image, 3)  # Reduce noise
    _, binary_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # Display the processed image
    #cv2.imshow('Processed Image', binary_image)
    
    # Wait for a key press and close the image windows
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    
    # Use Tesseract to extract text
    text = tess.image_to_string(binary_image)
    
    return text



In [25]:
def extract_data_from_text(text):
    data = {
        'nombre': '',
        'domicilio': '',
        'clave_elector': '',
        'curp': '',
        'registro': '',
        'estado': '',
        'municipio': '',
        'seccion': '',
        'localidad': '',
        'emision': '',
        'vigencia': '',
        'nacimiento': '',
        'sexo': ''
    }
    
    # Define a dictionary of regex patterns
    patterns = {
        'domicilio': re.compile(r'DOMICILIO\s*(.*)'),
        'clave_elector': re.compile(r'CLAVE\s*DE\s*ELECTOR\s*(\S+)'),
        'curp': re.compile(r'CURP\s*(\S+)'),
        'registro': re.compile(r'REGISTRO\s*(\d{4})'),
        'estado': re.compile(r'ESTADO\s*(\d{2})'),
        'municipio': re.compile(r'MUNICIPIO\s*(\d{2})'),
        'seccion': re.compile(r'SECCION\s*(\d{4})'),
        'localidad': re.compile(r'LOCALIDAD\s*(.*)'),
        'emision': re.compile(r'EMISION\s*(\d{4})'),
        'vigencia': re.compile(r'VIGENCIA\s*(\d{4})'),
        'sexo': re.compile(r'SEXO\s*([HM])')
    }

    # Extract NOMBRE and FECHA DE NACIMIENTO
    name_birth_regex = re.compile(r'NOMBRE\s*([A-Z\s]+)\s*FECHA\s*DE\s*NACIMIENTO\s*(\d{2}/\d{2}/\d{4})', re.MULTILINE)
    name_birth_match = name_birth_regex.search(text)
    print(name_birth_match)
    if name_birth_match:
        data['nombre'] = name_birth_match.group(1).strip()
        data['nacimiento'] = name_birth_match.group(2).strip()

    # Search for the other patterns and extract the data using a loop
    for key, pattern in patterns.items():
        match = pattern.search(text)
        print(match)
        if match:
            data[key] = match.group(1).strip()

    return data

In [61]:
def structure_data(text):

    lines = text.split('\n')
    data = {}
    for i in range(len(lines)):
        data[i] = lines[i]

    #return data
    # Print the resulting dictionary
    for key, value in data.items():
        print(f"{key}: {value}")

    return data

In [291]:
class ExtractData:

    def __init__(self, img_path):
        self.img_path = img_path
        
        self.doc_text = ''
        self.text_struct = ''
        self.indx_clv_elec = -1

        self.indx_ready = set()

        self.msgs = []

        self.data_f = {
                'nombre': '',
                'domicilio': '',
                'clave_elector': '',
                'curp': '',
                'registro': '',
                'estado': '',
                'municipio': '',
                'seccion': '',
                'localidad': '',
                'emision': '',
                'vigencia': '',
                'nacimiento': '',
                'sexo': ''
            }
    
    #Extract text
    def extract_text_from_image(self):
        # Load the image using OpenCV
        image = cv2.imread(self.img_path)
        
        # Display the original image
        #cv2.imshow('Original Image', image)
        image = cv2.resize(image, (0,0), fx=1.8, fy=1.8)
        # Convert the image to grayscale
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        
        # Apply some preprocessing (optional, depends on the image)
        gray_image = cv2.medianBlur(gray_image, 3)  # Reduce noise

        #cv2.imshow('Gray Scale', gray_image)
        _, binary_image = cv2.threshold(gray_image, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        
        # Display the processed image
        cv2.imshow('Processed Image', binary_image)
        
        # Wait for a key press and close the image windows
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        
        # Use Tesseract to extract text
        text = tess.image_to_string(binary_image)
        #text = tess.image_to_string(binary_image, lang='spa', config=f'--oem 3 --psm 6  -c tessedit_char_whitelist="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789()/-.calmg* "')

        self.doc_text = text
        
        return True  
        
    def structure_data(self):

        lines = self.doc_text.split('\n')
        data = {}
        count = 0
        for i in range(len(lines)):

            if lines[i] != '':
                data[count] = lines[i]
                count += 1

        self.text_struct = data

        return True
    
    def extract_NOMBRE_NACIM_SEX(self):
        pattern = re.compile(r'\b\w*NOMBRE\w*\b', re.IGNORECASE)

        for key, line in self.text_struct.items():

            match = pattern.search(line)

            if match:
                self.indx_ready.add(key) #Index where the word 'NAME' is.

                line_data_1 = self.text_struct[key+1].split(' ')
                self.indx_ready.add(key + 1) #Index where the word First last name is.
                last_name_1 = line_data_1[0]

                #The born date is in the same line as the first last name
                self.data_f['nacimiento'] = line_data_1[1]

                line_data_2 = self.text_struct[key + 2].split(' ')
                self.indx_ready.add(key + 2) #Index where the word Second last name is.
                last_name_2 = line_data_2[0]

                #The sex is in the same line as the second last name
                if len(line_data_2) >= 3:
                    
                    self.data_f['sexo'] = line_data_2[2]
                
                raw_name = self.text_struct[key + 3].split(' ')
                self.indx_ready.add(key + 3) #Index where the word Name is.

                filter_name = [part for part in raw_name if len(part) > 1]
        
                # Join the filtered words into a single string
                name = ' '.join(filter_name)
                
                self.data_f['nombre'] = f'{name} {last_name_1} {last_name_2}'

    #Find Data   
    def extract_DOMICILIO(self):

        pattern = re.compile(r'\bDOMIC\w*\b', re.IGNORECASE)

        for key, value in self.text_struct.items():
            if pattern.search(value):

                indx_domici = key
                direc = ''
                for key_2, value in self.text_struct.items():
                    if indx_domici < key_2 < self.indx_clv_elec:
                        direc += ' ' + value

                self.data_f['domicilio'] = direc
                return True
            
        print('Domicilio not found')
        return False
    
    def extract_CLAVE_DE_ELECTOR(self):

        pattern = re.compile(r'\bCLAVE\s+DE\s+ELECTOR\s+(\S+)', re.IGNORECASE)

        for key, line in self.text_struct.items():

            match = pattern.search(line)

            # Check if the pattern matched
            if match:
                self.indx_ready.add(key + 2) #Index where the word Clave De ELECTOR is.

                extracted_value = match.group(1)  # Get the first capturing group
                self.data_f['clave_elector'] = extracted_value
                self.indx_clv_elec = key
                return True
            
        print('CLAVE DE ELECTOR not found')
        return False
    
    def extract_EDO_MUNP_SECC(self):

        # Pattern to match any word containing 'ESTADO'
        pattern_edo = re.compile(r'\b\w*ESTA\w*\b', re.IGNORECASE)

        # Iterate over the lines in self.text_struct
        for key, line in self.text_struct.items():
            
            # Search for the pattern in the line
            match = pattern_edo.search(line)
            
            # Print whether a match was found
            if match:
                self.indx_ready.add(key) #Index with the 3 values
                
                line_values = line.split(' ')
                self.data_f['estado'] = line_values[1]
                self.data_f['municipio'] = line_values[3]
                self.data_f['seccion'] = line_values[5]

    def extract_CURP_REGIS(self):

        pattern = re.compile(r'\b\w*CURP\w*\b', re.IGNORECASE)

        for key, line in self.text_struct.items():

            match = pattern.search(line)

            if match:
                self.indx_ready.add(key) #Index with the CURP
                line_data = self.text_struct[key].split(' ')

                self.data_f['curp'] = line_data[1]

                self.data_f['registro'] = line_data[-2]
                



                return True
        self.msgs.append({'detail':'CURP not found'})
        return False

    def extract_LOC_EMIS_VIGEN(self):

        # Pattern to match any word containing 'ESTADO'
        pattern_edo = re.compile(r'\b\w*LOCALIDAD\w*\b', re.IGNORECASE)

        # Iterate over the lines in self.text_struct
        for key, line in self.text_struct.items():
            
            # Search for the pattern in the line
            match = pattern_edo.search(line)
            
            # Print whether a match was found
            if match:
                self.indx_ready.add(key) #Index with the 3 values

                line_values = line.split(' ')
                try:
                    self.data_f['localidad'] = line_values[1]
                except:
                    self.msgs.append({'detail':'LOCALIDAD not found'})
                    
                try:
                    self.data_f['municipio'] = line_values[3]
                except:
                    self.msgs.append({'detail':'SECCION not found'})

                try:
                    self.data_f['seccion'] = line_values[5]
                except:
                    self.msgs.append({'detail':'SECCION not found'})

                return True
            
        self.msgs.append({'detail':'LOCALIDAD not found'})
        return False
    
    def start_finding(self):

        self.extract_NOMBRE_NACIM_SEX()
        self.extract_CLAVE_DE_ELECTOR()
        if self.indx_clv_elec != -1:
            self.extract_DOMICILIO()
        self.extract_EDO_MUNP_SECC()
        self.extract_CURP_REGIS()

    def execute(self):

        self.extract_text_from_image()
        self.structure_data()

        self.start_finding()

        


In [292]:
extract = ExtractData(img_path='image2.jpg')
extract.execute()
print(extract.msgs)
extract.data_f


[]


{'nombre': 'ALEJANDRA PATRICIA AVILES OLGUIN',
 'domicilio': ' CTO PUENTE DE ALCANTARA 9558 FRACC PRIVANZZA PUENTE DE CANTERA 3114 CHIHUAHUA, CHIH. ;',
 'clave_elector': 'AVOLAL01070619M30',
 'curp': 'AIOA010706MNLVLLAO',
 'registro': '2019',
 'estado': '08',
 'municipio': 'sECcION',
 'seccion': 'S',
 'localidad': '',
 'emision': '',
 'vigencia': '',
 'nacimiento': '06/07/2001',
 'sexo': ''}

In [293]:
extract.indx_ready

{6, 7, 8, 9, 15, 16}

In [295]:
extract.text_struct

{0: ': =',
 1: '3',
 2: '3S',
 3: 'S',
 4: 'LB',
 5: 'ca',
 6: 'NOMBRE FECHA DE NACIMIENTO',
 7: 'AVILES 06/07/2001',
 8: 'OLGUIN',
 9: 'ALEJANDRA PATRICIA',
 10: 'DOMICILIO',
 11: 'CTO PUENTE DE ALCANTARA 9558',
 12: 'FRACC PRIVANZZA PUENTE DE CANTERA 3114',
 13: 'CHIHUAHUA, CHIH. ;',
 14: 'CLAVE DE ELECTOR AVOLAL01070619M30 a',
 15: 'curp AIOA010706MNLVLLAO asloperecistro 2019 00°',
 16: 'estap0 08 muNcPIO.019 sECcION 0897 S',
 17: 'w oT,',
 18: 'Ley. ° :',
 19: 'ocauoap 0001 emision 2019s ROA , : me',
 20: ', ay',
 21: 'BS)',
 22: '~~',
 23: 'AE.',
 24: 'Sie',
 25: 'Pe'}

In [296]:
extract.doc_text.split('\n')

[': =',
 '3',
 '3S',
 'S',
 'LB',
 '',
 'ca',
 '',
 'NOMBRE FECHA DE NACIMIENTO',
 'AVILES 06/07/2001',
 'OLGUIN',
 '',
 'ALEJANDRA PATRICIA',
 '',
 'DOMICILIO',
 '',
 'CTO PUENTE DE ALCANTARA 9558',
 '',
 'FRACC PRIVANZZA PUENTE DE CANTERA 3114',
 'CHIHUAHUA, CHIH. ;',
 '',
 'CLAVE DE ELECTOR AVOLAL01070619M30 a',
 'curp AIOA010706MNLVLLAO asloperecistro 2019 00°',
 '',
 'estap0 08 muNcPIO.019 sECcION 0897 S',
 '',
 'w oT,',
 '',
 'Ley. ° :',
 '',
 'ocauoap 0001 emision 2019s ROA , : me',
 ', ay',
 '',
 'BS)',
 '',
 '~~',
 'AE.',
 'Sie',
 'Pe',
 '',
 '']