### IMPORTING LIBRARIES

In [1]:
import pandas as pd
import pytesseract as pyt
from pdf2image import convert_from_path
import cv2 as cv
from pathlib import Path
from PIL import Image
import re

### PATHS

In [2]:
#current directory
# NOTE: Path.cwd() is the kernel's working directory, so every path below
# assumes the notebook is run from a folder that sits directly under the
# project root.
BASE_DIR = Path.cwd()

#go one level up
PROJECT_DIR = BASE_DIR.parent

#input dir
INPUT_DIR = PROJECT_DIR / 'data' / 'input'

#output dir
OUTPUT_DIR = PROJECT_DIR / 'data' / 'output'

#loading the tesseract.exe
# Default Windows install location. pytesseract's command is only overridden
# when that file is really there; otherwise the library default is kept, so
# the notebook still works on machines where tesseract is found via the PATH.
TESSERACT_EXE = Path(r"C:\Program Files\Tesseract-OCR\tesseract.exe")
if TESSERACT_EXE.exists():
    pyt.pytesseract.tesseract_cmd = str(TESSERACT_EXE)

### MAIN PROGRAM

In [3]:
# Location of the PDF that will be OCR'd (looked up inside the input folder)
pdf_file = INPUT_DIR.joinpath('input.pdf')

# Empty container for the page images of the PDF
image_file_list = list()

In [4]:
# Render the pages of the PDF as images, using a resolution of 500 DPI
pdf_pages = convert_from_path(pdf_file, dpi=500)

In [5]:
# Keep every rendered page for the OCR step. The list is rebuilt in one step
# (rather than appended to inside a loop) so that re-running this cell does
# not add the same pages a second time.
image_file_list = list(pdf_pages)

In [6]:
#loop the saved images in image_file_list
# OCR every page and keep the text of ALL pages, joined in page order.
# Assigning to `text` inside a plain loop would keep only the last page and
# would leave `text` undefined when the list is empty; joining gives the same
# result for a single page and an empty string for no pages.
page_texts = [str(pyt.image_to_string(image_file)) for image_file in image_file_list]
text = "\n".join(page_texts)

In [7]:
# Show the raw OCR output so it can be inspected before it is parsed
text

'Republic of the Philippines\nILOILO SCIENCE AND TECHNOLOGY UNIVERSITY\nBurgos St., La Paz, Iloilo City\n* E-mail address: registrar@isatu.edu.ph + Telephone No.: (033) 320-71-90 local 110\n\nOFFICIAL TRANSCRIPT OF RECORDS\n\nID No.: 2019-7667-A\n\nName: SILVA, KARL JADE CORTEZ | Date: JANUARY 12, 2024\nDate of Birth: AUGUST 31, 2000 wT Gender: MALE\n\nPlace of Birth: SARA, ILOILO\nAddress: SAN LUIS, SARA, ILOILO |\n\nOTHER PRELIMINARY RECORDS\nParent/Guardian: EUSEBIO SILVA JR. Address: SAN LUIS, SARA, ILOILO\nEntrance Cred.: SF10-SHS FROM SAN LUIS NATIONAL HIGH SCHOOL School Year: 2018-2019\nDate of Admission: MAY 20, 2019 Date of Graduation: JULY 5, 2023\nDegree/Curriculum: BACHELOR OF SCIENCE IN COMPUTER SCIENCE )\nMajor: NONE " Minor: NONE\nAACCUP Accreditation Level: LEVEL Ill RE-ACCREDITED\n\n'

In [44]:
#splitting the strings on \n and |
raw_parts = re.split(r'\n|\|', text)

#remove empty value
# Strip first and filter afterwards: a piece holding only whitespace (for
# example the blank left behind by a trailing "|") turns into '' once it is
# stripped, so filtering before stripping would let it through.
text_list = [part for part in (piece.strip() for piece in raw_parts) if part]
text_list

['Republic of the Philippines',
 'ILOILO SCIENCE AND TECHNOLOGY UNIVERSITY',
 'Burgos St., La Paz, Iloilo City',
 '* E-mail address: registrar@isatu.edu.ph + Telephone No.: (033) 320-71-90 local 110',
 'OFFICIAL TRANSCRIPT OF RECORDS',
 'ID No.: 2019-7667-A',
 'Name: SILVA, KARL JADE CORTEZ',
 'Date: JANUARY 12, 2024',
 'Date of Birth: AUGUST 31, 2000 wT Gender: MALE',
 'Place of Birth: SARA, ILOILO',
 'Address: SAN LUIS, SARA, ILOILO',
 'OTHER PRELIMINARY RECORDS',
 'Parent/Guardian: EUSEBIO SILVA JR. Address: SAN LUIS, SARA, ILOILO',
 'Entrance Cred.: SF10-SHS FROM SAN LUIS NATIONAL HIGH SCHOOL School Year: 2018-2019',
 'Date of Admission: MAY 20, 2019 Date of Graduation: JULY 5, 2023',
 'Degree/Curriculum: BACHELOR OF SCIENCE IN COMPUTER SCIENCE )',
 'Major: NONE " Minor: NONE',
 'AACCUP Accreditation Level: LEVEL Ill RE-ACCREDITED']

In [None]:
#extract school
# The school name is read from the second OCR line; fall back to None when
# the OCR output is too short instead of raising IndexError.
school = text_list[1] if len(text_list) > 1 else None

# Start every field as None so that a label missing from the OCR text gives
# an empty value rather than a NameError (or a stale value left in the kernel
# by an earlier run) when the DataFrame is built.
name = dob = pob = address = date_claimed = gender = None
parent = entrance_cred = dateof_addmission = degree = major = minor = None

#extract the remaining fields, one OCR line at a time
for txt in text_list:
    #name
    name_match = re.search(r'Name:\s*(.*)', txt)
    if name_match:
        name = name_match.group(1).replace(',', '')

    #date of birth
    #string = 'Date of Birth: AUGUST 31, 2000 wT Gender: MALE'
    # Stop at the OCR noise token or at the Gender label, whichever comes
    # first, so the gender cannot leak into the date when the noise is absent.
    dob_match = re.search(r"Date of Birth:\s*(.*?)(?: wT| Gender:|$)", txt)
    if dob_match:
        dob = dob_match.group(1).strip()

    #place of birth
    pob_match = re.search(r"Place of Birth:\s(.*)", txt)
    if pob_match:
        pob = pob_match.group(1)

    #address
    # Anchored at the start of the line so the "Address:" that follows the
    # parent/guardian on the same line is not picked up here.
    address_match = re.match(r"^Address:\s(.*)", txt)
    if address_match:
        address = address_match.group(1)

    #date of claim
    date_claimed_match = re.match(r"^Date:\s(.*)", txt)
    if date_claimed_match:
        date_claimed = date_claimed_match.group(1)

    #gender
    # Gender is only read from the line that carries the date of birth. When
    # the label is missing the field simply stays None (no AttributeError).
    gender_match = re.search(r"Date of Birth: .*?Gender: (.*)", txt)
    if gender_match:
        gender = gender_match.group(1)

    #parent
    parent_match = re.search(r"Parent/Guardian: (.*?)(?: Address|$)", txt)
    if parent_match:
        parent = parent_match.group(1)

    #entrance credentials
    # The dot in "Cred." is escaped so it only matches a literal period.
    entrance_cred_match = re.search(r"Entrance Cred\.: (.*?)(?: School|$)", txt)
    if entrance_cred_match:
        entrance_cred = entrance_cred_match.group(1)

    #date of admission
    dateof_add_match = re.search(r"Date of Admission: (.*?)(?: Date |$)", txt)
    if dateof_add_match:
        dateof_addmission = dateof_add_match.group(1)

    #degree
    degree_match = re.search(r"Degree/Curriculum: (.*)", txt)
    if degree_match:
        degree = degree_match.group(1).replace(')','').strip()

    #major
    # Capture everything up to the "Minor:" label (skipping any non-word OCR
    # noise such as a stray quote in between) or up to the end of the line.
    # Stopping at the first space would cut multi-word majors short.
    major_match = re.search(r"Major:\s*(.*?)(?:\s+\W*Minor:|$)", txt)
    if major_match:
        major = major_match.group(1)

    #minor
    minor_match = re.search(r"Minor: (.*)", txt)
    if minor_match:
        minor = minor_match.group(1)

In [None]:
#making dataframe
# Column label -> extracted value, listed in the order the columns appear
record = {
    'SCHOOL': school,
    'NAME': name,
    'DATE OF BIRTH': dob,
    'PLACE OF BIRTH': pob,
    'ADDRESS': address,
    'DATE CLAIMED': date_claimed,
    'GENDER': gender,
    'PARENT': parent,
    'ENTRANCE CREDENTIALS': entrance_cred,
    'DATE OF ADMISSION': dateof_addmission,
    'DEGREE': degree,
    'MAJOR': major,
    'MINOR': minor,
}

# One-row frame: each value is wrapped in a single-element list
df = pd.DataFrame({column: [value] for column, value in record.items()})

# Remove punctuation from every cell. Inside the character class `,-.` is a
# range, and that range covers exactly three characters: comma, hyphen, period.
df = df.replace(r"[,-.]", "", regex=True)