In [1]:
import spacy

In [2]:
from pypdf import PdfReader
import pypdf
import logging

logging.getLogger("pypdf._reader").setLevel(logging.ERROR)

def convert_pdf_to_text(file_path: str) -> None:
    """
    This function converts PDF file to string. 
    Because email and name must appear on the first page so, return information only on the first page. 
    
    Args:
        file_path: path of pdf file
    """
    
    reader = PdfReader(file_path)

    page = reader.pages[0] # only on first page
    text = page.extract_text()
    
    text = text.replace("\n"," ")
    text = text.replace("\r", " ")
    text = " ".join(text.split(" "))
    return text 

In [3]:
def predict(model, file_path):
    text = convert_pdf_to_text(file_path)
    predictions = model(text) 
    predictions = predictions.to_json()['ents']

    for i in range(len(predictions)):
        result = predictions[i]
        start = result['start']
        end = result['end']
        label = result['label']
        text_result = text[start:end]

        if label == "Email":
            text_result = "".join(text_result.split(" "))

        print(f" - {label}: {text_result}")

In [4]:
import os

def predict_in_folder(model, folder_path):
    list_files = os.listdir(folder_path)

    for pdf_path in list_files:
        base_pdf_path = os.path.join(folder_path,pdf_path)
        print(f"At {pdf_path}:")
        predict(model, base_pdf_path)
    

In [5]:
model = spacy.load("output/model-best") #load the best model

In [6]:
predict(model,'resumes/android_01.pdf')

 - Name: Nguyen Thanh Xuan
 - Email: xuanusm@gmail.com


In [7]:
predict_in_folder(model, 'resumes')

At android_01.pdf:
 - Name: Nguyen Thanh Xuan
 - Email: xuanusm@gmail.com
At android_02.pdf:
 - Name: Hoàng Văn Dương
 - Email: duonghv@fsoft.com.vn
At android_03.pdf:
 - Email: kimtungcdt@gmail.com
 - Name: Le Kim Tung
At android_04.pdf:
 - Name: TRUONG THANH NGUYEN
At android_05.pdf:
 - Name: Toan Tran
 - Email: me@toan.mobi
At android_06.pdf:
 - Name: Nguyen Dang Phat
 - Email: phatnd89@gmail.com
At android_07.pdf:
 - Name: Tran  Chi Cao
 - Email: cao.tranchi@gmail.com
At android_09.pdf:
 - Name: DO HOANG YEN
 - Email: dohoangyen93@gmail.com
At android_10.pdf:
 - Name: HUỲNH THÁI TÂM
 - Email: tel.2406@gmail.com
At android_11.pdf:
 - Name: BUI DUNG ANH TUAN
 - Email: anhtuanbd84@gmail.com
At android_12.pdf:
 - Name: Hoàng Thanh Tùng
 - Email: tung.hoangthanh90@gmail.com
At android_13.pdf:
 - Name: HUỲNH VĂN TOÀN
 - Email: huynhvantoan.itc@gmail.com
At android_14.pdf:
 - Name: NGUYỄN CÔNG SƠN
 - Email: sonnguyen1187@gmail.com
At android_15.pdf:
 - Name: SINH TIEN HO70
 - Email: SIN

Advanced encoding /SymbolSetEncoding not implemented yet


 - Name: TRƯƠNG ĐẶNG NGUYỄN
 - Email: dangnguyenful@gmail.com
At html_js_css_19.pdf:
 - Name: NGUYEN HONG NHAT THANH
 - Email: nhatthanh510@gmail.com
At html_js_css_20.pdf:
 - Name: TIEN DO QUYET
 - Email: Tiendo1011@gmail.com
At html_js_css_21.pdf:
 - Name: Tran Phuoc Tu
 - Email: tranphuoctu@gmail.com
At html_js_css_22.pdf:
 - Name: Sang Lê Nguyễn Vĩnh
 - Email: sanglnv.87@gmail.com
At html_js_css_23.pdf:
 - Name: Thang Hoang Cao
At html_js_css_24.pdf:
 - Name: Nguyen Minh Triet
 - Email: petertriet@gmail.com
 - Email: petertriet@gmail.com
At html_js_css_25.pdf:
At html_js_css_26.pdf:
 - Name: Nguyen Minh Thang
At html_js_css_27.pdf:
 - Name: NGUYỄN BÌNH VĨ
At html_js_css_28.pdf:
 - Name: Mai	Gia	Tran
 - Email: maigiatran@gmail.com
At html_js_css_29.pdf:
 - Name: Lam Nguyen Tien
At html_js_css_30.pdf:
 - Name: TRAN QUANG MINH
 - Email: mtran8@binghamton.edu
At html_js_css_31.pdf:
 - Name: Vinh Bui Quang
 - Email: vinhbq1902@gmail.com
At html_js_css_32.pdf:
 - Name: Trần Bắc Sơn
 - Em