    Extract text from PDF files with PyPDF2 and Pdfminer

    PyPDF2

In [16]:
import PyPDF2

pdfFileObj = open('../resume/resume_skills_example.pdf', 'rb')

pdfReader = PyPDF2.PdfReader(pdfFileObj)

extracted_text = ""
for page in pdfReader.pages:
    extracted_text += page.extract_text().strip()  

print(extracted_text)

pdfFileObj.close()

SKILLS  
 
● Advanced proficiency in SQL, Java, P ython and Apache Spark.  
● Machine learning and Data Science: Scikit -learn and TensorFlow.  
● Experience with data visualization and reporting tools such as Tableau  and Flask . 
● Strong analytical and problem -solving skills .


As we can see, there is a whitespace issue in the extract_text() method of PyPDF2.

Examples: P ython, Scikit -learn.

To solve this problem, we can use pdfminer which is more accurate in text extraction.

In [15]:
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

def pdf_miner(file_path):
    output_string = StringIO()
    with open(file_path, "rb") as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    resume_txt = output_string.getvalue()  # str type
    return resume_txt

In [18]:
output_string = StringIO()

with open('../resume/resume_skills_example.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

extracted_text = output_string.getvalue()
print(extracted_text)

SKILLS 

●  Advanced proficiency in SQL, Java, Python and Apache Spark. 
●  Machine learning and Data Science: Scikit-learn and TensorFlow. 
●  Experience with data visualization and reporting tools such as Tableau and Flask. 
●  Strong analytical and problem-solving skills. 

 




    Extracting skills using NLP techniques

In [2]:
!pip install pdfminer.six > /dev/null 2>&1

In [28]:
import json
import re
from pdfminer.high_level import extract_text
import spacy

nlp = spacy.load("en_core_web_lg")

def load_skills_from_jsonl(file_path):
    skills_set = set()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            skill_entry = json.loads(line.strip())
            # Extract skill from "pattern" key
            skill_name = " ".join([token["LOWER"] for token in skill_entry["pattern"]])
            skills_set.add(skill_name)
    return skills_set


def extract_text_from_pdf(file_path):
    return extract_text(file_path)


def preprocess_text(text):
    """Applies NLP preprocessing: lowercasing, punctuation removal, lemmatization, stopword removal."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    doc = nlp(text)
    
    # Tokenization, lemmatization, and stopword removal
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    return set(tokens)


def extract_skills_from_resume(file_path, predefined_skills):
    raw_text = extract_text_from_pdf(file_path)
    processed_tokens = preprocess_text(raw_text)
    
    matched_skills = processed_tokens.intersection(predefined_skills)
    return matched_skills


predefined_skills = load_skills_from_jsonl("../data/skill_patterns.jsonl")
matched_skills = extract_skills_from_resume("../resume/Junior_software_developer.pdf", predefined_skills)

# print("Extracted Skills:", matched_skills)
for skill in matched_skills:
    print(skill)



sql
c
security
software
testing
mobile
net
php
server
specification
support
business
design
workflow
