In [2]:
%pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting Pillow>=9.1 (from pdfplumber)
  Downloading pillow-11.0.0-cp39-cp39-win_amd64.whl.metadata (9.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-win_amd64.whl.metadata (48 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20231228->pdfplumber)
  Downloading cryptography-44.0.0-cp39-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting cffi>=1.12 (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)
  Downloading cffi-1.17.1-cp39-cp39-win_amd64.whl.metadata (1.6 kB)
Collecting pycparser (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber)
  Downloading pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3

In [5]:
import pdfplumber
import re
import pandas as pd
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

# Fetch (or confirm the presence of) every NLTK resource this notebook relies on.
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

def extract_name(text):
    """Return the first PERSON entity that NLTK's chunker finds in ``text``.

    The text is split into sentences; each sentence is tokenised, POS-tagged
    and chunked in turn, and the search stops at the first subtree labelled
    'PERSON'.

    Args:
        text: Raw text to scan.

    Returns:
        The entity's tokens joined by single spaces, or "Not Found" when no
        sentence contains a PERSON chunk.
    """
    for sent in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        for node in ne_chunk(tagged):
            # Plain (token, tag) leaves are not entities; only subtrees carry a label.
            if not isinstance(node, Tree):
                continue
            if node.label() != 'PERSON':
                continue
            # Each leaf of the subtree is a (token, tag) pair; keep the tokens.
            words = [leaf[0] for leaf in node]
            return " ".join(words)
    return "Not Found"

def extract_email(text):
    """Return the first email address found in ``text``.

    Args:
        text: Raw text to scan. ``None`` or an empty string is treated as
            containing no address.

    Returns:
        The matched address, or "Not Found" when there is none.
    """
    if not text:
        return "Not Found"
    # Every dot-separated domain label must end in a letter or digit, so
    # punctuation that merely follows the address in running text (e.g. the
    # full stop in "mail me at a@b.com.") is not captured as part of it.
    email_match = re.search(
        r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]*[a-zA-Z0-9])+',
        text,
    )
    return email_match.group(0) if email_match else "Not Found"

def extract_qualification(text):
    """Collect degree-related terms that occur in ``text``.

    Matching is case-insensitive. Each distinct term is reported once, in the
    order it first appears, using the spelling of its first occurrence, so
    the result is the same on every run.

    Args:
        text: Raw text to scan. ``None`` or an empty string is treated as
            containing no qualifications.

    Returns:
        The matched terms joined by ", ", or "Not Found" when nothing matches.
    """
    if not text:
        return "Not Found"
    # NOTE(review): the optional groups let a bare "B" or "M" match as a whole
    # word (e.g. a middle initial), so the pattern can produce false positives.
    qualifications = re.findall(r'\b(B(?:\.|achelor)?|M(?:\.|aster)?|Ph\.?D|Diploma|High School|HSC|UG|PG|CS|Engineering|Science)\b', text, re.IGNORECASE)
    # De-duplicate case-insensitively while keeping first-seen order; a plain
    # set() would give an ordering that can differ between interpreter runs.
    first_seen = {}
    for term in qualifications:
        first_seen.setdefault(term.casefold(), term)
    return ", ".join(first_seen.values()) if first_seen else "Not Found"

def extract_resume_details_nltk(file_path):
    """Extract name, qualification and email from a PDF resume.

    The text of every page is read with pdfplumber, joined with newlines and
    passed to extract_name, extract_email and extract_qualification.

    Args:
        file_path: Path of the PDF; handed unchanged to ``pdfplumber.open``.

    Returns:
        A dict with the keys "Name", "Qualification" and "Email".
    """
    with pdfplumber.open(file_path) as pdf:
        # `or ""` tolerates a page that yields no text (some pdfplumber
        # releases return None for such a page — confirm for the pinned version).
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next, which would defeat the word-boundary matching.
    text = "\n".join(page_texts)

    name = extract_name(text)
    email = extract_email(text)
    qualification = extract_qualification(text)

    return {"Name": name, "Qualification": qualification, "Email": email}

# Run the extractor over every resume, producing one record per file
resume_files = ["Resume01.pdf","Resume02.pdf"]  # List of resume file paths
resume_data = [extract_resume_details_nltk(path) for path in resume_files]

# Build the table in a single step and write it out as CSV without the index
output_path = "extracted_resume_data_nltk.csv"
df = pd.DataFrame(resume_data)
df.to_csv(output_path, index=False)

print(f"Data saved to {output_path}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Data saved to extracted_resume_data_nltk.csv
