# Resume Data Extraction Using Python and GenAI

In [2]:
import json
import re
import spacy
import fitz  # PyMuPDF to extract text from PDF
from fuzzywuzzy import fuzz

# Load spaCy model (Make sure it's installed: python -m spacy download en_core_web_trf)
nlp = spacy.load('en_core_web_trf')

def load_text_from_pdf(file_path):
    """Extract text from a PDF file using PyMuPDF."""
    doc = fitz.open(file_path)
    text = ""
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text("text")
    return text

def clean_text(text):
    """Clean up the resume text by removing unwanted characters and normalizing spaces."""
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace and newlines
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = re.sub(r'\s*\n\s*', '\n', text)  # Normalize newline characters
    text = text.strip()
    return text

def extract_email(text):
    """Extract email addresses using a regular expression."""
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, text)
    return emails[0] if emails else "Not Found"

def extract_phone(text):
    """Extract phone numbers using a refined regular expression."""
    phone_pattern = r'\b(?:\+?(\d{1,3})[-.\s]?)?(?:\(?(\d{3})\)?[-.\s]?)?(\d{3})[-.\s]?(\d{4})\b'
    phones = re.findall(phone_pattern, text)
    phone_numbers = ["".join(phone) for phone in phones]  # Join phone number parts
    return phone_numbers[0] if phone_numbers else "Not Found"

def fuzzy_find_section(text, possible_headings):
    """Finds the closest match to a given heading in the text using fuzzy matching."""
    lines = text.split('\n')
    for i, line in enumerate(lines):
        for heading in possible_headings:
            if fuzz.partial_ratio(heading.lower(), line.lower()) > 80:  # Fuzzy match with a threshold
                return i  # Return the line index where the heading is found
    return -1

def extract_section(text, section_start_headings, next_section_headings=None):
    """Generic function to extract a section of text between a start and optional next sections using fuzzy matching."""
    start_idx = fuzzy_find_section(text, section_start_headings)
    if start_idx == -1:
        return ""

    lines = text.split('\n')
    section_lines = []

    if next_section_headings:
        end_idx = fuzzy_find_section("\n".join(lines[start_idx + 1:]), next_section_headings)
        if end_idx != -1:
            end_idx += start_idx + 1  # Adjust relative index to absolute index
        else:
            end_idx = len(lines)  # If no end section found, take till the end of the text
    else:
        end_idx = len(lines)

    for i in range(start_idx + 1, end_idx):
        section_lines.append(lines[i].strip())

    return "\n".join(section_lines).strip()

def preprocess_sections(section_text):
    """Further preprocess extracted sections to remove unwanted content (e.g., email, phone)."""
    section_text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', section_text)  # Remove emails
    section_text = re.sub(r'\b(?:\+?(\d{1,3})[-.\s]?)?(?:\(?(\d{3})\)?[-.\s]?)?(\d{3})[-.\s]?(\d{4})\b', '', section_text)  # Remove phone numbers
    section_text = section_text.strip()
    return section_text

def extract_resume_details(resume_text):
    """Uses spaCy NLP model to extract key details from the resume."""
    resume_text = clean_text(resume_text)  # Clean the resume text
    doc = nlp(resume_text)

    # Extract Name (PERSON entities)
    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

    # Extract Emails and Phone Numbers
    email = extract_email(resume_text)
    phone = extract_phone(resume_text)

    # Extract Professional Summary, Work Experience, and Education using fuzzy matching
    professional_summary = extract_section(resume_text, ["Professional Summary", "Summary", "Objective"], next_section_headings=["Experience", "Work Experience", "Education"])
    work_experience = extract_section(resume_text, ["Work Experience", "Experience"], next_section_headings=["Education", "Certifications", "Skills"])
    education = extract_section(resume_text, ["Education", "Academic Background"], next_section_headings=["Certifications", "Skills", "Technical Skills"])

    # Preprocess the extracted sections to remove any unwanted content
    professional_summary = preprocess_sections(professional_summary)
    work_experience = preprocess_sections(work_experience)
    education = preprocess_sections(education)

    # Extract certifications (optional, handle cases where this section might not exist)
    certifications = extract_section(resume_text, ["Certifications", "Licenses", "Courses"], next_section_headings=["Skills", "Technical Skills"])

    # Refine skills extraction by filtering out non-skill entities
    skills = list(set([ent.text for ent in doc.ents if ent.label_ in ["SKILL", "TECH", "PRODUCT", "LANGUAGE"]]))
    non_skills = ["Technologies", "Pvt.", "Ltd.", "Institute", "University", "College"]
    skills = [skill for skill in skills if all(ns.lower() not in skill.lower() for ns in non_skills)]

    # Build the extracted details into a structured format
    extracted_details = {
        "PersonalData": {
            "Name": names[0] if names else "Not Found",
            "Email": email,
            "Phone": phone
        },
        "ProfessionalSummary": professional_summary,
        "WorkExperience": work_experience,
        "Education": education,
        "Certifications": certifications if certifications else "Not Found",
        "Skills": skills
    }

    return extracted_details

def save_extracted_data(extracted_data, output_file):
    """Saves extracted data to a JSON file."""
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(extracted_data, json_file, indent=4)

if __name__ == "__main__":
    # Input and Output file paths
    resume_file = r"D:\Python\Brainvire project\Project all data\karan\karan\Resume\1695031952_shrikrishna yadav.pdf"  # Replace with the actual path
    output_file = "extracted_resume123.json"

    # Load resume text from PDF
    resume_text = load_text_from_pdf(resume_file)

    # Extract information using spaCy
    extracted_data = extract_resume_details(resume_text)
    print("Extracted Data: ", extracted_data)

    # Save the structured data to a JSON file
    save_extracted_data(extracted_data, output_file)
    print(f"Extraction complete. Data saved to {output_file}.")


Extracted Data:  {'PersonalData': {'Name': 'SHRIKRISHNA YADAV', 'Email': 'kissna.yadav7@gmail.com', 'Phone': '9137173338'}, 'ProfessionalSummary': '', 'WorkExperience': '', 'Education': '', 'Certifications': 'Not Found', 'Skills': ['Hive', 'Amchart', 'Flask Deep Learning', 'Django', 'Python', 'Hadoop SQL', 'JavaScript', 'Opencv', 'SQL', 'NoSQL Django', 'Sql', 'Computer Vision Azure', 'Pyspark', 'Ajax', 'Pan', 'Hadoop']}
Extraction complete. Data saved to extracted_resume123.json.
