# In [6]:
# parser/extractor.py

import fitz  # PyMuPDF
import re
import os
import spacy
import json

# Load the spaCy "en_core_web_sm" pipeline once, at module import time.
# extract_name() runs this pipeline over the resume text to find PERSON entities.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, in page
        order, with nothing inserted between pages.
    """
    # Open the document as a context manager so the underlying file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids building the result with repeated string concatenation.
        return "".join(page.get_text() for page in doc)

def extract_email(text):
    """Return the first email address found in ``text``, or ``None``.

    The domain part is matched as one or more dot-separated labels, so a
    sentence-ending period right after the address (``"... a@b.com."``) is not
    captured as part of the result.

    Args:
        text: The string to search.

    Returns:
        The first matching address as a string, or ``None`` if there is none.
    """
    # Each "\.label" group must contain at least one label character, which is
    # what keeps a trailing "." out of the match.
    match = re.search(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+", text
    )
    return match.group(0) if match else None

def extract_phone(text):
    """Return the first phone-number-like run found in ``text``, or ``None``.

    A candidate is an optional ``+``, a digit, and then at least eight more
    characters drawn from digits, whitespace, hyphens and parentheses.

    Args:
        text: The string to search.

    Returns:
        The first candidate with trailing separators removed, or ``None`` if
        nothing matches.
    """
    match = re.search(r"\+?\d[\d\s\-()]{8,}", text)
    if match is None:
        return None
    # The character class accepts whitespace, so the raw match usually drags
    # along the space or newline that follows the number; trim any trailing
    # whitespace, hyphen or dangling "(" so only the number itself is returned.
    return re.sub(r"[\s\-(]+$", "", match.group(0))

def extract_links(text):
    """Collect every ``http://`` or ``https://`` URL in ``text``.

    A URL is taken to run from its scheme up to (not including) the next
    whitespace character. Results are returned in order of appearance; the
    list is empty when no URL is present.
    """
    url_pattern = re.compile(r"https?://[^\s]+")
    return [hit.group(0) for hit in url_pattern.finditer(text)]

def extract_name(text):
    """Return the text of the first PERSON entity found in ``text``.

    The module-level ``nlp`` pipeline is run over ``text`` and its entities
    are scanned in order; the first one labelled ``"PERSON"`` wins. Returns
    ``None`` when no entity carries that label.
    """
    person_mentions = (
        entity.text for entity in nlp(text).ents if entity.label_ == "PERSON"
    )
    return next(person_mentions, None)

def extract_skills(text, keywords=None):
    """Return the skill keywords that are mentioned in ``text``.

    Matching is case-insensitive and on whole tokens: a keyword only counts
    when it is not directly preceded or followed by a word character. This
    keeps the one-letter skill ``"r"`` from matching every text that merely
    contains the letter r, and ``"sql"`` from matching inside ``"mysql"``.

    Args:
        text: The string to scan.
        keywords: Optional iterable of skill names to look for. When omitted,
            the built-in list below is used.

    Returns:
        A list of the matched keywords formatted with ``str.title()``, in the
        order the keywords are listed, without duplicates.
    """
    if keywords is None:
        keywords = ["python", "sql", "r", "pandas", "excel", "scikit-learn", "power bi", "tableau", "docker"]

    # Lower-case the text once instead of once per keyword.
    haystack = text.lower()
    skills_found = []
    for kw in keywords:
        # re.escape protects keywords containing regex metacharacters
        # (e.g. the hyphen in "scikit-learn").
        pattern = r"(?<!\w)" + re.escape(kw.lower()) + r"(?!\w)"
        if re.search(pattern, haystack):
            skills_found.append(kw.title())
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(skills_found))

def parse_resume(pdf_path):
    """Extract the structured fields of the resume stored at ``pdf_path``.

    The PDF's text is read once and handed to each field extractor in turn.

    Returns:
        A dict with the keys ``"name"``, ``"email"``, ``"phone"``, ``"links"``
        and ``"skills"``, each holding the result of the matching
        ``extract_*`` function.
    """
    resume_text = extract_text_from_pdf(pdf_path)
    # (output key, extractor) pairs; the order here is the key order of the result.
    field_extractors = (
        ("name", extract_name),
        ("email", extract_email),
        ("phone", extract_phone),
        ("links", extract_links),
        ("skills", extract_skills),
    )
    return {field: extract(resume_text) for field, extract in field_extractors}

if __name__ == "__main__":
    # Script entry point: parse one resume and save the result as JSON.
    # All of this lives under the guard so that importing the module for its
    # extract_* helpers does not parse a hard-coded file or write to disk.
    os.makedirs("output", exist_ok=True)

    parsed = parse_resume("Kanish_Resume_DS.pdf")

    with open("output/kanish_resume_parsed.json", "w") as f:
        json.dump(parsed, f, indent=4)

    print("✅ Resume parsed and saved as JSON")

# Output: ✅ Resume parsed and saved as JSON
