In [2]:
import re
import json
import os
import docx
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        A single string holding the extracted text of all pages, with no
        separator inserted between pages. Empty when the PDF has no pages.
    """
    with open(pdf_path, "rb") as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # Extraction has to finish while the file is still open, so the
        # join is done inside the ``with`` block. ``or ""`` keeps a page
        # that yields a falsy result from breaking the concatenation.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def extract_text_from_docx(docx_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        docx_path: Path to the Word document, passed to ``docx.Document``.

    Returns:
        The text of each paragraph in ``doc.paragraphs`` followed by a
        newline, joined into one string. Empty when there are no paragraphs.
    """
    document = docx.Document(docx_path)
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)

def extract_professional_experience(text):
    """Collect the text following experience-style labels in a resume.

    A label is one of ``Work Experience:``, ``Experience:`` or
    ``Employment:`` (case-insensitive). For each occurrence, the rest of
    that line is captured and stripped of surrounding whitespace.

    Args:
        text: The resume text to search.

    Returns:
        A list of captured strings, grouped by label in the order listed
        above and, within each label, in order of appearance. Empty when
        no label is found.
    """
    # Example line for each label:
    #   Work Experience: Company XYZ, Position: Software Engineer, Duration: Jan 2018 - Dec 2020
    experience_patterns = [
        r'Work Experience:(.*)',
        # "Experience:" is a suffix of "Work Experience:". Without the
        # lookbehind, each such line would be captured a second time here.
        r'(?<!Work )Experience:(.*)',
        r'Employment:(.*)',
    ]

    return [
        match.strip()
        for pattern in experience_patterns
        for match in re.findall(pattern, text, re.IGNORECASE)
    ]

def extract_information_from_text(text):
    """Extract contact details, skills and experience from resume text.

    Args:
        text: The plain text of a resume.

    Returns:
        A dict with these keys:

        * ``"First Name"`` / ``"Last Name"`` - the first two words of the
          text; present only when the text has at least two words.
        * ``"linkedin"`` - the first line mentioning "linkedin", stripped
          of surrounding whitespace, or ``None``.
        * ``"github"`` - the first ``https://github.com/...`` URL, or ``None``.
        * ``"Email"`` - the first token containing ``@``; present only when found.
        * ``"Phone"`` - the first run of 10 digits, optionally prefixed
          with ``+91``; present only when found.
        * ``"Skills"`` - the comma-separated items after the first
          ``Skills: `` label on the same line; present only when found.
        * ``"Experience"`` - the list returned by
          ``extract_professional_experience``.
    """
    extracted_info = {}

    # Name heuristic: the first two words of the document are taken to be
    # the first and last name.
    name_words = re.findall(r'\b(\w+)\b', text)
    if len(name_words) >= 2:
        extracted_info["First Name"] = name_words[0]
        extracted_info["Last Name"] = name_words[1]

    # LinkedIn: the pattern grabs the whole line, so trim the padding that
    # text extraction tends to leave around it.
    linkedin_match = re.search(r'.*linkedin.*', text, re.IGNORECASE)
    extracted_info["linkedin"] = (
        linkedin_match.group(0).strip() if linkedin_match else None
    )

    # GitHub: dots are escaped so only the literal host "github.com" matches.
    github_match = re.search(r'https://github\.com/[^\s]+', text, re.IGNORECASE)
    extracted_info["github"] = github_match.group(0) if github_match else None

    # Email: first whitespace-delimited token containing "@".
    email_match = re.search(r"\b\S+@\S+\b", text)
    if email_match:
        extracted_info["Email"] = email_match.group(0)

    # Phone: 10 consecutive digits with an optional "+91" country prefix.
    phone_match = re.search(r'(?:\+91\s?)?\d{10}', text)
    if phone_match:
        extracted_info["Phone"] = phone_match.group(0)

    # Skills: only the remainder of the label's own line is read. Blank
    # entries produced by trailing or doubled commas are dropped.
    skills_match = re.search(r"Skills: (.+)", text, re.IGNORECASE)
    if skills_match:
        extracted_info["Skills"] = [
            skill.strip()
            for skill in skills_match.group(1).split(",")
            if skill.strip()
        ]

    extracted_info["Experience"] = extract_professional_experience(text)

    return extracted_info

def extract_information(resume_path):
    """Read a resume file and return the information extracted from it.

    The file type is chosen from the path's extension, compared
    case-insensitively so that ``.PDF`` and ``.Docx`` are accepted too.

    Args:
        resume_path: Path to a ``.pdf`` or ``.docx`` resume.

    Returns:
        The dict produced by ``extract_information_from_text``.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    _, file_extension = os.path.splitext(resume_path)
    file_extension = file_extension.lower()

    if file_extension == ".pdf":
        text = extract_text_from_pdf(resume_path)
    elif file_extension == ".docx":
        text = extract_text_from_docx(resume_path)
    else:
        raise ValueError("Unsupported file format")

    return extract_information_from_text(text)

def main(resume_path=None):
    """Extract information from a resume and print it as indented JSON.

    Args:
        resume_path: Path to the resume to process. When omitted, the
            sample path hard-coded below is used.
    """
    if resume_path is None:
        # Constructing the file path using os.path.join()
        resume_path = os.path.join(r"C:\Users\ilali\Documents\SQL Server Management Studio\Documents","Resume_Lokesh.pdf") #Paste Resume Path here
    extracted_info = extract_information(resume_path)
    print(json.dumps(extracted_info, indent=4))

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

{
    "First Name": "Lokesh",
    "Last Name": "Pawar",
    "linkedin": "https://www.linkedin.com/in/lokesh -pawar -402096a1/  ",
    "github": null,
    "Email": "ilokeshpawar@gmail.com",
    "Phone": "+91 9340579468",
    "Skills": [
        "Communication",
        "Presentation Skill",
        "Adaptability",
        "Teamwork & Collaboration",
        "Critical"
    ],
    "Experience": []
}
