In [2]:
# !pip install docx
# !pip install python-docx
# !pip install spacy
# !pip install pandas
# !pip install difflib
# !pip install re
# !pip install collections
import docx
import spacy
import re
import pandas as pd
from collections import defaultdict
from difflib import get_close_matches
import json

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load Spacy model
nlp = spacy.load('en_core_web_sm')# process text, perform tokenization, part-of-speech tagging, named entity recognition, dependency parsing


In [5]:
# Function to extract text from Word document
def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

In [6]:
# Function to preprocess text
def preprocess_text(text):
    doc = nlp(text.lower())
    clean_text = []
    for token in doc:
        if not token.is_stop and not token.is_punct:
            clean_text.append(token.lemma_)
    return clean_text

In [7]:
# Load skill dataset from CSV
def load_skill_dataset(skill_file):
    skills_df = pd.read_excel(skill_file)
    # Use set for faster lookups
    return set(skills_df['Skill '].dropna().str.lower().unique())


In [8]:
skills_df = pd.read_excel('//content//drive//MyDrive//resume_project//skillset.xlsx')
print(skills_df.head())
print(skills_df.columns)
#skill_list = skills_df['Skill'].dropna().str.lower().unique().tolist()  # Changed from pd.read_csv() to pd.read_excel() to handle Excel files


                                 Skill 
0           Adobe Systems Adobe Acrobat
1                       AdSense Tracker
2                        Atlassian JIRA
3           Blackbaud The Raiser's Edge
4  ComputerEase Construction Accounting
Index(['Skill '], dtype='object')


In [9]:
# Efficient skill matching using preprocessed text and a set
def extract_skills(text, skill_set):
    resume_tokens = set(preprocess_text(text))  # Use a set for O(1) lookups
    return list(skill_set.intersection(resume_tokens))


In [10]:
# Advanced Section Detection using regex
def classify_section(line):
    line = line.lower()
    if re.search(r"summary|overview", line):
        return "summary"
    elif re.search(r"experience|employment|work history", line):
        return "experience"
    elif re.search(r"education|qualifications", line):
        return "education"
    elif re.search(r"skills|technologies", line):
        return "skills"
    elif re.search(r"projects", line):
        return "projects"
    elif re.search(r"certifications|licenses", line):
        return "certifications"
    return None

In [11]:
# Load job titles from JSON file
# Function to load job titles from JSON and preprocess them into a set
def load_job_title_dataset(job_title_file):
    with open(job_title_file, 'r') as file:
        job_title_data = json.load(file)

    # Preprocess the job titles
    job_title_set = set(preprocess_text(" ".join(job_title_data)))
    return job_title_set

# Efficient job title matching using preprocessed text and a set
def extract_job_title(text, job_title_set):
    resume_tokens = set(preprocess_text(text))  # Use a set for O(1) lookups
    return list(job_title_set.intersection(resume_tokens))

In [12]:
# Function to extract sections using classifier
def extract_sections(text):
    sections = defaultdict(str)
    current_section = None

    # Split text into lines
    lines = text.split("\n")
    for line in lines:
        clean_line = line.strip()
        classified_section = classify_section(clean_line)

        if classified_section:
            current_section = classified_section
        elif current_section:
            sections[current_section] += clean_line + " "

    return sections


In [13]:
# Advanced Named Entity Recognition using Spacy
def extract_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities


In [14]:
# Function to extract education details
def extract_education_details(text):
    education_details = []
    lines = text.split("\n")

    # Define regex patterns for degrees and institutions
    degree_pattern = re.compile(r"\b(Bachelor's|Master's|Ph\.?D|Associate|Diploma|Degree|MSc|MA|MBA|BSc|B.Tech|M.Tech|M.Com|B.Com)\b", re.IGNORECASE)
    institution_pattern = re.compile(r"(university|college|institute|school|academy|polytechnic)", re.IGNORECASE)

    current_education = {}
    for line in lines:
        line = line.strip()

        # Detect Degree
        if re.search(degree_pattern, line):
            current_education['degree'] = line
            continue

        # Detect Institution Name
        if re.search(institution_pattern, line) and 'institution' not in current_education:
            current_education['institution'] = line
            education_details.append(current_education)
            current_education = {}  # Reset for the next entry

    return education_details

In [15]:
def extract_experience_details(text):
    # Split the text into lines for easier processing
    lines = text.split("\n")

    # Define patterns for detecting company names, roles, and dates (durations)
    company_pattern = re.compile(r"\b(?:LLC|Inc|Corporation|Corp|Ltd|Solutions|Technologies|Services|Systems|Consulting|Group|Enterprises)\b|\b[A-Z][a-z]+\b")
    role_pattern = re.compile(r"\b(?:Developer|Engineer|Manager|Inspector|Consultant|Specialist|Administrator|Analyst|Lead|Architect)\b", re.IGNORECASE)
    date_pattern = re.compile(r"(\b\d{4}\b|\bPresent\b)")

    experiences = []
    current_experience = {}
    work_experience_started = False

    for i, line in enumerate(lines):
        line = line.strip()

        # Detect Work Experience section
        if 'work experience' in line.lower():
            work_experience_started = True
            continue

        if work_experience_started:
            # Detect Company Name
            if re.search(company_pattern, line) and not current_experience.get('company'):
                current_experience['company'] = line
                continue

            # Detect Role/Title
            if re.search(role_pattern, line) and not current_experience.get('role'):
                current_experience['role'] = line
                continue

            # Detect Duration
            if re.search(date_pattern, line) and not current_experience.get('duration'):
                current_experience['duration'] = line
                experiences.append(current_experience)
                current_experience = {}

    return experiences


In [16]:
def extract_contact_details(text):
   # phone = re.findall(r'\b\d{10}\b', text)
    phone = re.findall(r'^(\+1|1)?\s*\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$', text)

    email = re.findall(r'\S+@\S+', text)
    linkedin = re.findall(r'(https?://www.linkedin.com[^\s]+)', text)
    github = re.findall(r'(https?://github.com[^\s]+)', text)  # Pattern to capture GitHub URLs

    return {
        "phone": phone,
        "email": email,
        "linkedin": linkedin,
        "github": github
    }

In [28]:
# Main function to parse resume with enhanced logic
def parse_resume_advanced(file_path, skill_file,job_title_file):
    text = extract_text_from_docx(file_path)

    # Load the dynamic skill list
    skill_set = load_skill_dataset(skill_file)

    # Load the job title dataset
    job_title_set = load_job_title_dataset(job_title_file)

    # Extract contact details (including GitHub)
    contact_details = extract_contact_details(text)

    # Extract skills from the resume using dynamic skill dataset
    skills = extract_skills(text, skill_set)

    # Extract detailed experience information (Company, Role from job titles, Duration)
    experience_details = extract_experience_details(text)

    # Extract education details
    education_details = extract_education_details(text)

    # Combine the extracted details
    parsed_data = {
        "contact_details": contact_details,
        "skills": skills,
        "experience_details": experience_details,
        "education_details": education_details  # Adding the education section
    }

    return parsed_data


In [29]:
# Usage Example
resume_file = '/content/drive/MyDrive/resume_project/Resumes_2023-04-26_23-41-27/Batuk Data Scientist (1) (1).docx'
skill_file = '/content/drive/MyDrive/resume_project/skillset.xlsx'
job_title_file = '/content/drive/MyDrive/resume_project/job-titles.json'
parsed_resume = parse_resume_advanced(resume_file, skill_file,job_title_file)
print(parsed_resume)

{'contact_details': {'phone': [], 'email': ['atwork.shukla@gmail.com'], 'linkedin': [], 'github': []}, 'skills': ['python', 'salesforce', 'player', 'route', 'tableau', 'r', 'github', 'confluence', 'sas', 'reduce', 'self', 'skill', 'google', 'mysql'], 'experience_details': [], 'education_details': [{'institution': 'Holds a Bachelor of Engineering in Computer Science from Bhilai Institute of Technology with a GPA of 3.7.'}, {'institution': 'Bhilal institute of Technology, INDIA'}]}


In [25]:
# Serializing json
json_object = json.dumps(parsed_resume, indent=5)

# Writing to sample.json
with open("sample_json", "w") as outfile:
    outfile.write(json_object)
# print(outfile.text)