# Resume-Segmentation using TSHD 

## Data Pre-Processing

In [None]:
import os
import re
import pdfplumber
import string

# Function to convert PDF to text
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF as one newline-separated string.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The text of all pages, in page order, with a newline between
        consecutive pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text may yield None or an empty string
        # (depending on the pdfplumber release); treat both as "no text".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last line of one page is not fused with the
    # first line of the next page when the text is later split into lines.
    return "\n".join(page_texts)


# Function to tokenize lines
def tokenize_lines(text):
    """Split *text* on newline characters and return the resulting list of lines."""
    return text.split('\n')

# Function to clean data
def clean_data(lines, inference=False):
    """Clean a list of raw text lines.

    Blank lines are dropped in both modes. The decision is made on the raw
    line, before any other cleaning, so the training-mode and inference-mode
    outputs always have the same number of lines and line indices stay
    aligned. A line that only becomes empty through cleaning (for example a
    rule made of dashes) is therefore kept as an empty string.

    Args:
        lines: List of strings, one per line.
        inference: When False (default), leading list markers and all ASCII
            punctuation are removed as well. When True, only blank-line
            removal and space collapsing are applied.

    Returns:
        A new list of cleaned lines.
    """
    # Drop blank lines first so both modes discard exactly the same lines.
    lines = [line for line in lines if line.strip()]

    if not inference:
        # Remove bulleted and numbered list markers. This must run before
        # punctuation removal, which would delete the '.' / ')' the pattern
        # relies on and leave the marker in place.
        lines = [re.sub(r'^[\s]*[\d\-\*][\.\)]\s+', '', line) for line in lines]
        # Remove punctuations
        lines = [re.sub(f"[{re.escape(string.punctuation)}]", "", line) for line in lines]

    # Remove multiple spaces
    lines = [re.sub(r' +', ' ', line) for line in lines]
    return lines

# Function to normalize text
def normalize_text(lines):
    """Return a new list holding the lower-cased version of every line."""
    return list(map(str.lower, lines))

# Function to enumerate lines
def enumerate_lines(lines):
    """Pair every line with its zero-based position as ``(index, line)`` tuples."""
    return list(enumerate(lines))

# Function to refine enumerated lines
def refine_enumerated_lines(enumerated_lines, max_heading_length):
    """Keep only the enumerated lines short enough to be heading candidates.

    Args:
        enumerated_lines: Iterable of ``(index, line)`` pairs.
        max_heading_length: Maximum number of whitespace-separated words a
            line may contain to be kept.

    Returns:
        List of the ``(index, line)`` pairs whose line has at most
        ``max_heading_length`` words, in the original order.
    """
    return [
        (index, text)
        for index, text in enumerated_lines
        if len(text.split()) <= max_heading_length
    ]

# Preprocessing pipeline
def preprocess_resume(resume_path_or_text, max_heading_length=5, inference=False):
    """Run the preprocessing pipeline on a resume.

    Args:
        resume_path_or_text: Either the path of an existing PDF file or the
            resume text itself. It is treated as a path only when
            ``os.path.exists`` reports that it exists.
        max_heading_length: Maximum number of words a line may have to be
            kept as a heading candidate (ignored when ``inference`` is True).
        inference: When True, only the light cleaning of ``clean_data`` in
            inference mode is applied and the text is returned as one string.

    Returns:
        When ``inference`` is False, a list of ``(index, line)`` tuples for
        the cleaned, lower-cased lines that have at most
        ``max_heading_length`` words. When ``inference`` is True, the cleaned
        lines joined with newlines.
    """
    # Convert PDF to text. Use the function's own argument here: the previous
    # code read the notebook-global `resume_path`, which either raised a
    # NameError or silently loaded a different file.
    if os.path.exists(resume_path_or_text):
        text = pdf_to_text(resume_path_or_text)
    else:
        text = resume_path_or_text

    # Tokenize lines, then clean them
    lines = clean_data(tokenize_lines(text), inference)

    # Inference mode stops here: no normalization, enumeration or filtering
    if inference:
        return '\n'.join(lines)

    # Normalize, enumerate, and keep only the short heading candidates
    enumerated_lines = enumerate_lines(normalize_text(lines))
    return refine_enumerated_lines(enumerated_lines, max_heading_length)

Print the potential headings (the short lines kept by the preprocessing step)

In [None]:
# Sample resume used for the walkthrough cells
resume_path = '../resume/Utsav Maskey Resume March 2023.pdf'
# Lines with more words than this are not considered heading candidates
max_heading_length = 4
# List of (index, line) heading candidates returned by the preprocessing pipeline
refined_enumerated_lines = preprocess_resume(resume_path, max_heading_length)

In [None]:
# Print every (index, line) heading candidate on its own line
for line in refined_enumerated_lines:
    print(line)

## Heading Detection

In [None]:
import spacy
from spacy.lang.en import English

# Load spaCy's small English pipeline; detect_headings uses it to tokenize
# and lemmatize both the resume lines and the cue words
nlp = spacy.load('en_core_web_sm')
# Multi-word cue phrases, keyed by unified section name.
# NOTE(review): no code visible in this file reads CUE_PHRASES (detect_headings
# only consults CUE_WORDS) - confirm whether it is meant to be used.
CUE_PHRASES = {
    'profile': ['personal information', 'contact information', 'contact details', 'about me'],
    'experience': ['work experience', 'professional experience', 'employment history'],
    'education': ['educational background', 'academic background', 'education and training', 'academic qualifications', 'education'],
    'skills': ['key skills', 'professional skills', 'technical skills'],
    'certifications': ['certifications', 'certificates', 'awards and certificates', 'licenses'],
    # 'languages': ['languages', 'language skills'],
    'awards': ['awards', 'honors', 'achievements'],
    'interests': ['interests', 'hobbies', 'extracurricular activities'],
    'summary': ['summary', 'professional summary', 'career summary', 'profile'],
    'goal': ['objective', 'career objective', 'career goal', 'employment objective'],
    'military_service': ['military service', 'military experience'],
    'additional_info': ['additional information', 'miscellaneous']
}

# Single cue words, keyed by unified section name. A line containing the lemma
# of any listed word is reported by detect_headings under that section name.
# Some words appear under several sections (e.g. 'awards', 'objective'), so one
# line can be reported for more than one section.
CUE_WORDS = {
    'profile': ['name', 'address', 'phone', 'email', 'linkedin', 'github', 'website'],
    'experience': ['experience', 'job', 'position', 'career'],
    'education': ['education', 'degree', 'university', 'college', 'academics'],
    'skills': ['skills', 'proficiencies', 'abilities', 'competencies'],
    'certifications': ['certifications', 'licenses', 'awards'],
    # 'languages': ['language', 'languages', 'proficiency', 'fluency'],
    'awards': ['awards', 'honors', 'achievements'],
    'interests': ['interests', 'hobbies', 'activities'],
    'summary': ['summary', 'profile', 'objective'],
    'goal': ['goal', 'objective'],
    'military_service': ['military'],
    'additional_info': ['additional', 'miscellaneous']
}

def detect_headings(enumerated_lines):
    """Map unified section names to the line index where each first appears.

    Every line is lemmatized with spaCy and compared against the lemmatized
    cue words in ``CUE_WORDS``. A section is recorded at the smallest index
    of a line that contains one of its cue words.

    Args:
        enumerated_lines: Iterable of ``(index, line)`` pairs.

    Returns:
        A dict ``{section_name: line_index}`` ordered by ascending line
        index. Sections whose cue words never occur are absent.
    """
    # Lemmatize the cue words once up front; they do not depend on the line
    # being inspected, so doing it inside the loop only repeats spaCy calls.
    cue_lemmas = {
        heading: [nlp(cue)[0].lemma_.lower() for cue in cues]
        for heading, cues in CUE_WORDS.items()
    }

    # Initialize heading dictionary
    headings_dict = {}

    for i, line in enumerated_lines:
        # Skip lines that were already identified as headings
        if i in headings_dict.values():
            continue

        # Tokenize line and lemmatize words using spaCy
        doc = nlp(line)
        words = {token.lemma_.lower() for token in doc}

        # Adjacent lemma pairs joined by '_' (bigram tagging)
        bigrams = {
            f"{doc[j].lemma_.lower()}_{doc[j + 1].lemma_.lower()}"
            for j in range(len(doc) - 1)
        }

        for heading, lemmas in cue_lemmas.items():
            found = any(
                lemma in words or lemma.replace(' ', '_') in bigrams
                for lemma in lemmas
            )
            # Record the heading, keeping the earliest index seen so far
            if found and (heading not in headings_dict or headings_dict[heading] > i):
                headings_dict[heading] = i

    # Return the headings as a dict sorted by their line index
    return dict(sorted(headings_dict.items(), key=lambda item: item[1]))

In [None]:
# Show the current heading candidates as the notebook cell output
refined_enumerated_lines

In [None]:
# Driver code so far: preprocess the sample resume into heading candidates,
# detect which unified sections they correspond to, and print the mapping
resume_path = '../resume/Utsav Maskey Resume March 2023.pdf'
refined_enumerated_lines = preprocess_resume(resume_path, max_heading_length=4)
headings_dict = detect_headings(refined_enumerated_lines)
print(headings_dict)

## Segmentation

In [None]:
def extract_segments(resume_text, headings_dict):
    """Split resume text into segments delimited by detected heading lines.

    Args:
        resume_text: Resume text; it is split on newlines and the values of
            ``headings_dict`` are interpreted as indices into that line list.
        headings_dict: Dict ``{section_name: line_index}``.

    Returns:
        A dict ``{section_name: segment_text}``. Each segment runs from its
        heading line up to (not including) the next heading line at a larger
        index, with the lines joined by single spaces. The key ``'profile'``
        is always present and holds the text before the first heading; if a
        ``'profile'`` heading was detected as well, its segment is appended
        to that leading text rather than replacing it.
    """
    # Split the resume text into lines
    lines = resume_text.split('\n')

    # Without any detected heading the whole text is the initial segment
    if not headings_dict:
        return {'profile': ' '.join(lines).strip()}

    # Process headings in line order regardless of the dict's own ordering
    ordered = sorted(headings_dict.items(), key=lambda item: item[1])

    # Map each distinct heading line to the next distinct heading line (or the
    # end of the text). Headings sharing one line thus all get the full
    # segment instead of the first one getting an empty slice.
    starts = sorted(set(headings_dict.values()))
    next_start = dict(zip(starts, starts[1:] + [len(lines)]))

    # Add the initial segment before the first heading
    preamble = ' '.join(lines[:ordered[0][1]]).strip()
    segments_dict = {'profile': preamble}

    # Loop through each heading and extract the text content of the segment
    for heading_name, heading_index in ordered:
        segment_lines = lines[heading_index:next_start[heading_index]]
        segment_text = ' '.join(segment_lines).strip()

        # Keep the leading text when a 'profile' heading is detected too
        if heading_name == 'profile' and preamble:
            segment_text = f"{preamble} {segment_text}".strip()

        # Add the segment to the segments dictionary
        segments_dict[heading_name] = segment_text

    return segments_dict

In [None]:
# Segment the inference-mode text of the sample resume at the detected headings
segments_dict = extract_segments(
    preprocess_resume(resume_path, inference=True),
    headings_dict,
)

# Print each segment under its upper-cased section name, then a separator line
for heading, segment in segments_dict.items():
    print(heading.upper(), segment, '-----------------', sep='\n')

# Inference / Test

In [None]:
resume_text = '''

Aakansha 



Professional Profile

    • Around 6 years of IT experience with 4 years of experience in the Salesforce.com CRM platform both as Administrator and Developer in Salesforce.com and around 1years of experience in Java/J2EE Technologies.
    • Experience working with salesforce.com sandbox and production environments.
    • Extensive experience in designing, creating and maintaining Salesforce.com Workflow Rules,              Approvals, Email Templates (HTML & Visual Force Templates).
    • Worked on the designing of custom objects, custom fields, role based page layouts, custom Tabs,         custom reports, design of Visual Force Pages, Snapshots, Dashboards.
    • Used Data loader for data management in force.com platform.
    • Expertise in developing, deploying and integrating Salesforce.com CRM solutions.
    • Expertise with J2EE complaints IDE’s such as Eclipse.
    • Implemented and delivered projects under Agile Development Environment and Test Driven Environments with large & small project teams
    • Expertise in complete Software Development Life Cycle (SDLC) like analysis, design and development of Multi-Tier Client/Server and Web based applications.
    • Expertise in SQL, PL/SQL, Oracle 9i and MySQL databases.
    • Experienced in UI Technologies – HTML, Java Script.
    • Experienced in Software development and business modelling of Web applications, Client/Server Systems, Distributed applications and other custom built projects on Windows NT/9x/2000/XP.
    • Possess exposure of working for projects spanning Financial, Business Process Management.
    • Have a good Knowledge of various methodologies like Waterfall, Agile methodology (FAM).
    • Experience in providing functional and technical support to peer Team members.
    • Excellent Client interaction skills and proven experience in working independently as well as in a team.


Technical Skills

  Salesforce Technologies
 Apex Classes, Apex Triggers, Visual Force Pages, Apex Web Services, Workflow & Approvals, Dashboards, Reports, Analytic Snapshots, Custom Objects, APEX Data Loader, Force.com Eclipse IDE Plug-in
  Languages
 Apex, Java, J2EE, C, SOQL, SOSL
  Database Enterprise
     Technologies
 Servlet 2.5, JSP 2.0, JDBC 4.0
  Web Application  Technologies
 SOAP, WSDL, XML
  Web Services
 Apache Tomcat 5.0.24/7.0.21
  Databases
 Oracle, MS Access, Microsoft SQL Server, SQL & PL/ SQL
  Tools/IDE
 Eclipse , Net Beans, SQL Query Analyser, MS Visio
  Mail Clients
MS Outlook
  Operating Systems
 Microsoft Windows


Certifications & Education 

Salesforce Certified Administrator
Certificate No: 17636667

Amberton University, Garland, TX
Major: MBA in Management, GPA: 3.45 
Graduation Date: 02/2017 

Tribhuwan University, Nepal
Bachelor in Business Studies, First Division
Graduation Date: 12/2009                                       


Organizational Experience

Company: Fidelity Investments                                                                                              Nov 2016 - Till Date
Location: Dallas, TX                                                                         
Role: Salesforce Consultant

Responsibilities:

    • Assisted in analysis of the existing functionality to identify the enhancements and gaps in the standard functionality, design and documented the solutions to fill the gaps.
    • Interacted with various business user groups to perform detailed analysis of business and technical      requirements and to design solutions using various standard objects in Salesforce.
    •  Implemented security and sharing rules at object, field, and record level for different users at different levels of organization. Also created various profiles and configured the permissions based on the organizational hierarchy.
    • Worked with various custom as well as standard Salesforce objects like Accounts, Contacts.
    • Developed Custom business logic using Apex Classes, Triggers, Components, Visual Force pages and Controller Classes for various functional needs.
    • Worked with Apex Scheduler to invoke batch Apex classes at regular intervals.
    • Wrote SOQL and SOSL within custom controllers, extensions and triggers.
    • Created Visual force pages that could be rendered as PDFs, build dashboard components and define email templates.
    • Created Workflows, Approval processes and developed validation rules.
    • Used Pick lists, Dependent Pick lists, and Record Types to enforce data quality.
    • Created Work flow rules and defined related tasks, email alerts and field updates.
    • Utilized various third party AppExchange tools for Reporting and Data handling.
    • Designed and modified Approval processes and created Approval steps which used email alerts and field updates.
    • Monitored and resolved the Service Requests, specifically, visibility issues involving Custom Profiles, Public Groups, and Sharing Rules.
    • Created as well as Customized Various Dashboards and Reports with Salesforce CRM.
    •  Used Agile methodology to achieve high performance.
    • Deployed meta-data changes and Apex classes from a Developer organization to a production organization using Change Sets.
    • Created Many-to-Many relationships and created Junction objects to implement Roll-up Summary fields to aggregate data from child records on the parent.
    • Supported data migration using Sales force Data Loader.
    • Designed Validation Rules along with Roll-Up Summary Fields to maintain data quantity and data consistency.

Environment: Force.com IDE, Apex Classes, Java, Apex Triggers, Visual Force pages, Validation Rules, Formula Fields, Data Loader, Reports and Dashboards, HTML, Lightning components, Marketing Cloud, Exact Target, Service Cloud, Workflow & Approvals, Web Services. AppExchange, Workflow & Approvals, Reports, Custom Objects.
 

		

Company: Eagle Creek Software Services                                                                       Sep2015 -Oct2016
Location: Boston MA
Role: Salesforce Admin                                                                             

Responsibilities:

    • Worked on Standard objects such as leads, Opportunities, Accounts, Contacts, Campaigns associated with Sales Cloud.
    • Involved in Salesforce.com Application Set up activities and customized the applications to match the functional needs of the organization.
    • Developed various custom objects, Tabs, Components and Visual Force pages and Controllers.
    • Created and deployed several reports using force.com platform.
    • Customized User Roles, Role hierarchies, Profiles and Sharing settings to ensure that the protected data is available only to the authorized users.
    • Developed Apex classes, Controller Classes, Apex triggers and API integration for various functional needs in the application.
    • Apex code to convert lead to a contact and associate the contacts with accounts.
    • Developed and deployed approval processes for leads conversion, opportunities and products/Assets management.
    • Implemented the requirements on Sandbox and Force.com IDE plug in for Eclipse and deploying it in production using Change Sets.
    • Created work flow rules and defined related tasks, email alerts, and field updates.
    • Created page layouts, search layouts to organize fields, custom links, related lists and other components on record detail pages and edit pages.
    • Experienced with Web-to-Lead and Web-to-Case scenarios along with Escalation rules and Assignment Rules.
    • Worked on role hierarchy & sharing rules to configure visibility.
    • Maintained Organizational Hierarchy created Profiles, Roles and maintained Sharing/Security for FLS, Record-level Security.
    • Customized the dashboards to track usage for productivity and performance of business centres and their sales team.

Environment: Saleforce.com platform, Force.com Sites, Apex, Visual Force (Pages, Component & 
Controllers), Pages, Data Loader, Work flow & Approvals, Reports, Custom Objects, Custom Tabs, Email Services


Company: Ann Taylor                                                                                                                Mar2014-Sept2015
Location:  New York City, NY                                                                                                
Role: Salesforce Admin

Responsibilities:

    • Worked on Standard objects such as leads, Opportunities, Accounts, Contacts, Campaigns associated with Sales Cloud.
    • Developed various custom objects, Tabs, Components and Visual Force pages and Controllers.
    • Created and deployed several reports using force.com platform.
    • Developed Apex classes, Controller Classes, Apex triggers and API integration for various functional needs in the application.
    • Apex code to convert lead to a contact and associate the contacts with accounts.
    • Developed and deployed approval processes for leads conversion, opportunities and products, and Assets.
    • Implemented the requirements on Sandbox and Force.com IDE plug in during Eclipse and deploying it in production.
    • Implemented the integration of Salesforce to Oracle Eloqua for marketing.
    • Created work flow rules and defined related tasks, email alerts, and field updates.
    • Created page layouts, search layouts to organize fields, custom links, related lists and other components on record detail pages and edit pages.
    • Experienced with Web-to-Lead and Web-to-Case scenarios along with Escalation rules and Assignment Rules.
    • Customized the dashboards to track usage for productivity and performance of business centres and their sales team.

Environment: Saleforce.com platform, Force.com Sites, Apex, Visual Force (Pages, Component & 
Controllers), Pages, Data Loader, Workflow & Approvals, Reports, Custom Objects, Custom Tabs. 


Company:  Acquia                                                                                                          Jan 2012 – Dec2013
Location: Burlington, MA                                                                      
Role: Java Developer

Responsibilities:
    • Responsible for enhancing existing user interface functionality built using HTML, CSS, JavaScript & XML.
    • Participated in data base design and writing complex queries to read the data from MS SQL Server.
    • Developed front-end using HTML and performed validations using Java Script.
    • Implemented unit testing. 
    • Developed test scripts for the other developers based on the module they are working.
    • Used CVS for Source Control and Version Management.
    • Wrote the code for the new and existing programs as per coding standards.
    • Communicated with the testing team in writing test plans and test cases and execution of test cases.
Environment: Java, JavaScript, SQL Server, CVS, Struts, Eclipse, HTML, CSS.




'''

In [30]:
refined_enumerated_lines = preprocess_resume(resume_text, max_heading_length=4)
headings_dict = detect_headings(refined_enumerated_lines)
print(headings_dict)

# Extract segments based on the detected headings.
# The heading indices refer to the preprocessed line list (blank lines
# removed), so segment the inference-mode text rather than the raw
# `resume_text`, whose blank lines would shift every segment boundary.
segments_dict = extract_segments(preprocess_resume(resume_text, inference=True), headings_dict)

# Print the extracted segments
for heading, segment in segments_dict.items():
    print(heading.upper())
    print(segment)
    print('-----------------')

## Kushal Resume

In [None]:
# Run the full pipeline on a second resume: heading candidates -> headings
resume_path = '../resume/KushalRajSharmaResume.pdf'
refined_enumerated_lines = preprocess_resume(resume_path, max_heading_length=4)
headings_dict = detect_headings(refined_enumerated_lines)
print(headings_dict)

# Segment the inference-mode text at the detected headings
segments_dict = extract_segments(
    preprocess_resume(resume_path, inference=True),
    headings_dict,
)

# Print each segment under its upper-cased section name, then a separator line
for heading, segment in segments_dict.items():
    print(heading.upper(), segment, '-----------------', sep='\n')

# Resume from Internet

In [None]:
# Run the full pipeline on a sample resume: heading candidates -> headings
resume_path = '../resume/resume-sample.pdf'
refined_enumerated_lines = preprocess_resume(resume_path, max_heading_length=4)
headings_dict = detect_headings(refined_enumerated_lines)
print(headings_dict)

# Segment the inference-mode text at the detected headings
segments_dict = extract_segments(
    preprocess_resume(resume_path, inference=True),
    headings_dict,
)

# Print each segment under its upper-cased section name, then a separator line
for heading, segment in segments_dict.items():
    print(heading.upper(), segment, '-----------------', sep='\n')