In [9]:
#import libraries
import os
from os import listdir
from os.path import isfile, join
from docx2python import docx2python
import pymupdf,fitz
import platform
import spacy

In [10]:
# Accumulator for the per-resume result dicts; one entry is appended for
# every resume handed to processTextUsingSpacy.
parserResults = list()

# Every skill listed here must be detected (lower-cased) in a resume for the
# candidate to be marked as selected.
required_skill = {'llm', 'java', '.net'}

In [11]:
# Load spaCy's large English pipeline. The resulting `nlp` object is the
# module-level pipeline that processTextUsingSpacy calls on each resume text.
nlp = spacy.load("en_core_web_lg")
# Add an entity ruler positioned before the statistical "ner" component.
# NOTE(review): placing it first is presumably meant to let the rule-based
# matches take priority over the model's own entities - confirm against the
# spaCy EntityRuler documentation.
ruler=nlp.add_pipe("entity_ruler",before="ner")
# Pattern file, resolved relative to the current working directory.
# Presumably it defines the SKILL / EMAIL / MOBILE labels that
# processTextUsingSpacy checks for - verify against the file itself.
skills = "skill_pattern.jsonl"
ruler.from_disk(skills)

def processTextUsingSpacy(inputText, fileName):
    """Extract contact details and skills from one resume and record the result.

    Runs the module-level ``nlp`` pipeline over ``inputText``, inspects the
    entities it produced, builds a result dict and appends that dict to the
    module-level ``parserResults`` list.

    Args:
        inputText: Plain text of a single resume.
        fileName: Name of the file the text came from; stored under ``"file"``.

    Returns:
        The dict that was appended to ``parserResults``. Keys:
        ``name``, ``emailid``, ``mobile`` (empty strings when not detected),
        ``status`` (True when every entry of ``required_skill`` was found),
        ``skills`` (set of lower-cased skill strings) and ``file``.
    """
    doc = nlp(inputText)
    skills = set()
    name = ''
    email = ''
    mobile = ''
    fallback_mobile = ''

    for ent in doc.ents:
        label, text = ent.label_, ent.text
        if label == 'SKILL':
            skills.add(text.lower())
        elif label == 'EMAIL':
            email = text.lower()
        elif label == 'MOBILE':
            mobile = text
        elif label in ('CARDINAL', 'DATE') and len(text) == 10 and text.isdigit():
            # Heuristic for a bare 10-digit number that was not tagged MOBILE.
            # Requiring digits keeps real dates ("2020-01-02") and formatted
            # amounts ("12,345,678") from being reported as phone numbers.
            fallback_mobile = text

    # An explicit MOBILE entity always wins over the 10-digit heuristic.
    if not mobile:
        mobile = fallback_mobile

    # Compare names against the part before "@" only, so a name cannot match
    # merely because its first letters occur in the mail domain.
    email_user = email.split('@', 1)[0]
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and len(ent.text) > 4:
            # A person is taken as the candidate when the first three letters
            # of the entity appear in the email user name; the last match wins.
            if ent.text[:3].lower() in email_user:
                name = ent.text

    # Selected only when every required skill was detected.
    candidateSelected = all(skill in skills for skill in required_skill)

    json_object = {
        "name": name,
        "emailid": email,
        "mobile": mobile,
        "status": candidateSelected,
        "skills": skills,
        "file": fileName
    }
    parserResults.append(json_object)
    return json_object

In [13]:
#clear stored results so re-running this cell does not duplicate entries
parserResults = []
#get list of files in resume folder
cwd = os.getcwd()
path = os.path.join(cwd, "resume")
# Keep regular files only (sub-directories are not resumes) and sort them,
# because os.listdir returns entries in arbitrary order.
fileList = sorted(
    f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))
)
print('Total Number of files in Resume folder:',len(fileList))
count=0
#convert word or pdf to text and process text using spacy nlp library
for file in fileList:
    filePath = os.path.join(path, file)
    # Dispatch on the real extension, case-insensitively. A substring test
    # would handle a name such as "cv.pdf.docx" twice and skip "CV.PDF".
    extension = os.path.splitext(file)[1].lower()
    if extension == '.docx':
        # extract docx content
        with docx2python(filePath) as docx_content:
            processTextUsingSpacy(docx_content.text, file)
        count+=1
    elif extension == '.pdf':
        # The context manager closes the document even if a page fails.
        with fitz.open(filePath) as doc:
            text = "".join(str(page.get_text()) for page in doc)
        # Replace line breaks with spaces; joining with "" would glue the last
        # word of one line to the first word of the next.
        tx = " ".join(text.split("\n"))
        processTextUsingSpacy(tx, file)
        count+=1
print('Total Number of resumes analyzed:',count)


Total Number of files in Resume folder: 3
Total Number of resumes analyzed: 3


In [14]:
# Print a colour-coded summary for every parsed resume.
# The loop variable is named `result` rather than `object`: at cell level the
# loop variable stays bound after the loop, which would replace the builtin
# `object` for every later cell in the kernel.
for result in parserResults:
    print("\n")
    # Bold blue header naming the resume file.
    print('\033[1m'+'\033[94m'+"------Resume:"+result["file"]+"  NLP Parser Output-------"+'\033[0m')
    if(len(result["skills"]) > 0):
        print(result["name"],'candidate skills:',result["skills"])
    else:
        print(result["name"]," has none of required skills")
    print('required skills:',required_skill)
    print('email:',result["emailid"])
    if(result["mobile"] == ''):
        print("Mobile number not detected.")
    else:
        print('mobile:', result["mobile"])

    # Bold green for selected candidates, bold red for rejected ones.
    if result["status"]:
        print('\033[1m'+'\033[92m'+result["name"],' selected'+'\033[0m')
    else:
        print('\033[1m'+'\033[91m'+result["name"],' rejected'+'\033[0m')




[1m[94m------Resume:aswin_resume.docx  NLP Parser Output-------[0m
Aswin Loganathan candidate skills: {'llm', '.net', 'java'}
required skills: {'llm', 'java', '.net'}
email: aswin.l@company.net
mobile: 123 456 7890
[1m[92mAswin Loganathan  selected[0m


[1m[94m------Resume:Data Scientist.pdf  NLP Parser Output-------[0m
Jane Smith  has none of required skills
required skills: {'llm', 'java', '.net'}
email: janesmith@example.com
Mobile number not detected.
[1m[91mJane Smith  rejected[0m


[1m[94m------Resume:yuvaraj_resume.docx  NLP Parser Output-------[0m
 candidate skills: {'.net', 'java'}
required skills: {'llm', 'java', '.net'}
email: yuvaraj@company.net
mobile: 875 456 7890
[1m[91m  rejected[0m


In [None]:
import json
# Each result's "skills" value is a set, which json cannot encode directly.
# Passing `sorted` as the fallback encoder turns it into a list in a stable
# order; with `tuple` the order followed set iteration, which for strings can
# differ between kernel restarts.
json.dumps(parserResults, default=sorted)