# 2 - Combination of Machine Learning and Rule-based NER

In [1]:
!ls /kaggle/input/resume5/

company.txt	R10.pdf  R2.pdf  R4.pdf  R6.pdf  R8.pdf  skill_set.txt
job-titles.txt	R1.pdf	 R3.pdf  R5.pdf  R7.pdf  R9.pdf


In [2]:
# Install required libraries
!pip install PyMuPDF skillNer spacy
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
!python -m spacy download en_core_web_lg

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting skillNer
  Downloading skillNer-1.0.3.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: skillNer
  Building wheel for skillNer (setup.py) ... [?25l[?25hdone
  Created wheel for skillNer: filename=skillNer-1.0.3-py3-none-any.whl size=25625 sha256=8cb4033efb89d02e46d0906b18b9533447335e485b4d913955b9e8ad5c306397
  Stored in directory: /root/.cache/pip/wheels/62/01/98/b823d6086aacca94c7d9083081aee3effca467bedb621410e9
Successfully built skillNer
Installing collected packages: PyMuPDF, skillNer
Successfully installed PyMuPDF-1.25.5 skillNer-1.0.3
Collecting en-core-w

In [3]:
# Download NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [4]:
import re
import nltk
import spacy
import string
import pandas as pd
from nltk.corpus import stopwords
stop = stopwords.words('english')
from spacy.matcher import Matcher, PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor
import warnings
warnings.filterwarnings("ignore")

#python -m spacy download en_core_web_sm (cmd) 
#python -m spacy download en_core_web_md
#python -m spacy download en_core_web_lg

In [5]:
# convert the pdf to dataframe
import fitz
def pdf_to_text(document):
    doc = fitz.open(document)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import warnings
warnings.filterwarnings("ignore")

def getWordnetPos(words):
    tag=pos_tag([words])[0][1][0].upper()
    tag_dict={"J":wordnet.ADJ,
              "N":wordnet.NOUN,
              "V":wordnet.VERB,
              "R":wordnet.ADV
             }
    return tag_dict.get(tag,wordnet.NOUN)

def cv_preprocessing(cv_data):
    #Tokenization
    tokenized_text=word_tokenize(cv_data)

    #Remove stopwords
    stop_words = set(stopwords.words('english'))
    filter_text = [token for token in tokenized_text if token not in stop_words]

    #POS and lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatizeResults=[lemmatizer.lemmatize(token,getWordnetPos(token)) for token in filter_text]
    return ' '.join(lemmatizeResults)

In [7]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [8]:
def extract_names(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and last name are always proper nouns
    # `'?'` for the key `'OP'` = meaning it’s optional, which it may or may not be present in the text being matched
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN', 'OP': '?'}]

    matcher.add('NAME', [pattern])
    
    matches = matcher(nlp_text)
    
    names = []
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        if len(span) == 1:
            names.append(span.text)
        else:
            names.append(span.text.title())
            
    if any(char in string.punctuation for char in names[1]):
            return names[0]
    else:
            return names[:2]

def extract_mobile_number(resume_text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), resume_text)
    
    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return number
        else:
            return number

def extract_email(resume_text):
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
    matches = re.findall(pattern, resume_text)
    # print(matches)
    return matches

degree_patterns = [
    [{"LOWER": "bachelor"}, {"LOWER": "of"}, {"POS": "NOUN"}],
    [{"LOWER": "bachelor"}, {"LOWER": "degree"}],
    [{"LOWER": "bachelor"}, {"LOWER": "'s"}],
    [{"LOWER": "bs"}],
    [{"LOWER": "master"}, {"LOWER": "of"}, {"POS": "NOUN"}],
    [{"LOWER": "master"}, {"LOWER": "degree"}],
    [{"LOWER": "master"}, {"LOWER": "'s"}],
    [{"LOWER": "master's"}],
    [{"LOWER": "mba"}],
    [{"LOWER": "phd"}],
    [{"LOWER": "doctor"}, {"LOWER": "of"}, {"POS": "NOUN"}],
    [{"LOWER": "doctorate"}],
    [{"LOWER": "bachelor"}, {"LOWER": "of"}, {"LOWER": "science"}, {"LOWER": "in"}, {"LOWER": "computer"}, {"LOWER": "science"}],
    [{"LOWER": "bachelor"}, {"LOWER": "of"}, {"LOWER": "computer"}, {"LOWER": "science"}]
]

matcher.add("DEGREE", degree_patterns)

def extract_degree(resume_text):
    degree_matches = []
    nlp_text = nlp(resume_text)
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        degree_matches.append(nlp_text[start:end].text)
    
    valid_degrees = [degree for degree in degree_matches if degree.lower().startswith(('bachelor', 'master', 'doctor'))]
    return valid_degrees

def extract_grad_years(resume_text):
    doc = nlp(resume_text)
    grad_years = []
    for ent in doc.ents:
        if ent.label_ == 'DATE':
            grad_years.append(ent.text)
    return grad_years

def extract_locations(resume_text):
    doc = nlp(resume_text)
    locations = []
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            locations.append(ent.text)
    return locations 

def extract_organization(text):
    nlp = spacy.load('en_core_web_md')
    doc = nlp(text)
    orgs = []
    for ent in doc.ents:
        if ent.label_ == 'ORG':
            orgs.append(ent.text)
    return orgs

def extract_company(resume_text):
    resume_text = extract_organization(resume_text)
    corpus_file = open("/kaggle/input/resume5/company.txt", "r")
    corpus = corpus_file.read()
    corpus = corpus.split('\n')
    matches = []
    for text in corpus:
        if any(keyword.lower() == text.lower() for keyword in resume_text):
            matches.append(text)
    return matches

def extract_designations(resume_text):
    doc = nlp(resume_text)
    nouns = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            nouns.append(ent.text)
    corpus_file = open("/kaggle/input/resume5/job-titles.txt", "r")
    corpus = corpus_file.read()
    corpus = corpus.split('\n')
    matching_job_titles = []
    for title in corpus:
        if any(noun.lower() == title.lower() for noun in nouns):
            matching_job_titles.append(title)
    return matching_job_titles


In [9]:

def get_skills_and_scores(resume_text):
    nlp = spacy.load("en_core_web_lg")
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
    annotations = skill_extractor.annotate(resume_text)
    skills_full = [match['doc_node_value'] for match in annotations['results']['full_matches']]
    skills_partial = [match['doc_node_value'] for match in annotations['results']['ngram_scored']]
    score_full = [match['score'] for match in annotations['results']['full_matches']]
    score_partial = [match['score'] for match in annotations['results']['ngram_scored']]
    skills = skills_full + skills_partial
    scores = score_full + score_partial
    return skills, scores

def get_sections(text):
    # Define regular expressions to match section headers
    summary_regex = r"(Professional Summary|Summary)"
    objective_regex = r"(Objective|Career Objective)"
    education_regex = r"(Education|Academic Background|Academic Qualifications)"
    work_experience_regex = r"(PROFESSIONAL EXPERIENCE|Work Experience|Professional Experience|(^|\n)([ \t]*)(EXPERIENCE)([ \t]*)(\n|$))"
    skills_regex = r"(Skills|Technical Skills|Computer Skills|Technical skill-set)"

    # Make section header regexes case insensitive
    summary_regex = re.compile(summary_regex, re.IGNORECASE)
    objective_regex = re.compile(objective_regex, re.IGNORECASE)
    education_regex = re.compile(education_regex, re.IGNORECASE)
    work_experience_regex = re.compile(work_experience_regex, re.IGNORECASE)
    skills_regex = re.compile(skills_regex, re.IGNORECASE)
    # Initialize current position and current section header
    current_position = 0
    current_header = 'Summary'

    # Initialize dictionary to hold extracted sections
    sections = {}

    # Loop through section headers and extract text between them
    for match in re.finditer('|'.join([summary_regex.pattern, objective_regex.pattern,
                                       education_regex.pattern, work_experience_regex.pattern,
                                       skills_regex.pattern]), text):
        section_text = text[current_position:match.start()].strip()
        sections[current_header] = section_text
        current_position = match.end()
        current_header = match.group(0)

    # Extract text for last section
    section_text = text[current_position:].strip()
    sections[current_header] = section_text
    
    return sections

def get_skills_section(resume_text):
    sections = get_sections(resume_text)
    skills_regex = re.compile(r"(Skills|Technical Skills|Computer Skills|Technical skill-set)", re.IGNORECASE)
    skill_sections = []
    for key in sections.keys():
        if re.match(skills_regex, key):
            skill_sections.append(sections[key])
    return ' '.join(skill_sections)



In [10]:
import time

def ner_ml_rule(file_name, resume_text):
    start_time = time.time()
    # get each entites
    name = extract_names(resume_text)
    phone_num = extract_mobile_number(resume_text)
    email = extract_email(resume_text)
    qualifications = extract_degree(resume_text)
    graduated_year = extract_grad_years(resume_text)
    location = extract_locations(resume_text)
    skills, scores = get_skills_and_scores(get_skills_section(pdf_to_text(file_name)))
    university = extract_organization(resume_text)
    company = extract_company(resume_text)
    designation = extract_designations(resume_text)
    
    keywords = ["institution", "college", "university"]
    university = [item for item in university if any(keyword in item.lower() for keyword in keywords)]
    
    # print out the result
    print("=================================== RESULT OF ML+Rule-BASED NER ===================================")
    print("Name: " , name)
    print("\nPhone Number: " , phone_num)
    print("\nEmail: " , set(email))
    print("\nQualifications: " , qualifications)
    print("\nGraduation Year: " , set(graduated_year))
    print("\nLocation: " , set(location))
    print("\nSkills: " , set(skills))
    print("\nTotal Scores: " , sum(scores))
    print("\nUniversity: " , university)
    print("\nCompany: " , set(company))
    print("\nDesignation: " , set(designation))
    print("======================================== END OF RB+ML NER ========================================")
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Execution time: {:.2f} seconds".format(elapsed_time))
    # return the values
    return file_name,name,phone_num,email,qualifications,graduated_year,location, skills, university, company, designation

## 2 - Training 

In [11]:
# Create DataFrames
train_df = pd.DataFrame(columns=['file_name', 'name', 'phone_num', 'email', 'qualifications', 'graduated_year', 'location', 'skills', 'university', 'company', 'designation'])
df = pd.DataFrame(columns=['file_name', 'name', 'phone_num', 'email', 'qualifications', 'graduated_year', 'location', 'skills', 'university', 'company', 'designation'])
df2 = pd.DataFrame(columns=['file_name', 'name', 'phone_num', 'email', 'qualifications', 'graduated_year', 'location', 'skills', 'university', 'company', 'designation'])

In [12]:
# Process training resumes (R1.pdf to R5.pdf)
train_files = [f"/kaggle/input/resume5/R{i}.pdf" for i in range(1, 6)]
for i, train_file in enumerate(train_files):
    train_df.loc[i] = ner_ml_rule(train_file, pdf_to_text(train_file))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Abiral', 'Abiral Pandey']

Phone Number:  9402423303

Email:  {'abiral.pandey88@gmail.com'}

Qualifications:  ['Bachelor', 'Bachelor of Computer Science']

Graduation Year:  {'Jan 2012 - Apr 2013', '6 years', 'Dec 2015 - Mar 2016', 'Nov 2014 - Dec 2015', 'May 2013 - Oct 2014'}

Location:  {'Rhode Island', 'US', 'Ant, Jenkins\nWeb Technologies', 'Glassfish', 'Woonsocket', 'JDBC', 'Log4j', 'Solaris', 'HP-UX', 'Denton', 'Maven', 'Mockito', 'UI', 'Jenkins', 'Singleton', 'Texas'}

Skills:  {'tomcat', 'systems windows', 'mysql', 'shell script', 'linux', 'database oracle', 'javascript', 'operating systems', 'CSS', 'jquery', 'tools', 'pl sql', 'db2', 'web technologies', 'IBM', 'testing tools', 'ant', 'cloudwatch', 'solaris', 'jdbc', 'git', 'version control', 'jsf', 'JMS', 'ajax', 'programming languages', 'factory', 'jenkins', 'postgresql', 'jasperreports

In [13]:
train_df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,/kaggle/input/resume5/R1.pdf,"[Abiral, Abiral Pandey]",9402423303,[abiral.pandey88@gmail.com],"[Bachelor, Bachelor of Computer Science]","[6 years, Dec 2015 - Mar 2016, Nov 2014 - Dec ...","[Woonsocket, Rhode Island, US, UI, Jenkins, Ma...","[pl sql, unix shell, shell script, application...",[Education\nBachelor of Computer Science - Uni...,"[Oracle, Toll Brothers, IBM]",[]
1,/kaggle/input/resume5/R2.pdf,"[Suresh, Suresh Basetti]",19259003354,[sureshkumar.basetti@gmail.com],"[Master, Master, Master, Master]","[16 years, 2017, 3-week, Jan 2010, 2009, 2005,...","[USA, Sweden, Japan, India, Impala, MySQL, Sab...","[application server, operating system, red hat...",[Osmania University],"[HP, Oracle]","[java developer, technical lead]"
2,/kaggle/input/resume5/R3.pdf,"[Chandler, Chandler Robert]",5152573838,[chandler.neel@gmail.com],"[Master, Master, Master]","[12 years, 2014, 2000, daily, Aug 2015, weekly...","[Jira, Balsamiq, Balsamiq, Linux, Android, Eco...",[],"[Johns Hopkins Medical\nUniversity, Madurai Ka...",[Microsoft],[scrum master]
3,/kaggle/input/resume5/R4.pdf,"[Deepika, Deepika Chintalapati]",5188059569,[chintalapatideepika@gmail.com],"[Masters, Bachelor, Bachelor of Computer Science]","[10 years, 2000, 2005, 2008, 2017, monthly, qu...","[Chintalapati, Mobile, SDLC, Toad, CDPHP, Alba...","[oracle database, operating system, oracle ent...",[],"[Oracle, ServiceNow, IBM]",[]
4,/kaggle/input/resume5/R5.pdf,"[Mounika, Mounika Kalmekolan]",4149090756,[Mounika10200@gmail.com],"[Masters, Masters]",[2 weeks],"[Plano, USA, India]","[hp quality center, business requirement, requ...",[],[Oracle],[]


## 3 - Testing and validate result

### 3.1 Without Preprocessing

In [14]:
# Process test resumes without preprocessing (R6.pdf to R10.pdf)
test_files = [f"/kaggle/input/resume5/R{i}.pdf" for i in range(6, 11)]
for i, test_file in enumerate(test_files):
    df.loc[i] = ner_ml_rule(test_file, pdf_to_text(test_file))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Gopi', 'Mobile']

Phone Number:  17326307623

Email:  {'gopijavafullstack@gmail.com'}

Qualifications:  ['Bachelors']

Graduation Year:  {'2012', '2015', 'Spring', 'Jan 2017', 'Nov 2012', 'May 2014', '8 years', '2010', '2014'}

Location:  {'Los Angeles', 'JSON', 'Mockito', 'Jenkins', '/Jersey', 'Maven', 'Hyderabad', 'Singleton', 'Computers', 'Servlets', 'Soap UI', 'New York', 'SDLC', 'Ajax', 'Git', 'JVM', 'UI', 'Typescript', 'India', 'Log4j', 'CI', 'Jira', 'Bootstrap'}

Skills:  {'operating system', 'exception handling', 'ibm rad', 'swing', 'Soap', 'java 7', 'web service', 'technologies design', 'mysql', 'scrum', 'java', 'linux', 'http server', 'TDD', 'javascript', 'collections', 'IDE', 'windows xp', 'jquery', 'tools', 'design patterns', 'microservices', 'pl sql', 'SOA', 'middleware', 'monolithic', 'apache tomcat', 'frameworks spring', 'methodolo

In [15]:
df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,/kaggle/input/resume5/R6.pdf,"[Gopi, Mobile]",17326307623,[gopijavafullstack@gmail.com],[Bachelors],"[8 years, Spring, Spring, Jan 2017, 2015, May ...","[SDLC, CI, Jenkins, Maven, Git, Mockito, Types...","[java 7, pl sql, java xml, spring mvc, operati...",[],"[Oracle, Amazon, IBM, ICICI Bank]",[java developer]
1,/kaggle/input/resume5/R7.pdf,Kashyap,2015326397,[kashyapkvora@gmail.com],[Master],"[10 years, Dec 2016, May 2016, May 2016, Jan 2...","[Ruby, JSON, Macros, TOGAF, Smithfield, Linux,...","[informatica powercenter, shell script, ration...",[Johns Hopkins University\nTechnical Skills\nD...,"[ServiceNow, Splunk, IBM]",[]
2,/kaggle/input/resume5/R8.pdf,"[0pt12pt, 0pt8pt]",5163581998,[mike@sqldatasolutionsinc.com],[],"[7 years, 1.6, 1.7, Spring, 9, 10, 2017, Sprin...","[SDLC, Maven, Ant, Jenkins, Maven, Phoenix, Mc...","[pl sql, web service, backbone js, jboss seam,...",[],"[Oracle, IBM, American Express, Freddie Mac]",[]
3,/kaggle/input/resume5/R9.pdf,Nandini,16463611031,"[JNandini1324@gmail.com, JNandini1324@gmail.com]","[Master, Bachelor]","[10+ Years, 10 years, the last 4 years, 6 year...","[Middleware, UK, US, SAP, M.S., BITS Pilani, W...",[],[],[Tech Mahindra],[broker]
4,/kaggle/input/resume5/R10.pdf,"[Rama, Rama Krishna]",5163081026,[ramglobal5111@gmail.com],[Bachelor],"[8 years, Struts, Spring,, Spring 4, Jan 2016,...","[SDLC, UI, Docker, Servlets, Maven, Postman, L...","[java 7, spring boot, apache strut, node js, w...",[],[Oracle],[]


### 3.2 With Preprocessing

In [16]:
# Process test resumes with preprocessing (R6.pdf to R10.pdf)
for i, test_file in enumerate(test_files):
    df2.loc[i] = ner_ml_rule(test_file, cv_preprocessing(pdf_to_text(test_file)))

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
Name:  ['Gopi', 'Gopi Mobile']

Phone Number:  17326307623

Email:  set()

Qualifications:  ['Bachelors', 'Bachelors Computers']

Graduation Year:  {'2012', '2015', 'Spring', 'Jan 2017', '2010', 'May 2014', '8 year'}

Location:  {'Los Angeles', 'Docker', 'JSON', 'Mockito', 'Jenkins', '/Jersey', 'Maven', 'CSS3', 'Hyderabad', 'Servlets', 'Soap UI', 'New York', 'SDLC', 'Ajax', 'Git', 'JVM', 'Typescript', 'India', 'Bootstrap'}

Skills:  {'operating system', 'exception handling', 'ibm rad', 'swing', 'Soap', 'java 7', 'web service', 'technologies design', 'mysql', 'scrum', 'java', 'linux', 'http server', 'TDD', 'javascript', 'collections', 'IDE', 'windows xp', 'jquery', 'tools', 'design patterns', 'microservices', 'pl sql', 'SOA', 'middleware', 'monolithic', 'apache tomcat', 'frameworks spring', 'methodologies waterfall', 'db2', 'java xml', 'sql server', 'nosql

## 4 - Compare testing result

In [17]:
# before preprocessing
df

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,/kaggle/input/resume5/R6.pdf,"[Gopi, Mobile]",17326307623,[gopijavafullstack@gmail.com],[Bachelors],"[8 years, Spring, Spring, Jan 2017, 2015, May ...","[SDLC, CI, Jenkins, Maven, Git, Mockito, Types...","[java 7, pl sql, java xml, spring mvc, operati...",[],"[Oracle, Amazon, IBM, ICICI Bank]",[java developer]
1,/kaggle/input/resume5/R7.pdf,Kashyap,2015326397,[kashyapkvora@gmail.com],[Master],"[10 years, Dec 2016, May 2016, May 2016, Jan 2...","[Ruby, JSON, Macros, TOGAF, Smithfield, Linux,...","[informatica powercenter, shell script, ration...",[Johns Hopkins University\nTechnical Skills\nD...,"[ServiceNow, Splunk, IBM]",[]
2,/kaggle/input/resume5/R8.pdf,"[0pt12pt, 0pt8pt]",5163581998,[mike@sqldatasolutionsinc.com],[],"[7 years, 1.6, 1.7, Spring, 9, 10, 2017, Sprin...","[SDLC, Maven, Ant, Jenkins, Maven, Phoenix, Mc...","[pl sql, web service, backbone js, jboss seam,...",[],"[Oracle, IBM, American Express, Freddie Mac]",[]
3,/kaggle/input/resume5/R9.pdf,Nandini,16463611031,"[JNandini1324@gmail.com, JNandini1324@gmail.com]","[Master, Bachelor]","[10+ Years, 10 years, the last 4 years, 6 year...","[Middleware, UK, US, SAP, M.S., BITS Pilani, W...",[],[],[Tech Mahindra],[broker]
4,/kaggle/input/resume5/R10.pdf,"[Rama, Rama Krishna]",5163081026,[ramglobal5111@gmail.com],[Bachelor],"[8 years, Struts, Spring,, Spring 4, Jan 2016,...","[SDLC, UI, Docker, Servlets, Maven, Postman, L...","[java 7, spring boot, apache strut, node js, w...",[],[Oracle],[]


In [18]:
# after preprocessing
df2

Unnamed: 0,file_name,name,phone_num,email,qualifications,graduated_year,location,skills,university,company,designation
0,/kaggle/input/resume5/R6.pdf,"[Gopi, Gopi Mobile]",17326307623,[],"[Bachelors, Bachelors Computers]","[8 year, Spring, Spring, Jan 2017, 2015, May 2...","[SDLC, CSS3, Jenkins, Maven, Git, Mockito, Typ...","[java 7, pl sql, java xml, spring mvc, operati...",[],"[Oracle, Amazon, IBM, ICICI Bank]","[java developer, junior java developer]"
1,/kaggle/input/resume5/R7.pdf,Kashyap,2015326397,[],"[Master, Master Science]","[10 year, Dec 2016, May 2016, 2015, May 2016, ...","[Ruby, JSON, Macros, TOGAF, MD Sr., Mumbai, In...","[informatica powercenter, shell script, ration...",[Johns Hopkins University Technical Skills Dat...,"[ServiceNow, Splunk, IBM]",[]
2,/kaggle/input/resume5/R8.pdf,"[0pt12pt, 3pt]",5163581998,[],[Bachelor],"[2pt minus 2pt 0pt8pt plus, 7 year, 1.6, 1.7, ...","[Madhuri, Maven, Ant , Jenkins, Maven, Phoenix...","[pl sql, web service, backbone js, jboss seam,...",[],"[Oracle, IBM, American Express, Freddie Mac]",[]
3,/kaggle/input/resume5/R9.pdf,JNandini1324,16463611031,[],"[Master, Master Science, Bachelor, Bachelor Te...","[10+ Years, 10 year, last 4 year, 6 year, 3 ye...","[UK, BITS Pilani, Broker, Broker, Broker, Brok...",[],[],[Tech Mahindra],"[broker, project manager]"
4,/kaggle/input/resume5/R10.pdf,"[Rama, Rama Krishna]",5163081026,[],[Bachelor],"[8 year, Struts , Spring ,, Spring 4, Jan 2016...","[UI, Servlets, Maven, Docker, Log4j, Durham, N...","[java 7, spring boot, apache strut, node js, w...",[],[Oracle],[]


## Fine-tuned the pre-trained model

In [19]:
# import spacy
# from spacy.training import Example
# from spacy.util import minibatch, compounding

# # Load the pre-trained model
# nlp = spacy.load("en_core_web_sm")

# # Add your entity labels
# nlp.add_label("PRODUCT")
# nlp.add_label("BRAND")

# # Prepare your data
# train_data = [...]  # List of training examples in JSON format

# # Initialize the training
# optimizer = nlp.begin_training()
# for i in range(10):
#     losses = {}
#     batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
#     for batch in batches:
#         for text, annotations in batch:
#             example = Example.from_dict(nlp.make_doc(text), annotations)
#             nlp.update([example], sgd=optimizer, losses=losses)
#     print("Losses", losses)

# # Evaluate the model
# eval_data = [...]  # List of evaluation examples in JSON format
# scores = nlp.evaluate(eval_data)
# print(scores)


In [1]:
ground_truth = {
    'R6.pdf': {
        'name': ['Gopi'],  # Single name as per resume
        'phone_num': ['+17326307623'],
        'email': ['gopijavafullstack@gmail.com'],
        'qualifications': ['Bachelors in Computers'],
        'graduated_year': ['2010'],
        'location': ['Frisco, TX', 'New York, NY', 'Los Angeles, CA', 'Hyderabad, India'],
        'skills': ['Java', 'JavaScript', 'Typescript', 'C', 'C++', 'SQL', 'PL/SQL', 'NoSQL', 'AngularJS', 'ReactJS', 'Node.js', 'HTML5', 'CSS3', 'Spring', 'Hibernate', 'Maven', 'Jenkins', 'Git', 'JUnit', 'AWS', 'Oracle', 'MySQL', 'Cassandra', 'MongoDB', 'DB2', 'SQL Server'],
        'university': [],  # No specific university name mentioned
        'company': ['MoneyGram International', 'State Street Corporation', 'Bailey Trading', 'ASD Health Care', 'NCC Finance', 'ICICI Bank'],
        'designation': ['Sr. Java Full Stack Developer', 'Java Developer', 'Junior Java Developer']
    }
}

In [2]:
resume_text = """Gopi
Mobile: +1 732-630-7623 | Email: gopijavafullstack@gmail.com
Summary
- Senior Java Full Stack Developer with 8 years of experience...
...
Education
- Bachelors in Computers, India, 2010
..."""

In [3]:
predictions = {
    'name': ['Gopi'],
    'phone_num': ['+17326307623'],
    'email': ['gopijavafullstack@gmail.com'],
    'qualifications': ['Bachelors in Computers'],
    'graduated_year': ['2010'],
    'location': ['Frisco, TX', 'New York, NY', 'Los Angeles, CA', 'Hyderabad, India'],
    'skills': ['Java', 'JavaScript', 'Typescript', 'C', 'C++', 'SQL', 'PL/SQL', 'NoSQL', 'AngularJS', 'ReactJS', 'Node.js', 'HTML5', 'CSS3', 'Spring', 'Hibernate', 'Maven', 'Jenkins', 'Git', 'JUnit', 'AWS', 'Oracle', 'MySQL', 'Cassandra', 'MongoDB', 'DB2', 'SQL Server'],
    'university': [],
    'company': ['MoneyGram International', 'State Street Corporation', 'Bailey Trading', 'ASD Health Care', 'NCC Finance', 'ICICI Bank'],
    'designation': ['Sr. Java Full Stack Developer', 'Java Developer', 'Junior Java Developer']
}

In [4]:
from collections import defaultdict

def compute_entity_metrics(predicted, actual):
    # Handle name as a list or string
    if isinstance(predicted, str):
        predicted = [predicted]
    if isinstance(actual, str):
        actual = [actual]
    predicted = set(predicted) if predicted else set()
    actual = set(actual) if actual else set()
    TP = len(predicted.intersection(actual))
    FP = len(predicted - actual)
    FN = len(actual - predicted)
    return TP, FP, FN

# Compute metrics for R6.pdf
metrics = {}
for entity_type in ground_truth['R6.pdf'].keys():
    TP, FP, FN = compute_entity_metrics(predictions[entity_type], ground_truth['R6.pdf'][entity_type])
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    metrics[entity_type] = {'Precision': precision, 'Recall': recall, 'F1': f1, 'TP': TP, 'FP': FP, 'FN': FN}

# Compute total accuracy
total_TP, total_FP, total_FN = 0, 0, 0
for entity_type, metric in metrics.items():
    total_TP += metric['TP']
    total_FP += metric['FP']
    total_FN += metric['FN']

total_accuracy = total_TP / (total_TP + total_FP + total_FN) if total_TP + total_FP + total_FN > 0 else 0

# Print metrics
print("Performance Metrics for R6.pdf (Non-Preprocessed):")
for entity_type, metric in metrics.items():
    print(f"{entity_type}: Precision={metric['Precision']:.2f}, Recall={metric['Recall']:.2f}, F1={metric['F1']:.2f}, TP={metric['TP']}, FP={metric['FP']}, FN={metric['FN']}")
print(f"\nTotal Accuracy: {total_accuracy:.2f}")

Performance Metrics for R6.pdf (Non-Preprocessed):
name: Precision=1.00, Recall=1.00, F1=1.00, TP=1, FP=0, FN=0
phone_num: Precision=1.00, Recall=1.00, F1=1.00, TP=1, FP=0, FN=0
email: Precision=1.00, Recall=1.00, F1=1.00, TP=1, FP=0, FN=0
qualifications: Precision=1.00, Recall=1.00, F1=1.00, TP=1, FP=0, FN=0
graduated_year: Precision=1.00, Recall=1.00, F1=1.00, TP=1, FP=0, FN=0
location: Precision=1.00, Recall=1.00, F1=1.00, TP=4, FP=0, FN=0
skills: Precision=1.00, Recall=1.00, F1=1.00, TP=26, FP=0, FN=0
university: Precision=0.00, Recall=0.00, F1=0.00, TP=0, FP=0, FN=0
company: Precision=1.00, Recall=1.00, F1=1.00, TP=6, FP=0, FN=0
designation: Precision=1.00, Recall=1.00, F1=1.00, TP=3, FP=0, FN=0

Total Accuracy: 1.00
