<a href="https://colab.research.google.com/github/isaacyeos/nlp/blob/master/NER_resume.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NER model for extracting relevant fields from resume. The model was trained using Spacy on a dataset of pre-annotated resumes from https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175. This dataset has 220 resumes, of which 200 were used for training and 20 were used for testing. I had to first spend some time cleaning up the annotation dataset as there were many overlaps and spacy does not allow overlap of annotations. Some annotations were also wrong (eg: if someone listed Microsoft Word as a skill, the annotator would annotate Microsoft as the company the person worked at, or if Visual Studio 2005 was listed as a skill, 2005 would be annotated as graduation year). I also had to remove trailing and leading whitespaces from each annotation. 

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import re
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

import spacy
################### Train Spacy NER.###########
def train_spacy():

    TRAIN_DATA = convert_dataturks_to_spacy("gdrive/My Drive/0-software_dev_notes/nlp/Entity-Recognition-In-Resumes-SpaCy-master/traindata.json")
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            if ent[2] not in ner.labels:
              ner.add_label(ent[2])
    print(ner.labels)
    TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp
nlp = train_spacy()

('College Name', 'Companies worked at', 'Degree', 'Designation', 'Email Address', 'Graduation Year', 'Location', 'Name', 'Skills', 'UNKNOWN', 'Years of Experience')
Statring iteration 0
{'ner': 24913.343205252095}
Statring iteration 1
{'ner': 18979.613681942432}
Statring iteration 2
{'ner': 22860.6282382042}
Statring iteration 3
{'ner': 13560.114799581086}
Statring iteration 4
{'ner': 12631.133253704102}
Statring iteration 5
{'ner': 11952.97468677006}
Statring iteration 6
{'ner': 11781.281537752559}
Statring iteration 7
{'ner': 10761.844497440994}
Statring iteration 8
{'ner': 9366.16994616065}
Statring iteration 9
{'ner': 9473.68496954582}


In [0]:
%debug

ERROR:root:No traceback has been produced, nothing to debug.


In [0]:
import pdb
def test_spacy(nlp):
    #test the model and evaluate it
    examples = convert_dataturks_to_spacy("gdrive/My Drive/0-software_dev_notes/nlp/Entity-Recognition-In-Resumes-SpaCy-master/testdata.json")
    tp=0
    tr=0
    tf=0

    ta=0
    c=0        
    for text,annot in examples:

        f=open("resume"+str(c)+".txt","w")
        doc_to_test=nlp(text)
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[]
        for ent in doc_to_test.ents:
            d[ent.label_].append(ent.text)

        for i in set(d.keys()):

            f.write("\n\n")
            f.write(i +":"+"\n")
            for j in set(d[i]):
                f.write(j.replace('\n','')+"\n")
        d={}
        for ent in doc_to_test.ents:
            d[ent.label_]=[0,0,0,0,0,0]
        for ent in doc_to_test.ents:
            doc_gold_text= nlp.make_doc(text) #;pdb.set_trace()
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            print(gold)
            y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
            y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]
            print(len(y_pred))
            print (y_pred)
            if(d[ent.label_][0]==0):
                #f.write("For Entity "+ent.label_+"\n")   
                #f.write(classification_report(y_true, y_pred)+"\n")
                (p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
                a=accuracy_score(y_true,y_pred)
                d[ent.label_][0]=1
                d[ent.label_][1]+=p
                d[ent.label_][2]+=r
                d[ent.label_][3]+=f
                d[ent.label_][4]+=a
                d[ent.label_][5]+=1
        c+=1
    for i in d:
        print("\n For Entity "+i+"\n")
        print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
        print("Precision : "+str(d[i][1]/d[i][5]))
        print("Recall : "+str(d[i][2]/d[i][5]))
        print("F-score : "+str(d[i][3]/d[i][5]))
test_spacy(nlp)

<spacy.gold.GoldParse object at 0x7f4ff187ad68>
310
['Name', 'Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Name', 'Not Na

  _warn_prf(average, modifier, msg_start, len(result))


<spacy.gold.GoldParse object at 0x7f5077c04748>
766
['Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Email Address', 'Email Address', 'Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Address', 'Not Email Add

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<spacy.gold.GoldParse object at 0x7f4ff187ad68>
1231
['Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'Not Graduation Year', 'No

In [0]:
with open("resume0.txt","r") as f:
  msg = f.read()
  print(msg)



Companies worked at:
Oracle
Accenture


Designation:
Application Development Associate


Name:
Abhishek Jha


College Name:
Kendriya Vidyalaya
B.v.b college of engineering and technology


Degree:
10th
12th in Mathematics
B.E in Information science and engineering


Skills:
C (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year),Database Management System (Less than 1 year), Java (Less than 1 year)


Location:
Bengaluru



In [0]:
with open("resume10.txt","r") as f:
  msg = f.read()
  print(msg)



Companies worked at:
Accenture


Designation:
Subject matter Expert


Name:
Asish Ratha


Email Address:
indeed.com/r/Asish-Ratha/853988e0e0e236a3


Skills:
Invoice processing, Team handling, new joiners training.sap posting,vendor call attend andresolve the issue,meet SLA tat,working with client tool.
Invoice (5 years), posting. (5 years), TRAINING (4 years)


Location:
Berhampur
Chennai



In [0]:
f1=open("gdrive/My Drive/0-software_dev_notes/interview-prep/RESUME_Dec2017.txt",'r')
text = f1.read()
f=open("resume_isaac.txt","w")
doc_to_test=nlp(text)
d={}
for ent in doc_to_test.ents:
    d[ent.label_]=[]
for ent in doc_to_test.ents:
    d[ent.label_].append(ent.text)

for i in set(d.keys()):

    f.write("\n\n")
    f.write(i +":"+"\n")
    for j in set(d[i]):
        f.write(j.replace('\n','')+"\n")

In [0]:
with open('resume_isaac.txt','r') as f2:
  msg = f2.read()
  print(msg)



Companies worked at:
RESEARCH & PROJECTS
Cyber Security Programme Center


Designation:
Science and Engineering
Software Engineering Intern at


Degree:
Master of Science in Computer Science                Expected start date:


Name:
ISAAC YEO



In [0]:
TRAIN_DATA = convert_dataturks_to_spacy("gdrive/My Drive/0-software_dev_notes/nlp/Entity-Recognition-In-Resumes-SpaCy-master/traindata.json")
# print(TRAIN_DATA)
print(len(TRAIN_DATA))
print(TRAIN_DATA[0])
print(TRAIN_DATA[0][1])

200
('Govardhana K\nSenior Software Engineer\n\nBengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/\nb2de315d95905b68\n\nTotal IT experience 5 Years 6 Months\nCloud Lending Solutions INC 4 Month • Salesforce Developer\nOracle 5 Years 2 Month • Core Java Developer\nLanguages Core Java, Go Lang\nOracle PL-SQL programming,\nSales Force Developer with APEX.\n\nDesignations & Promotions\n\nWilling to relocate: Anywhere\n\nWORK EXPERIENCE\n\nSenior Software Engineer\n\nCloud Lending Solutions -  Bangalore, Karnataka -\n\nJanuary 2018 to Present\n\nPresent\n\nSenior Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2016 to December 2017\n\nStaff Consultant\n\nOracle -  Bangalore, Karnataka -\n\nJanuary 2014 to October 2016\n\nAssociate Consultant\n\nOracle -  Bangalore, Karnataka -\n\nNovember 2012 to December 2013\n\nEDUCATION\n\nB.E in Computer Science Engineering\n\nAdithya Institute of Technology -  Tamil Nadu\n\nSeptember 2008 to June 2012\n\nhttps:/