## Uploading Data

In [1]:
import json
import re

In [2]:
resumes = []
for line in open('Entity Recognition in Resumes.json', 'r'):
    resumes.append(json.loads(line))

## Pre-processing data

In [3]:
training_data = list()
for line in resumes:
    res_text = line['content'].replace("\n", " ")
    res_ent = list()
    annoted_info = line["annotation"]

    if annoted_info is not None:
#         res_labels = list()
        for anno in annoted_info:
            point = anno['points'][0]
            res_labels = anno['label']
            if not isinstance(res_labels, list):
                res_labels = [res_labels]
        
            for label in res_labels:
                point_start = point['start']
                point_end = point['end']
                point_text = point['text']
        
        
                lstrip_diff = len(point_text) - len(point_text.lstrip())
                rstrip_diff = len(point_text) - len(point_text.rstrip())
                if lstrip_diff != 0:
                    point_start = point_start + lstrip_diff
                if rstrip_diff != 0:
                    point_end = point_end - rstrip_diff
                res_ent.append((point_start, point_end + 1 , label))
    training_data.append((res_text, {"entities" : res_ent}))

In [4]:
training_data[0]

("Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [5]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [6]:
cleaned_data = trim_entity_spans(training_data)

In [7]:
cleaned_data[1]

['Afreen Jamadar Active member of IIIT Committee in Third year  Sangli, Maharashtra - Email me on Indeed: indeed.com/r/Afreen-Jamadar/8baf379b705e37c6  I wish to use my knowledge, skills and conceptual understanding to create excellent team environments and work consistently achieving organization objectives believes in taking initiative and work to excellence in my work.  WORK EXPERIENCE  Active member of IIIT Committee in Third year  Cisco Networking -  Kanpur, Uttar Pradesh  organized by Techkriti IIT Kanpur and Azure Skynet. PERSONALLITY TRAITS: • Quick learning ability • hard working  EDUCATION  PG-DAC  CDAC ACTS  2017  Bachelor of Engg in Information Technology  Shivaji University Kolhapur -  Kolhapur, Maharashtra  2016  SKILLS  Database (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT ACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)  ADDITIONAL INFORMATION  TECHNICAL SKILLS:  • Programming Languages: C, C++, Java, .net, php. • Web 

In [8]:
def clean_entities(training_data):
    
    clean_data = []
    for text, annotation in training_data:
       
        entities = annotation.get('entities')
        entities_copy = entities.copy()
       
         # append entity only if it is longer than its overlapping entity
        i = 0
        for entity in entities_copy:
            j = 0
            for overlapping_entity in entities_copy:
                 # Skip self
                if i != j:
                    e_start, e_end, oe_start, oe_end = entity[0], entity[1], overlapping_entity[0], overlapping_entity[1]
                    # Delete any entity that overlaps, keep if longer
                    if ((e_start >= oe_start and e_start <= oe_end) \
                    or (e_end <= oe_end and e_end >= oe_start)) \
                    and ((e_end - e_start) <= (oe_end - oe_start)):
                        entities.remove(entity)
                j += 1
            i += 1
        clean_data.append((text, {'entities': entities}))
                
    return clean_data

extra_cleaned_data = clean_entities(cleaned_data)

In [9]:
!pip install spacy==2.1.4

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Defaulting to user installation because normal site-packages is not writeable


## Modeling

In [10]:
import random
import math

def train_test_split(data, test_size, random_state):

    random.Random(random_state).shuffle(data)
    test_idx = len(data) - math.floor(test_size * len(data))
    train_set = data[0: test_idx]
    test_set = data[test_idx: ]

    return train_set, test_set

In [11]:
train_data, test_data = train_test_split(extra_cleaned_data, test_size = 0.2, random_state = 42)

### Training SpaCy

In [12]:
import spacy

In [13]:
def train_spacy():
    nlp = spacy.blank('en')
    
    if 'ner' not in nlp.pipe_names:
        res_entity_recog = nlp.create_pipe('ner')
        nlp.add_pipe(res_entity_recog, last=True)
        
    for _, anno in train_data:
        for ent in anno.get("entities"):
            res_entity_recog.add_label(ent[2])
            
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [14]:
nlp = train_spacy()

Statring iteration 0
{'ner': 21215.14240076845}
Statring iteration 1
{'ner': 16183.839795593864}
Statring iteration 2
{'ner': 15613.23278257941}
Statring iteration 3
{'ner': 10642.00645072839}
Statring iteration 4
{'ner': 10658.365409684346}
Statring iteration 5
{'ner': 10082.36266356942}
Statring iteration 6
{'ner': 9725.39428014704}
Statring iteration 7
{'ner': 9027.464786986107}
Statring iteration 8
{'ner': 9726.971812602676}
Statring iteration 9
{'ner': 9218.05092095899}


In [15]:
from spacy.gold import GoldParse

In [16]:
from itertools import groupby

In [17]:
def doc_to_bilou(nlp, text):
    doc = nlp(text)
    tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
    entities = []
    for entity, group in groupby(tokens, key=lambda t: t[-1]):
        if not entity:
            continue
        group = list(group)
        _, start, _ = group[0]
        word, last, _ = group[-1]
        end = last + len(word)
        entities.append((
                    start,
                    end,
                    entity
                ))
    gold = GoldParse(nlp(text), entities = entities)
    pred_ents = gold.ner
    
    return pred_ents

In [18]:
y_test = []
y_pred = []

for text, annots in test_data:
    
    gold = GoldParse(nlp.make_doc(text), entities = annots.get("entities"))
    ents = gold.ner
    pred_ents = doc_to_bilou(nlp, text)
    
    y_test.append(ents)
    y_pred.append(pred_ents)

In [19]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from itertools import chain

In [20]:
def ner_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    ), accuracy_score(y_true_combined, y_pred_combined)
    
report, accuracy = ner_report(y_test, y_pred)
print(report)

                       precision    recall  f1-score   support

                    -       0.00      0.00      0.00       428
       B-College Name       0.63      0.78      0.69        54
       I-College Name       0.65      0.75      0.69       103
       L-College Name       0.57      0.70      0.63        54
       U-College Name       1.00      0.50      0.67         6
B-Companies worked at       0.51      0.52      0.52        48
I-Companies worked at       0.01      0.30      0.03        10
L-Companies worked at       0.47      0.48      0.47        48
U-Companies worked at       0.43      0.36      0.39        92
             B-Degree       0.90      0.84      0.87        45
             I-Degree       0.83      0.93      0.88       132
             L-Degree       0.88      0.82      0.85        45
             U-Degree       0.38      0.60      0.46         5
        B-Designation       0.53      0.79      0.64        91
        I-Designation       0.42      0.61      0.50  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Testing the trained model

In [21]:
nlp.to_disk('new_custom_ner')

In [22]:
model = spacy.load('new_custom_ner')

In [23]:
import random
test_doc = training_data[random.randint(0,200)][0]

In [24]:
test_document = model(" ".join(test_doc.split('\n')))
for ent in test_document.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
    print('\n')

Alok Khandai 0 12 Name


Operational Analyst (SQL DBA) Engineer 13 51 Designation


UNISYS 54 60 Companies worked at


Bengaluru 62 71 Location


indeed.com/r/Alok-Khandai/5be849e443b8f467 105 147 Email Address


3.5 Years 158 167 Years of Experience


Operational Analyst (SQL DBA) Engineer 1472 1510 Designation


UNISYS 1512 1518 Companies worked at


Bengaluru 1522 1531 Location


2016 1551 1555 Graduation Year


Microsoft Corporation 2339 2360 Companies worked at


Microsoft 2809 2818 Companies worked at


Microsoft 4079 4088 Companies worked at


Microsoft Corporation 6199 6220 Companies worked at


B.Tech in Computer Science and Engineering in CSE 7904 7953 Degree


Indira Gandhi Institute Of Technology 7955 7992 College Name


2012 7994 7998 Graduation Year


Database (3 years), SQL (3 years), Sql Dba 8008 8050 Skills


C, C++, PL/SQL 8370 8384 Skills




In [25]:
model.entity.labels

('UNKNOWN',
 'Companies worked at',
 'Degree',
 'College Name',
 'Skills',
 'Email Address',
 'Location',
 'Years of Experience',
 'Designation',
 'Name',
 'Graduation Year')

## As you can see based on the skills, companies worked at and designation one can sort the resumes