# Training Notebook for Resume NER Model

In [1]:
import numpy as np
import pandas as pd
import os
import tika
tika.initVM()
from tika import parser
import json
import re
import spacy
import warnings
warnings.filterwarnings("ignore") 

# Dataset Preprocessing

In [2]:
def ResumeDataset(train_filepath):
    """Resumes Dataset"""
    training_data = []
    lines = []
    
    with open(train_filepath,'r') as f:
        lines = f.readlines()
        
    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n"," ")
        entities = []
        data_annotations = data['annotation']
        
        if data_annotations is not None:
            for annotation in data_annotations:
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels,list):
                    labels = [labels]
                
                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']
                    
                    lstrip_diff = len(point_text)-len(point_text.lstrip())
                    rstrip_diff = len(point_text)-len(point_text.rstrip())
                    if lstrip_diff!=0:
                        point_start = point_start+lstrip_diff
                    if rstrip_diff!=0:
                        point_end = point_end-rstrip_diff
                    entities.append((point_start,point_end+1,label))
        training_data.append((text,{"entities" : entities}))
    return training_data

In [3]:
def trim_entity_spans(data:list)->list:
    invalid_span_tokens = re.compile(r'\s')
    
    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [4]:
data = trim_entity_spans(ResumeDataset('dataset/train/Entity Recognition in Resumes.json'))
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

## Overlapping entities

In [5]:
def clean_entities(training_data):
    
    clean_data = []
    for text, annotation in training_data:
        
        entities = annotation.get('entities')
        entities_copy = entities.copy()
        
        # append entity only if it is longer than its overlapping entity
        i = 0
        for entity in entities_copy:
            j = 0
            for overlapping_entity in entities_copy:
                # Skip self
                if i != j:
                    e_start, e_end, oe_start, oe_end = entity[0], entity[1], overlapping_entity[0], overlapping_entity[1]
                    # Delete any entity that overlaps, keep if longer
                    if ((e_start >= oe_start and e_start <= oe_end) \
                    or (e_end <= oe_end and e_end >= oe_start)) \
                    and ((e_end - e_start) <= (oe_end - oe_start)):
                        entities.remove(entity)
                j += 1
            i += 1
        clean_data.append((text, {'entities': entities}))
                
    return clean_data

data = clean_entities(data)

# Named Entity Recognition using Spacy

In [6]:
import random

def train_test_split(dataset, test_split):
    random.shuffle(dataset)
    test_idx = int(test_split*len(dataset))
    return dataset[test_idx:],dataset[:test_idx]

In [7]:
train_data,test_data = train_test_split(data, 0.2)

## Training

In [16]:
def train():
    nlp = spacy.blank('en')
    
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
        
    for _,annotations in train_data:
        for entity in annotations.get('entities'):
            ner.add_label(entity[2])
    
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itr in range(10):
            print("Starting Iteration : "+str(itr))
            random.shuffle(train_data)
            losses={}
            for text, annotations in train_data:
                nlp.update(
                    [text],
                    [annotations],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses,
                )
            print(losses)
    return nlp

In [17]:
nlp = train()

ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <spacy.pipeline.ner.EntityRecognizer object at 0x7f9db0556e30> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

In [None]:
from spacy.gold import GoldParse
from itertools import groupby

def doc_to_bilou(nlp, text):
    
    doc = nlp(text)
    tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
    entities = []
    for entity, group in groupby(tokens, key=lambda t: t[-1]):
        if not entity:
            continue
        group = list(group)
        _, start, _ = group[0]
        word, last, _ = group[-1]
        end = last + len(word)
        
        entities.append((
                start,
                end,
                entity
            ))

    gold = GoldParse(nlp(text), entities = entities)
    pred_ents = gold.ner
    
    return pred_ents

y_test = []
y_pred = []

for text, annots in test_data:
    gold = GoldParse(nlp.make_doc(text), entities = annots.get("entities"))
    ents = gold.ner
    pred_ents = doc_to_bilou(nlp, text)
    
    y_test.append(ents)
    y_pred.append(pred_ents)
    
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from itertools import chain

def ner_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    ), accuracy_score(y_true_combined, y_pred_combined)
    
report, accuracy = ner_report(y_test, y_pred)
print(report)

In [None]:
print(accuracy)

### The Model has an F1 Score of 0.89 and an accuracy score of 0.89

# Saving the Model

In [None]:
nlp.to_disk('saved-NER.model')

In [7]:
import spacy
nlp = spacy.load('saved-NER.model')

ConfigValidationError: 

Config validation error
Make sure the sections and values are formatted correctly.

File contains no section headers.
file: '<string>', line: 1
'{\n'