In [None]:
!pip install plac

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting plac
  Downloading plac-1.3.5-py2.py3-none-any.whl (22 kB)
Installing collected packages: plac
Successfully installed plac-1.3.5


In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
import pickle
from tqdm import tqdm
import json

In [None]:
import json
import re

# Data preprocessing: convert the annotated data into the format needed to update the NER model.

def data_preprocessing(json_filepath):
    """Convert a JSON-lines annotation file into (text, annotations) tuples.

    Every non-blank line of the file is parsed as one JSON object holding a
    ``content`` string and an ``annotation`` list (or ``null``). Each
    annotation contributes one entity per label, taken from its first
    ``points`` entry. The point's ``start``/``end`` offsets are tightened by
    the amount of leading/trailing whitespace in the point's ``text``, and
    one is added to the end offset.

    Args:
        json_filepath: Path of the UTF-8 encoded JSON-lines file.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple
        per record, where newlines in ``text`` are replaced by spaces.
    """
    training_data = []
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_filepath, 'r', encoding='utf-8') as f:
        for ln in f:
            # Tolerate blank lines (e.g. an empty line at the end of the file).
            if not ln.strip():
                continue
            resume_data = json.loads(ln)
            # Same-length replacement, so the annotation offsets stay valid.
            text = resume_data['content'].replace("\n", " ")
            entities = []
            for annot in resume_data['annotation'] or []:
                # Only a single point per text annotation.
                point = annot['points'][0]
                labels = annot['label']
                # Handle both a list of labels and a single label.
                if not isinstance(labels, list):
                    labels = [labels]
                if not labels:
                    continue

                # The span is the same for every label, so compute it once.
                pnt_text = point['text']
                pnt_start = point['start'] + (len(pnt_text) - len(pnt_text.lstrip()))
                pnt_end = point['end'] - (len(pnt_text) - len(pnt_text.rstrip()))
                # +1 presumably converts an inclusive end offset to an
                # exclusive one -- confirm against the annotation tool.
                entities.extend((pnt_start, pnt_end + 1, lbl) for lbl in labels)
            training_data.append((text, {"entities": entities}))
    return training_data

def cure_entity(data: list) -> list:
    """Trim leading and trailing whitespace from every entity span.

    Args:
        data (list): ``(text, {"entities": [(start, end, label), ...]})``
            items, where ``end`` is an exclusive character offset.

    Returns:
        list: ``[text, {"entities": [[start, end, label], ...]}]`` items in
        input order. Spans are clamped to the bounds of the text, and spans
        that contain nothing but whitespace are dropped because trimming
        them leaves no characters to label.
    """
    cleaned_data = []
    for txt, annot in data:
        vld_ent = []
        for start, end, label in annot['entities']:
            # Clamp first so an offset outside the text cannot raise IndexError.
            vld_start = max(start, 0)
            vld_end = min(end, len(txt))
            # Each cursor stops at the other one, so they can never cross and
            # yield an inverted (start > end) span.
            while vld_start < vld_end and txt[vld_start].isspace():
                vld_start += 1
            while vld_end > vld_start and txt[vld_end - 1].isspace():
                vld_end -= 1
            if vld_start < vld_end:
                vld_ent.append([vld_start, vld_end, label])
        cleaned_data.append([txt, {'entities': vld_ent}])
    return cleaned_data

In [None]:
# Location of the annotated resumes file on the mounted Drive.
TRAIN_JSON_PATH = r"/content/drive/MyDrive/python_project/Entity Recognition in Resumes.json"

# Parse the annotations, then trim whitespace from every entity span.
TRAIN_DATA = cure_entity(data_preprocessing(TRAIN_JSON_PATH))
# Fail early with a clear message instead of training on an empty data set.
assert TRAIN_DATA, f"No training examples were loaded from {TRAIN_JSON_PATH}"

print(f"Training data consist of {len(TRAIN_DATA)} manually labelled resume's.")


Training data consist of 220 manually labelled resume's.


In [None]:
# Name or path of an existing spaCy pipeline to load; None means a blank
# English pipeline is created instead.
model = None
# Directory the trained pipeline is written to after training.
output_dir=Path(r"/content/drive/MyDrive/python_project/test_ner")
# Number of passes over the training data.
n_iter=100

In [None]:
# Load an existing pipeline when `model` names one; otherwise start from a
# blank English pipeline.
if model is not None:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")

# Make sure the pipeline has an NER component and keep a handle on it.
# In spaCy v3, `add_pipe` builds the component from its registered name and
# returns the instance that lives in the pipeline, so `ner` must come from
# that return value; a separately created component is not part of `nlp`.
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

Created blank 'en' model


In [None]:
from spacy.training.example import Example

# Always register labels on the component that is actually in the pipeline.
ner = nlp.get_pipe('ner')

# Register every entity label that occurs in the annotations, once per label.
entity_labels = {
    ent[2]
    for _, annotations in TRAIN_DATA
    for ent in annotations.get('entities')
}
for label in sorted(entity_labels):
    ner.add_label(label)
print("Entity labels:", sorted(entity_labels))

# Build the Example objects once instead of on every pass. Texts whose
# annotations cannot be converted are skipped, but reported rather than
# silently ignored.
train_examples = []
skipped_examples = 0
for text, annotations in TRAIN_DATA:
    try:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    except Exception as err:
        skipped_examples += 1
        print("Skipping example that could not be converted:", err)
print(f"Training on {len(train_examples)} examples ({skipped_examples} skipped).")

with nlp.select_pipes(enable=['ner']):  # only train NER
    optimizer = nlp.initialize()

    for itn in range(n_iter):
        # Shuffle the prepared examples so TRAIN_DATA keeps its original order.
        random.shuffle(train_examples)
        losses = {}
        failed_batches = 0
        for batch in spacy.util.minibatch(train_examples, size=2):
            try:
                # One update per minibatch, using the optimizer created above.
                nlp.update(batch, sgd=optimizer, losses=losses, drop=0.3)
            except Exception as err:
                # Best effort: keep training, but make the failure visible.
                failed_batches += 1
                print("Exception while updating the model:", err)
        if failed_batches:
            print(f"Iteration {itn}: {failed_batches} batch(es) failed to update.")
        print(losses)
  

{'entities': [[1296, 1622, 'Skills'], [993, 1154, 'Skills'], [939, 957, 'College Name'], [883, 905, 'College Name'], [856, 860, 'Graduation Year'], [771, 814, 'College Name'], [727, 769, 'Designation'], [407, 416, 'Companies worked at'], [372, 405, 'Designation'], [95, 145, 'Email Address'], [60, 69, 'Location'], [49, 58, 'Companies worked at'], [13, 46, 'Designation'], [0, 12, 'Name']]}
{'entities': [[1155, 1199, 'Email Address'], [743, 1141, 'Skills'], [729, 733, 'Graduation Year'], [675, 702, 'College Name'], [631, 673, 'Degree'], [625, 629, 'Graduation Year'], [614, 623, 'College Name'], [606, 612, 'Degree'], [104, 148, 'Email Address'], [62, 68, 'Location'], [0, 14, 'Name']]}
{'entities': [[3749, 3757, 'Skills'], [3709, 3718, 'Skills'], [3664, 3672, 'Skills'], [3636, 3645, 'Skills'], [3542, 3550, 'Skills'], [3527, 3530, 'Skills'], [3510, 3515, 'Skills'], [3489, 3498, 'Skills'], [3468, 3478, 'Skills'], [3421, 3458, 'College Name'], [3381, 3419, 'Degree'], [1664, 1673, 'Location'], 



{'ner': 10739.286845210352}
{'ner': 4938.460135163542}
{'ner': 4417.893338599165}
{'ner': 3333.700697833684}
{'ner': 2855.085361391034}
{'ner': 2740.407955255928}
{'ner': 2243.0908151734143}
{'ner': 2461.658354243812}
{'ner': 2305.325987665104}
{'ner': 1916.448057200595}
{'ner': 1852.990737471558}
{'ner': 1901.8429085424323}
{'ner': 1694.2369271157095}
{'ner': 1775.7599886192438}
{'ner': 1547.582061654341}
{'ner': 1667.0122209826218}
{'ner': 1562.0509857155384}
{'ner': 1503.0824888864508}
{'ner': 1487.1492547545981}
{'ner': 1476.2527950806018}
{'ner': 1335.0089621290747}
{'ner': 1409.390497160804}
{'ner': 1355.8153336189566}
{'ner': 1301.4566175963748}
{'ner': 1225.2761562523904}
{'ner': 1260.579536423559}
{'ner': 1224.3337080303775}
{'ner': 1297.865789966613}
{'ner': 1225.5479607113457}
{'ner': 1231.5549183076846}
{'ner': 1159.6994835867863}
{'ner': 1174.6022200939735}
{'ner': 1206.3658459546145}
{'ner': 1100.7925226082727}
{'ner': 1235.7227358567143}
{'ner': 1115.0182618399817}
{'ner

In [None]:
# Sanity check: run the pipeline on the first training text only.
for text, _ in TRAIN_DATA[:1]:
  print(text)
  doc = nlp(text)
  print('Entities', [(span.text, span.label_) for span in doc.ents])

Sai Vivek Venkatraman Decisive, Data driven and results-oriented professional offering 13 years of experience in Infosys Limited handling and managing Information Technology projects in Telecom domain with the last 3+ years focused on Project Management.  Chennai, Tamil Nadu - Email me on Indeed: indeed.com/r/Sai-Vivek-Venkatraman/ a009f49bfe728ad1  for excellence in project delivery (2015 - 2016)  Optimistic Project Manager with a total experience of 13 years, TECHNOLOGY LEAD / ANALYST accomplished in prioritizing and INFOSYS LIMITED, INDIA & USA delivering projects with competence. December 2008 - January 2015 Data driven decision maker, creative • Key responsibilities included: Requirements Gathering and elucidation, problem solver and a resilient Estimation, Defect Management & Warranty Support, Team / Resource negotiator with a remarkable Management, On-time Escalation, Status reporting and Client understanding of business goals and engagement operational methodologies. • Spear-he

In [None]:
# Persist the trained pipeline so it can be reloaded with spacy.load().
if output_dir is not None:
    output_dir = Path(output_dir)
    # Create missing parent folders too, and tolerate an existing directory.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to /content/drive/MyDrive/python_project/test_ner


In [None]:
# Reload the pipeline from the directory it was saved to; reusing
# `output_dir` keeps this path from drifting away from the save location.
model = spacy.load(output_dir)


In [None]:
# Try the pipeline on a short resume-like sentence.
doc2 = nlp("Harshil Bhavsar 2 years of experience as: Store Executive, bachelors in computer science")

print('Entities', [(ent.text, ent.label_) for ent in doc2.ents])

Entities [('Harshil Bhavsar', 'Name'), ('Store Executive', 'Designation')]
