In [1]:
import json
import re
import spacy

In [2]:
# Blank English pipeline; later cells call it on each resume text to get a Doc.
nlp = spacy.blank("en")

In [3]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export (one JSON object per line) to spaCy-style tuples.

    Each non-blank line must be a JSON object with a ``content`` string and an
    ``annotation`` entry that is either ``null`` or a list of annotations. Every
    annotation carries a ``label`` (a string or a list of strings) and a
    ``points`` list, of which only the first element is used.

    Args:
        dataturks_JSON_FilePath: Path to the Dataturks export file, read as UTF-8.

    Returns:
        list: One ``(text, {"entities": [(start, end, label), ...]})`` tuple per
        record. ``text`` is ``content`` with each newline replaced by a single
        space, so its length (and therefore every offset) is unchanged. ``end``
        is the annotation's ``end`` offset plus one, after leading/trailing
        whitespace of the annotated text has been trimmed off the span.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record or annotation lacks an expected key.
    """
    training_data = []
    # Explicit UTF-8: the resumes contain non-ASCII characters, and the platform
    # default encoding is not guaranteed to decode them.
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of materialising it first.
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                    # Only a single point per text annotation is considered.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # Handle both a list of labels and a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    # The trimmed span is the same for every label, so compute it once.
                    point_text = point['text']
                    point_start = point['start'] + (
                        len(point_text) - len(point_text.lstrip()))
                    point_end = point['end'] - (
                        len(point_text) - len(point_text.rstrip()))

                    for label in labels:
                        # +1 turns the annotation's end offset into an exclusive
                        # bound (presumably Dataturks offsets are inclusive).
                        entities.append((point_start, point_end + 1, label))
            training_data.append((text, {"entities": entities}))
    return training_data

In [4]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): Records of the form ``(text, {'entities': [(start, end,
            label), ...]})`` where ``end`` is an exclusive character offset.

    Returns:
        list: The cleaned data as ``[text, {'entities': [[start, end, label],
        ...]}]`` records. A span that contains only whitespace collapses to a
        zero-length span (``start == end``) instead of an inverted one.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            # Clamp so an end offset past the text cannot index out of range.
            valid_end = min(end, len(text))
            # Both trims stop when the bounds meet, so they can never cross.
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [5]:
# Convert the Dataturks export to (text, {"entities": ...}) records, then trim
# whitespace off the ends of every entity span.
data = trim_entity_spans(convert_dataturks_to_spacy("Entity_Recognition_in_Resumes.json"))
# Show the first record as a sanity check.
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [6]:
# Flatten every record to (text, [(start, end, label), ...]), dropping the
# {"entities": ...} wrapper and turning each span into a tuple.
dataSpacy = [
    (record[0], [(span[0], span[1], span[2]) for span in record[1]["entities"]])
    for record in data
]

In [8]:
# Print the first flattened (text, entities) record.
print(dataSpacy[0])

("Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [79]:
# Number of records available for training.
print(len(dataSpacy))

220


In [5]:
from spacy.tokens import DocBin

In [6]:
# Container that the following loop fills with annotated Doc objects.
db = DocBin()

In [None]:
# Build one Doc per record and count the entity spans that char_span rejects.
err_total = 0
for text, annotations in data:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is None:
            # char_span returned no span for these offsets; count it and move on.
            err_total += 1
            continue
        ents.append(span)
    doc.ents = ents
    db.add(doc)
# db.to_disk("./train.spacy")

In [76]:
# Number of entity spans for which char_span returned None in the loop above.
err_total

5

In [9]:
import spacy
from spacy.tokens import DocBin
nlp = spacy.blank("en")

# A fresh DocBin stores the example documents written to ./train.spacy.
db = DocBin()
# Documents dropped because their entities could not be attached to the Doc.
skipped_docs = 0
for text, annotations in dataSpacy:
    doc = nlp(text)
    ents = []
    try:
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # No span could be built for these offsets; drop this entity only.
                continue
            ents.append(span)
        doc.ents = ents
    except Exception as exc:
        # Still best-effort (the document is skipped), but no longer silent, and
        # KeyboardInterrupt/SystemExit are no longer swallowed as with a bare except.
        skipped_docs += 1
        print(f"Skipping document: {type(exc).__name__}: {exc}")
        continue
    db.add(doc)
print(f"Skipped {skipped_docs} of {len(dataSpacy)} documents")
db.to_disk("./train.spacy")

In [None]:
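# Run from a shell to complete the config and train the model:
# python -m spacy init fill-config base_config.cfg config.cfg
# python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0
# NOTE: --paths.dev points at the training set, so the reported dev scores are not a held-out evaluation.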

In [6]:
# Load the pipeline stored under "model-best" (presumably the best checkpoint
# written by the `spacy train` command above; confirm the working directory).
nlp_ner = spacy.load("model-best")

In [7]:
# Hex colour per label; passed to displaCy below as the "colors" option.
color_tag = {
    "Skills":"#FFFF66",
    "College Name":"#99FFFF",
    "Degree":"#FFCC66",
    "Name":"#FF9999",
    "Designation":"#0099CC",
    "Companies worked at":"#00CC99",
    "Location":"#CC6666"
}

In [8]:
doc = nlp_ner("Ananya Chavan\nlecturer - oracle tutorials\n\nMumbai, Maharashtra - Email me on Indeed: indeed.com/r/Ananya-\nChavan/738779ab71971a96\n\nSeeking a responsible job with an opportunity for professional challenges and utilize my skills\nup to its extreme.\n\nWORK EXPERIENCE\n\nlecturer\n\nOracle tutorials -  Mumbai, Maharashtra -\n\nApril 2016 to Present\n\nfor computer science (STD 11th and 12th) (2 years)\n➢ Worked at \"Dr.Babasaheb Ambedkar College, Chembur (W) \" as a lecturer for • B.Sc. (Computer\nScience & Information Technology)\n• F.Y.J.C. (Computer Science & I.T.)\n• S.Y.J.C. (Computer Science & I.T.)\n➢ Worked at \"LIVE\" as a Head of the IT Department and Lecturer for Web designing.\n➢ Worked at \"Kohinoor College Of Hotel Management\" as visiting lecturer for SEM I.\n➢ Working at \"ORACLE TUTORIALS\" as a lecturer for computer science (STD 11th and 12th)\n\nEDUCATION\n\nMCA\n\nMumbai University -  Mumbai, Maharashtra\n\nB.Sc. in Com.Sci\n\nMumbai University -  Mumbai, Maharashtra\n\nSKILLS\n\nSEARCH ENGINE MARKETING (2 years), SEM (2 years), ACCESS (Less than 1 year), AJAX (Less\nthan 1 year), APACHE (Less than 1 year)\n\nADDITIONAL INFORMATION\n\nTechnical skills:\nLanguages: C, C++, Java (J2EE),\nWeb Component APIS:: Jdbc, Servlet, JSP.\nFrameworks: Spring 4 & Struts 2\nORM Framework: Hibernate\nWeb Development: Html5, CSS3, Java Script, Ajax &JQuery, Angular Js\n\nhttps://www.indeed.com/r/Ananya-Chavan/738779ab71971a96?isid=rex-download&ikw=download-top&co=IN\nhttps://www.indeed.com/r/Ananya-Chavan/738779ab71971a96?isid=rex-download&ikw=download-top&co=IN\n\n\nApplication Servers: Apache Tomcat,\nIDE: Eclipse, Netbeans\nDatabase: Ms-Access, Mysql\nOperating Systems: Windows 7, 8, 10\nFTP Client: Filezilla\nVersioning Tools: Git\n\nProject Details:\n\n\"Real Estate Application\" (Client: Global Realtor PVT. 
LTD Pune)\nFront-End: Java (J2EE), JDBC, Servlet, JSP, Jquery.\nBack end: Mysql.\nDuration: 6 Month (Internship)\nCompany Name: AryanTech India Pvt. Ltd. Pune\nMy Role: Developer as Trainee.\nModule: Module 4.\nDescription: Developed as a MCA Final SEM Project for\n\"Global Realtors PVT.LTD, Hinjewadi, Pune.\"\nThe Real Estate Web Application is an interactive, effective and revenue-generating website\ndesigned for the Real Estate Industry. The main objective of this application is to help the Real\nEstate Company to display unlimited number of property listings on the website.\n\n\"Beauty Parlor Management System\" (B.Sc. (Com.Sci.))\nTool: VB 6.0\nLanguage: VB\nDatabase: MS-Access\nOperating System: Windows XP\nThe Beauty Parlor Management System is an easy and effective system to use. The main features\nof this system are to avoid manual work and keep storing all appointments of customers.\n\n\"Web Designing Project (Reptiles.com) \" (B.Sc. (Com.Sci.))\nLanguage: HTML and ASP\nTool: Dreamweaver 8.0\nDatabase: MS-Access\nOperating System: Windows XP\nThe Reptiles.com is a simple informative site. The main features of this system are to give all\ninformation of Snakes.")

# Use the custom per-label colours defined in color_tag.
options = {"colors": color_tag}

# Render the entities of `doc` with displaCy's "ent" style, inline in the notebook.
spacy.displacy.render(doc, style="ent", options= options, jupyter=True)