In [1]:
import spacy
import pandas as pd
from spacy import displacy
from spacy.matcher import Matcher
from common import create_patterns
import time

#### LOAD SPACY

In [2]:
nlp = spacy.load("en_core_web_sm")

### use predefined patterns form common file to make matches

In [3]:

matcher = Matcher(nlp.vocab, validate=True)
matcher.add("PROG_LANG", None, *create_patterns())

In [4]:
def parse_train_data(doc):
    detections = [(doc[start:end].start_char, doc[start:end].end_char, 'PROGLANG') for idx, start, end in matcher(doc)]
#Unit test to see actual prog language use below line
    #detections = [(doc[start:end], 'PROGLANG') for idx, start, end in matcher(doc)]
    return (doc.text, {'entities': detections})
unit_test_data='i like python,TENSOR, keras, c++ ,sql,xgboost,python3,golang  C java and golang'
parse_train_data(nlp(unit_test_data))

('i like python,TENSOR, keras, c++ ,sql,xgboost,python3,golang  C java and golang',
 {'entities': [(7, 13, 'PROGLANG'),
   (14, 20, 'PROGLANG'),
   (22, 27, 'PROGLANG'),
   (29, 32, 'PROGLANG'),
   (34, 37, 'PROGLANG'),
   (38, 45, 'PROGLANG'),
   (46, 60, 'PROGLANG'),
   (62, 63, 'PROGLANG'),
   (64, 68, 'PROGLANG'),
   (73, 79, 'PROGLANG')]})

### Load Training Data

In [5]:
from io import StringIO
import docx2txt
from collections import Counter
import os
from os import listdir
from os.path import isfile, join
import PyPDF2
from read_resume import create_profile

In [6]:
#Read resumes from the folder one by one
mypath='C:\\Users\\Resumes' #enter your path here where you saved the resumes
onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
#print(onlyfiles)

In [7]:
start=time.time()
i = 0 
TRAIN_DATA=[]
while i < len(onlyfiles):
  file = onlyfiles[i]
  data = create_profile(file,os)
  #print("testing check 1:",data)
  sample_data = [parse_train_data(d) for d in nlp.pipe(data.split('\n')) if len(matcher(d)) >= 1]
  TRAIN_DATA.append(sample_data)
  #print("testing check :",TRAIN_DATA)
  i +=1
print("Total Train Time is :",time.time()-start)

Total Train Time is : 1.3148770332336426


### Convert list of lists form each resume inot one list of tupples that can be fed to the training model

In [14]:
flat_train = [item for sublist in TRAIN_DATA for item in sublist]

In [10]:
# print(type(flat_train))
# print(len(flat_train))
# print(flat_train)

### Training 
#### Create a blank nlp and add ner to it .

In [11]:
def create_blank_nlp(train_data):
    nlp = spacy.blank("en")
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
    ner = nlp.get_pipe("ner")
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            #print('entity hkkkk: is',ent[2])
            ner.add_label(ent[2])
    return nlp

### Below code is to train the model 1 by one record at a time

In [33]:
import random 
import datetime as dt
import time

start=time.time()
nlp = create_blank_nlp(flat_train)
optimizer = nlp.begin_training()  
for i in range(20):
    random.shuffle(flat_train)
    losses = {}
    for text, annotations in flat_train:
        nlp.update([text], [annotations], sgd=optimizer, losses=losses)
    print(f"Losses at iteration {i} - {dt.datetime.now()}", losses)
print("Total Train Time is :",time.time()-start)

Losses at iteration 0 - 2020-08-02 05:40:10.902739 {'ner': 158.4156836767297}
Losses at iteration 1 - 2020-08-02 05:40:12.872052 {'ner': 19.8173592702547}
Losses at iteration 2 - 2020-08-02 05:40:15.112547 {'ner': 0.018498310957170992}
Losses at iteration 3 - 2020-08-02 05:40:17.089867 {'ner': 3.088156405792692e-05}
Losses at iteration 4 - 2020-08-02 05:40:19.106210 {'ner': 7.338245054324859e-06}
Losses at iteration 5 - 2020-08-02 05:40:21.041501 {'ner': 9.9526526231964e-08}
Losses at iteration 6 - 2020-08-02 05:40:23.035833 {'ner': 6.144349266683931e-08}
Losses at iteration 7 - 2020-08-02 05:40:25.057181 {'ner': 4.616496274321829e-08}
Losses at iteration 8 - 2020-08-02 05:40:27.108548 {'ner': 3.54907001641761e-08}
Losses at iteration 9 - 2020-08-02 05:40:29.128897 {'ner': 2.719819314363398e-08}
Losses at iteration 10 - 2020-08-02 05:40:31.382400 {'ner': 2.2264433337398623e-08}
Losses at iteration 11 - 2020-08-02 05:40:33.384735 {'ner': 1.7047931273029152e-08}
Losses at iteration 12 - 

### Imporvements
#### Use Minibatch  and compounding instead of training one by one record 

In [12]:
from spacy.util import minibatch, compounding
import random 
import datetime as dt

In [15]:
start=time.time()
#nlp = create_blank_nlp(TRAIN_DATA)
nlp = create_blank_nlp(flat_train)
optimizer = nlp.begin_training()
for i in range(20):
    losses = {}
    #batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    batches = minibatch(flat_train, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        #print('_ - - txts- - - - - - ',texts)
        #print('_ - - - - - - - - ',annotations)
        nlp.update(
            texts,  # batch of texts
            annotations,  # batch of annotations
            drop=0.1,  # dropout - make it harder to memorise data
            sgd=optimizer,
            losses=losses,
        )
    print(f"Losses at iteration {i} - {dt.datetime.now()} {losses}")
print("Total Train Time is :",time.time()-start)



Losses at iteration 0 - 2020-08-02 05:46:30.203927 {'ner': 435.5637432448566}
Losses at iteration 1 - 2020-08-02 05:46:30.901393 {'ner': 106.2326471125416}
Losses at iteration 2 - 2020-08-02 05:46:31.567837 {'ner': 60.29971016712079}
Losses at iteration 3 - 2020-08-02 05:46:32.228275 {'ner': 24.72774334828989}
Losses at iteration 4 - 2020-08-02 05:46:32.890717 {'ner': 6.168413556507593}
Losses at iteration 5 - 2020-08-02 05:46:33.552161 {'ner': 1.6118480188478725}
Losses at iteration 6 - 2020-08-02 05:46:34.219605 {'ner': 5.170136307486152e-05}
Losses at iteration 7 - 2020-08-02 05:46:34.875043 {'ner': 3.099047609772414e-06}
Losses at iteration 8 - 2020-08-02 05:46:35.536482 {'ner': 1.6470388001993653e-06}
Losses at iteration 9 - 2020-08-02 05:46:36.185917 {'ner': 2.4822017472202962e-08}
Losses at iteration 10 - 2020-08-02 05:46:36.853361 {'ner': 5.1150892788237106e-06}
Losses at iteration 11 - 2020-08-02 05:46:37.599861 {'ner': 2.0419517963845314e-09}
Losses at iteration 12 - 2020-08-

#### Save Model to Disk

In [16]:
nlp.to_disk("C:\\Users\\customnlp")

### Load the trained Model , perform predicitons and visualize using displacy

In [17]:
cnlp = spacy.load("C:\\Users\\customnlp")

In [67]:

#newdata='i like python,TENSOR, keras, c++ , sql, xgboost, python3, golang  C java and golang'
newdata1='Involved in requirement gathering, analysis, design, estimation and testing of the assigned tasks in openstack.\
Implemented rally openstack C# benchmarking tool on the entire cloud environment.\
Experience in reviewing python code for running the troubleshooting test-cases and bug issues.\
Configuring and managing openstack components such as Keystone, sql, Nova, Neutron, Glance, Swift, xgboost, Heat, keras.\
Written Nova, Java, Neutron, Cinder, Keystone, Hashboard, Swift, Python client api to integrate with existing application.\
Create a strategic architectural design of C platform with networking (VLANs, Firewalls, Load Balancers), Hypervisors (KVM and VMware), workflow and orchestration (Openstack APIS, Smart Cloud Orchestrator), Security (Keystone, lDAP), Inventory and monitoring, licensing, backup/restore.\
Understanding Python files in openstack environment and make necessary changes if needed.\
Involve in the development of the application using Python 3. 3, HTML5, CSS3, AJAX, JSon and JQuery.'
doc = cnlp(newdata1)
#colors={'PROGLANG':'lightblue'}
colors={'PROGLANG':'radial-gradient(yellow,red)'} #yellow inside to outside red
colors={'PROGLANG':'linear-gradient(45deg,orange,red)'}#
options={'ents':['PROGLANG'],'colors':colors}
displacy.render(doc,style='ent',options=options)
#displacy.serve(doc,style='ent',options=options) #open in tab 127.0.0.1:5000



In [30]:
print(type(doc.ents))
skillcount=list(doc.ents)
print("Total skill count",len(skillcount))
print(skillcount)
uniqueskills=set(skillcount)
print("Unique skill count",len(uniqueskills))
print(uniqueskills)

<class 'tuple'>
Total skill count 12
[python, sql, xgboost, keras, Written, Java, Cinder, Python, Python, Python, JSon, JQuery]
Unique skill count 12
{Written, Python, python, sql, Java, Cinder, xgboost, JQuery, Python, Python, JSon, keras}


In [42]:
newdata2='Job Description :\
- Gather requirements, validate architecture, and create and review high-level and low-level design\
- Provide technology consultation and technical solutions for projects/products\
- Create and review architectural decisions of projects/products\
- Ensure compliance of non-functional attributes (stability, security, availability, performance, etc.) of the product\
to internal standards\
- Guide/provide technical training and influence business/technical decisions\
- Own and execute projects independently from an architectural standpoint What you\'ll need :\
- A Bachelor- sMaster\'s degree in Computer Science\
- 6+ years of relevant experience in the technology domain, having worked in Java, Python or similar object-oriented\
language(s)\
- Experience in end-to-end execution and delivery of enterprise grade software\
- Knowledge of design principles, fundamentals of architecture, quality processes and estimation techniques\
- Experience with relational and/or non-relations databases\
- Working proficiency and communication skills in verbal and written English\
- Deep problem-solving abilities and analytical skills to resolve ambiguous requirements\
- Attention to detail and quality, and the ability to work well in and across teams\
- Experience in implementing user interfaces using web-technologies and frameworks, and exposure to delivering highly available,\
scalable large-scale products on cloud platform'

In [43]:
doc1=cnlp(newdata2)
doc1.ents

(products-, Java, Python)

In [49]:
displacy.render(doc1,style='ent')