### Mounting Google drive


In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%cd '/content/drive/My Drive/five_class_entitydata'

### Imports


In [0]:
import os
import spacy 
import re
import json
import random
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score

### Converting dataturks to spacy format


In [0]:
#converting dataturks annotated data to spacy format to be 
#used as training data

def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

### Cleaning data


In [0]:
############################Removes leading and trailing white spaces from entity spans.############################
# https://github.com/explosion/spaCy/issues/3558
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

### Training the model


In [0]:
################### Train Spacy NER.###########
def train_spacy():
    TRAIN_DATA = convert_dataturks_to_spacy("/content/drive/My Drive/five_class,\
    _entitydata/traindata_3withmyannotation.json")
    TRAIN_DATA=trim_entity_spans(TRAIN_DATA)
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    # if 'tagger' not in nlp.pipe_names:
    #      nlp.add_pipe(nlp.create_pipe('tagger'))
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

       
       
    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(25):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.1,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [0]:
nlp_=train_spacy()

Statring iteration 0
{'ner': 16882.106997960484}
Statring iteration 1
{'ner': 10908.776434217287}
Statring iteration 2
{'ner': 6077.587373414839}
Statring iteration 3
{'ner': 8363.131303432065}
Statring iteration 4
{'ner': 4631.257164553054}
Statring iteration 5
{'ner': 5223.568889697201}
Statring iteration 6
{'ner': 3904.1670692492044}
Statring iteration 7
{'ner': 4233.118298977122}
Statring iteration 8
{'ner': 3326.9928887288224}
Statring iteration 9
{'ner': 4829.5860955535945}
Statring iteration 10
{'ner': 3626.4763339962747}
Statring iteration 11
{'ner': 3477.1512522536004}
Statring iteration 12
{'ner': 2976.898051461118}
Statring iteration 13
{'ner': 3416.6796133147454}
Statring iteration 14
{'ner': 4535.087774518935}
Statring iteration 15
{'ner': 2616.040203681598}
Statring iteration 16
{'ner': 2176.3353705488266}
Statring iteration 17
{'ner': 2250.3050472226787}
Statring iteration 18
{'ner': 2827.64227697353}
Statring iteration 19
{'ner': 2004.3714014867123}
Statring iteration 2

### Saving the Trained model


In [0]:
# save model to output directory (with parcial cleaned data)
def save_model(output_dir):
      nlp_.to_disk(output_dir)
      print("Saved model to", output_dir)


In [0]:
output_dir='./model2'
save_model(output_dir)

Saved model to ./model2


### Loading the trained model instance


In [0]:
 ###################loading the saved model################################
 output_dir='./model2'
 nlp2 = spacy.load(output_dir)

### Testing


In [0]:
##############################preparing the testdata########################
examples = convert_dataturks_to_spacy("3class_test_data.json")
examples=trim_entity_spans(examples)
tp = 0
tr = 0
tf = 0

ta = 0
c = 0


In [0]:
#################testing the model######################
nlp_=nlp2
for text, annot in examples:

    f = open("resume"+str(c)+".txt", "w")
    doc_to_test = nlp_(text)
    d = {}
    for ent in doc_to_test.ents:
        d[ent.label_] = []
    for ent in doc_to_test.ents:
        d[ent.label_].append(ent.text)
        
    if 'Skills' in d:
      skills_=d['Skills']    
      print(f'resume {str(c)} skills {skills_}')
    # print(d.keys())

    #---------------------------      
    for i in set(d.keys()):

        f.write("\n\n")
        f.write(i + ":"+"\n")
        for j in set(d[i]):
            f.write(j.replace('\n', '')+"\n")
    #-----------------------------
    d = {}
    for ent in doc_to_test.ents:
        d[ent.label_] = [0, 0, 0, 0, 0, 0]
    for ent in doc_to_test.ents:
        doc_gold_text = nlp_.make_doc(text)
        gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
        y_true = [ent.label_ if ent.label_ in x else 'Not ' +
                  ent.label_ for x in gold.ner]
        y_pred = [x.ent_type_ if x.ent_type_ ==
                  ent.label_ else 'Not '+ent.label_ for x in doc_to_test]
        if(d[ent.label_][0] == 0):
            # f.write("For Entity "+ent.label_+"\n")
            # f.write(classification_report(y_true, y_pred)+"\n")
            (p, r, f, s) = precision_recall_fscore_support(
                y_true, y_pred, average='weighted')
            a = accuracy_score(y_true, y_pred)
            d[ent.label_][0] = 1
            d[ent.label_][1] += p
            d[ent.label_][2] += r
            d[ent.label_][3] += f
            d[ent.label_][4] += a
            d[ent.label_][5] += 1
    c += 1

resume 0 skills ['Angular', 'IBM Personality Insights', 'IBM Watson']
resume 1 skills ['Machine Learning']
resume 2 skills ['Jquery', 'Python', 'Python', 'Perl', 'Hadoop', 'Django', 'Jquery', 'Apache', 'Javascript', 'XML', 'CSS', 'HTML', 'Python', 'Python', 'Python', 'Python']


  _warn_prf(average, modifier, msg_start, len(result))


resume 3 skills ['Text mining']
resume 4 skills ['machine learning (ML)', 'Python', 'Python', 'R', 'R', 'HDFS', 'Map Reduce', 'NLP -Text Mining']
resume 5 skills ['Business/Data/Predictive/TextAnalytics', 'System Analyst', 'Devops', 'SQL', 'Machine Learning', 'SQL', 'VB', 'Python', 'DB2', 'IBM Power Systems', 'R', 'Watson Analytics', 'SPSS', 'Python', 'Java']
resume 6 skills ['R', 'Python', 'ML', 'R', 'Python', 'Machine Learning', 'Python', 'R', 'Kafka', 'SQL', 'SQL', 'SQL', 'Python', 'Python', 'MySQL', 'Python', 'MySQL', 'JAVA SCRIPT', 'AJAX', 'XML', 'JAVA SCRIPT', 'AJAX']
resume 7 skills ['Machine learning', 'Java/J2EE', 'Machine Learning', 'Python', 'PostgreSQL', 'MySQL']
resume 8 skills ['Machine Learning Modelling', 'Python', 'MBA', 'Python', 'Python']
resume 9 skills ['R', 'Image Processing', 'VC++', 'C++', 'Python', 'C++', 'Python', 'SW', 'C++', 'C', 'C/C++', 'Machine Learning', 'Machine Learning', 'Python']
resume 10 skills ['Python', 'Javascript', 'AngularJS', 'HTML', 'CSS', '

### Validating the pridiction


In [0]:
###########################validating the model##########################
for i in d:
    print("\n For Entity "+i+"\n")
    print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
    print("Precision : "+str(d[i][1]/d[i][5]))
    print("Recall : "+str(d[i][2]/d[i][5]))
    print("F-score : "+str(d[i][3]/d[i][5]))


 For Entity Name

Accuracy : 99.88532110091744%
Precision : 0.9988545291574397
Recall : 0.9988532110091743
F-score : 0.9987388618366561

 For Entity Location

Accuracy : 99.77064220183486%
Precision : 0.9977064220183486
Recall : 0.9977064220183486
F-score : 0.9977064220183486

 For Entity Skills

Accuracy : 99.08256880733946%
Precision : 0.9895468905067463
Recall : 0.9908256880733946
F-score : 0.9898169508022059

 For Entity Education

Accuracy : 100.0%
Precision : 1.0
Recall : 1.0
F-score : 1.0


### matcher


In [0]:
import pandas as pd
from pathlib import Path

nlp_=nlp2

def find_skills(text):
  d = {}
  docx=nlp_(text)
  for ent in docx.ents:
    d[ent.label_] = []
  for ent in docx.ents:
    d[ent.label_].append(ent.text)
  if 'Skills' in d:
    skills_=d['Skills']    
    return skills_
  else:
    return None

### Creating job list


In [0]:
# create jobs list
jobs=[]
job_dir='/content/drive/My Drive/five_class_entitydata/jobs'
pathlist = Path(job_dir).glob('**/*.txt')
for path in pathlist:
    with open (path, "r") as fileHandler:
      job={
          'name':path.name,
           'skills':find_skills(''.join(fileHandler.readlines()))
      }
      jobs.append(job)



In [0]:
print(jobs[1]['name'])
print(jobs[1]['skills'])
print(jobs[2]['name'])
print(jobs[2]['skills'])
print(jobs[3]['name'])
print(jobs[3]['skills'])
print(jobs[4]['name'])
print(jobs[4]['skills'])

dataengineer.txt
['J2EE', 'Oracle Fusion', 'Oracle Cloud', 'Salesforce', 'Devops Android', 'Business Analyst', 'UI Developer', 'DBAs', 'Embedded Systems', '.NET', 'Hadoop', 'SQL Developer', 'Big Data', 'Tableau', 'Networking', 'Etl', 'Informatica', 'Ios', 'Quality Analyst', 'Project Manager', 'Python']
datascientist.txt
['Data Science', 'Python', 'Machine Learning', 'SAS', 'Java', 'Scala', 'Hadoop', 'Hive', 'Bigdata', 'Programming', 'SQL server reporting', 'Msbi', 'Ssrs', 'Msbi', 'Sql', 'Artificial Intelligence', 'Pandas', 'Pyspark', 'Sklearn', 'Flask', 'Django', 'Map Reduce', 'Parametric Design', 'Modeling', 'Regression', 'Patterns', 'Data Mining', 'Text Mining', 'Oops', 'Deep Learning', 'Web Analytics', 'Time Series', 'Regression', 'Tensorflow', 'Azure', 'Linear Regression', 'Logistic Regression', 'Decision Tree', 'Random Forest', 'Data Structure', 'Computer Vision']
javadeveloper.txt
['SQL Server', 'IBM HTTP', 'IBM WebSphere', 'IHS', 'WAS', 'Java EE', 'SQL Server', '.NET core', 'C#'

### Creating cv list


In [0]:
# create cvs list
cvs=[]
cv_dir='/content/drive/My Drive/five_class_entitydata/cv'
pathlist = Path(cv_dir).glob('**/*.txt')
for path in pathlist:
    with open (path, "r") as files:
      cv={
          'name':path.name,
           'skills':find_skills(''.join(files.readlines()))
      }
      cvs.append(cv)

In [0]:
print(cvs[1]['name'])
print(cvs[1]['skills'])
print(cvs[2]['name'])
print(cvs[2]['skills'])
print(cvs[3]['name'])
print(cvs[3]['skills'])
print(cvs[4]['name'])
print(cvs[4]['skills'])
print(cvs[5]['name'])
print(cvs[5]['skills'])

r4.txt
['MySQL', 'PostgreSQL', 'Microsoft Access', 'SQL Server', 'FileMaker', 'Oracle']
r3.txt
['MySQL', 'PostgreSQL', 'Microsoft Access', 'SQL Server', 'FileMaker', 'Oracle', 'RDBMS', 'dBASE']
r2.txt
['MySQL', 'PostgreSQL', 'Microsoft Access', 'SQL Server', 'FileMaker', 'Oracle', 'RDBMS', 'dBASE', 'Clipper', 'FoxPro']
r8.txt
['J2EE', 'Oracle Fusion', 'Oracle Cloud', 'Salesforce', 'Devops Android', 'Business Analyst', 'UI Developer', 'DBAs', 'Embedded Systems', '.NET', 'Hadoop', 'SQL Developer', 'Big Data', 'Tableau', 'Networking']
r15.txt
['Java', 'Scala', 'Hadoop', 'Hive', 'Bigdata', 'Programming', 'SQL server reporting', 'Artificial Intelligence', 'Pandas', 'Pyspark', 'Sklearn', 'Flask', 'Django', 'Map Reduce', 'Parametric Design', 'Modeling', 'Regression', 'Patterns', 'Data Mining', 'Text Mining', 'Oops', 'Deep Learning', 'Web Analytics', 'Time Series', 'Regression', 'Tensorflow', 'Azure', 'Linear Regression']


### Matching both list cv and jobs


In [0]:
def job_match(text,cv=True):
  skills=find_skills(text)
  matched=[]
  if cv:
    for job in jobs:
      nskill_job=len(job['skills'])
      count=0
      for skill in skills:
        if skill in job['skills']:
          count+=1
      matched.append({
          'name':job['name'],
          'pct':count/nskill_job*100,
          'job_skill':job['skills'],
          'cv_skill':skills

      })
  else:
    for cv in cvs:
      nskill_cv=len(cv['skills'])
      count=0
      for skill in skills:
        if skill in cv['skills']:
          count+=1
      matched.append({
          'name':cv['name'],
          'pct':count/nskill_cv*100,
          'job_skill':cv['skills'],
          'cv_skill':skills

      })
  return matched
      
      

### Finding Most Matching Job


In [0]:
# find most matching jobs
#######################reading the file from folder######################
f = open('/content/drive/My Drive/five_class_entitydata/cv/r1.txt', 'r')
text = f.read()
match_jobs=job_match(text)
match_jobs = sorted(match_jobs, key=lambda k: k['pct'],reverse=True) 

In [0]:
for i in range(3):
  print(f"cv matching with {match_jobs[i]['name']}")
  print(f"{match_jobs[i]['pct']}")

cv matching with backenddeveloper.txt
100.0
cv matching with javadeveloper.txt
10.526315789473683
cv matching with dataengineer.txt
0.0


### Finding Most Matching Resumes


In [0]:
# find most matching cv
#######################reading the file from folder######################
f = open('/content/drive/My Drive/five_class_entitydata/jobs/dataengineer.txt', 'r')
text = f.read()
match_cvs=job_match(text,cv=False)
match_cvs = sorted(match_cvs, key=lambda k: k['pct'],reverse=True) 

In [0]:
for i in range(10):
  print(f"job matching with cv {match_cvs[i]['name']}")
  print(f"{match_cvs[i]['pct']}")

job matching with cv r8.txt
100.0
job matching with cv r9.txt
100.0
job matching with cv r7.txt
100.0
job matching with cv r6.txt
100.0
job matching with cv r10.txt
100.0
job matching with cv r13.txt
5.555555555555555
job matching with cv r12.txt
5.128205128205128
job matching with cv r11.txt
4.878048780487805
job matching with cv r15.txt
3.571428571428571
job matching with cv r14.txt
3.125


### Cleanups


In [0]:
##################################### delete produced resume files
i=10
while i < 30:
  print ("resume"+str(i)+".txt")
  if os.path.isfile("resume"+str(i)+".txt"):
    print ("found")
    path = "resume"+str(i)+".txt" 
    os.remove(path)
    print ("deleted")
    print ("..........")
  else:
    print ("not found")
  i+=1

In [0]:
###################deleting the saved model#################################
#  !rm -rf model2


### xxxxx


In [0]:
###################loading the saved model################################
 output_dir='./model2'
 nlp2 = spacy.load(output_dir)

In [0]:
#######################reading the file from folder######################
f = open('/content/drive/My Drive/five_class_entitydata/feed1.txt', 'r')
text = f.read()
# text="im competent in java,c# and python"
# text=cleandata(text)

In [0]:
docx=nlp2(text)
d = {}
for ent in docx.ents:
  d[ent.label_] = []
for ent in docx.ents:
  d[ent.label_].append(ent.text)
if 'Skills' in d:
  skills_=d['Skills']    
  print(f'Dedected skills {skills_}')

In [0]:
#########################viewving the results####################
from spacy import displacy
displacy.render(nlp_, style='ent',jupyter=True)