# Spacy Resume Parser

In [None]:
!pip install -U spacy
!pip install spacy_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.5.2
    Uninstalling spacy-3.5.2:
      Successfully uninstalled spacy-3.5.2
Successfully installed spacy-3.5.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy_transformers
  Downloading spacy_transformers-1.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.8/190.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<4.31.0,>=3.4.0 (from spacy_transformers)
  Downloading transformers-4.

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [None]:
spacy.__version__

'3.5.3'

In [None]:
import locale
# Replace the stdlib lookup with a constant so that any caller asking for the
# preferred encoding is told "UTF-8".
# NOTE(review): presumably a Colab workaround so later `!` shell commands do
# not fail on a non-UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!nvidia-smi

Fri Jun 16 12:21:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    31W /  70W |    413MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# 200 Resume Annotated Dataset

In [None]:
# Load the annotated resumes. The `with` block closes the file handle as soon
# as parsing finishes, and the explicit encoding keeps the read independent of
# the runtime's default locale.
with open('/content/drive/MyDrive/Spacy_Resume/Dataset/train_data.json', 'r', encoding='utf-8') as dataset_file:
  cv_data = json.load(dataset_file)

In [None]:
len(cv_data)

200

In [None]:
cv_data[0]

['Govardhana K Senior Software Engineer  Bengaluru, Karnataka, Karnataka - Email me on Indeed: indeed.com/r/Govardhana-K/ b2de315d95905b68  Total IT experience 5 Years 6 Months Cloud Lending Solutions INC 4 Month • Salesforce Developer Oracle 5 Years 2 Month • Core Java Developer Languages Core Java, Go Lang Oracle PL-SQL programming, Sales Force Developer with APEX.  Designations & Promotions  Willing to relocate: Anywhere  WORK EXPERIENCE  Senior Software Engineer  Cloud Lending Solutions -  Bangalore, Karnataka -  January 2018 to Present  Present  Senior Consultant  Oracle -  Bangalore, Karnataka -  November 2016 to December 2017  Staff Consultant  Oracle -  Bangalore, Karnataka -  January 2014 to October 2016  Associate Consultant  Oracle -  Bangalore, Karnataka -  November 2012 to December 2013  EDUCATION  B.E in Computer Science Engineering  Adithya Institute of Technology -  Tamil Nadu  September 2008 to June 2012  https://www.indeed.com/r/Govardhana-K/b2de315d95905b68?isid=rex-

In [None]:
# Expand base_config.cfg into a complete training config (config.cfg) by
# auto-filling every setting the base file does not specify.
!python -m spacy init fill-config /content/drive/MyDrive/Spacy_Resume/config/base_config.cfg /content/drive/MyDrive/Spacy_Resume/config/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/Spacy_Resume/config/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


# Convert data to spacy format

In [None]:
def get_spacy_doc(file,data):
  """Convert annotated texts into a spaCy ``DocBin``.

  Args:
    file: Writable text file object. One line is written for every
      annotation (or whole document) that has to be dropped.
    data: Iterable of ``(text, annotation)`` pairs, where
      ``annotation['entities']`` is a sequence of ``(start, end, label)``
      character offsets into ``text``.

  Returns:
    A ``DocBin`` containing one ``Doc`` per input text whose entities could
    be assigned.
  """
  # A blank English pipeline is enough: only its tokenizer is used.
  nlp = spacy.blank('en')
  db = DocBin()

  # tqdm only reports progress over the examples.
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []           # spans accepted for this document
    used_chars = set()  # character offsets covered by accepted spans

    for start, end, label in annot['entities']:
      char_range = range(start, end)

      # Drop an annotation that overlaps one that was already accepted.
      if any(idx in used_chars for idx in char_range):
        continue

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Keep the conversion best-effort, but leave a trace of the failure.
        file.write(str([start,end]) + "   " + repr(exc) + "\n")
        continue

      # None means the offsets do not line up with token boundaries.
      if span is None:
        err_data = str([start,end]) + "   " + str(text) + "\n"
        file.write(err_data)
        continue

      ents.append(span)
      # Reserve the characters only now, so a rejected annotation cannot
      # block a later valid one that overlaps it.
      used_chars.update(char_range)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # The document is skipped, but the reason is recorded.
      file.write("doc skipped   " + repr(exc) + "\n")

  return db

# Train, Test Data Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the resumes. A fixed seed keeps the split, and therefore
# every score reported on it, identical on each run.
train, test = train_test_split(cv_data, test_size=0.3, random_state=42)

In [None]:
len(train), len(test)

(140, 60)

# Spacy Conversion Code

In [None]:
# Annotations rejected while converting either split are logged to one text
# file. The `with` block guarantees the log is flushed and closed even if a
# conversion raises part-way through.
with open('/content/drive/MyDrive/Spacy_Resume/model/train.txt','w',encoding = "utf-8") as file:
  db = get_spacy_doc(file,train)
  db.to_disk('/content/drive/MyDrive/Spacy_Resume/model/train_data.spacy')

  # `db` is rebound to the test split here.
  db = get_spacy_doc(file,test)
  db.to_disk('/content/drive/MyDrive/Spacy_Resume/model/test_data.spacy')

100%|██████████| 140/140 [00:02<00:00, 47.96it/s]
100%|██████████| 60/60 [00:00<00:00, 78.02it/s]


In [None]:
# Number of documents currently held in `db`.
len(db)

60

# Training and Validation

In [None]:
# Train the pipeline described by config.cfg on GPU 0 and save the result under
# model/output. Note that the held-out split (test_data.spacy) is passed as the
# dev set via --paths.dev.
!python -m spacy train /content/drive/MyDrive/Spacy_Resume/config/config.cfg --output /content/drive/MyDrive/Spacy_Resume/model/output --paths.train /content/drive/MyDrive/Spacy_Resume/model/train_data.spacy --paths.dev /content/drive/MyDrive/Spacy_Resume/model/test_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/Spacy_Resume/model/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-06-16 12:50:05,495] [INFO] Set up nlp object from config
[2023-06-16 12:50:05,510] [INFO] Pipeline: ['transformer', 'ner']
[2023-06-16 12:50:05,513] [INFO] Created vocabulary
[2023-06-16 12:50:05,513] [INFO] Finished initializing nlp object
Downloading (…)lve/main/config.json: 100% 481/481 [00:00<00:00, 2.84MB/s]
Downloading (…)olve/main/vocab.json: 100% 899k/899k [00:00<00:00, 2.08MB/s]
Downloading (…)olve/main/merges.txt: 100% 456k/456k [00:00<00:00, 714kB/s]
Downloading (…)/main/tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 6.26MB/s]
Downloading model.safetensors: 100% 499M/499M [00:01<00:00, 261MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initiali