In [1]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
!pip install -U spacy
!pip install spacy_transformers
!python -m spacy download en_core_web_trf

  _torch_pytree._register_pytree_node(
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-trf
  Attempting uninstall: en-core-web-trf
    Found existing installation: en-core-web-trf 3.7.3
    Uninstalling en-core-web-trf-3.7.3:
      Successfully uninstalled en-core-web-trf-3.7.3
Successfully installed en-core-web-trf-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [30]:
import spacy

import re
import logging
import json

from spacy.training import Example
from spacy.training.example import Example

from spacy.tokens import DocBin
from spacy.tokens import Doc
from tqdm import tqdm

# Preprocessing

In [None]:
# Convert a Dataturks JSON-lines export into spaCy-style training tuples.
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Read a Dataturks export and return spaCy-style training records.

    The file is expected to hold one JSON object per line, each with a
    ``content`` string and an optional ``annotation`` list. Blank lines are
    skipped, and a missing or null ``annotation`` yields an empty entity list.

    Args:
        dataturks_JSON_FilePath: Path of the JSON-lines file to read (UTF-8).

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples
        with end-exclusive character offsets, or ``None`` when the file cannot
        be read or parsed (the error is logged with its traceback).
    """
    try:
        training_data = []
        with open(dataturks_JSON_FilePath, 'r', encoding="utf8") as f:
            # Stream the file instead of materialising (and copying) all lines.
            for line in f:
                # A blank line (e.g. a trailing newline) is not a record and
                # must not abort the whole conversion.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                for annotation in data.get('annotation') or []:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # dataturks indices are both inclusive [start, end]
                        # but spacy is not [start, end)
                        entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))
        return training_data
    except Exception:
        # Lazy %-formatting also works when the path is not a plain str.
        logging.exception("Unable to process %s", dataturks_JSON_FilePath)
        return None

In [None]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Offsets are first clamped to the bounds of the text, then whitespace is
    trimmed from both ends. Trimming never moves one boundary past the other,
    so a span is either returned with ``start < end`` or, if it contained
    nothing but whitespace, dropped.

    Args:
        data (list): Records of the form ``(text, {'entities': [(start, end,
            label), ...]})`` with end-exclusive character offsets.

    Returns:
        list: New records ``[text, {'entities': [[start, end, label], ...]}]``
        holding the trimmed, non-empty spans. The input is not modified.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            # Clamp first so a bad offset cannot index outside the text.
            valid_start = max(start, 0)
            valid_end = min(end, len(text))
            # Both loops are bounded by the opposite edge, which prevents an
            # all-whitespace span from turning into an inverted one.
            while valid_start < valid_end and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            # A span with no non-whitespace characters carries no entity.
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [None]:
def overlapping_entities(data: list) -> list:
    """Drop entity spans that overlap an earlier-starting span.

    For every record the entity list is sorted in place by start offset and
    then scanned left to right: a span is kept only if it starts at or after
    the end of the last kept span. The record's ``"entities"`` entry is
    replaced with the kept spans (as ``[start, end, label]`` lists).

    Args:
        data (list): Records of the form ``(text, annotations)`` where
            ``annotations`` is a dict that may hold an ``"entities"`` list.

    Returns:
        list: The same ``data`` object, with its annotation dicts updated.
    """
    for _text, annotations in data:
        spans = annotations.get("entities", [])
        # Stable sort: spans sharing a start keep their original order.
        spans.sort(key=lambda span: span[0])

        kept = []
        cursor = 0  # end offset of the most recently kept span
        for begin, finish, tag in spans:
            if begin < cursor:
                # Starts inside the previously kept span -> overlapping.
                continue
            kept.append([begin, finish, tag])
            cursor = finish

        annotations["entities"] = kept

    return data

In [None]:
def preprocessing(data_file_path):
    """Load a Dataturks export and clean its entity annotations.

    Runs the three preprocessing stages in order: convert the export to
    spaCy-style records, trim whitespace from the entity spans, then remove
    overlapping spans.

    Args:
        data_file_path: Path of the Dataturks JSON-lines export.

    Returns:
        list: The cleaned training records.

    Raises:
        RuntimeError: If the export could not be read or parsed.
    """
    data_train = convert_dataturks_to_spacy(data_file_path)
    if data_train is None:
        # The converter logs the underlying error and returns None; stop here
        # with a clear message rather than a TypeError in the next stage.
        raise RuntimeError(f"Could not load training data from {data_file_path!r}")
    data_train = trim_entity_spans(data_train)
    data_train = overlapping_entities(data_train)
    return data_train

TRAIN_DATA = preprocessing("/content/drive/MyDrive/4.version/trainning/traindata.json")

In [None]:
# Sanity check: number of preprocessed training records.
len(TRAIN_DATA)

701

In [None]:
# Inspect the first preprocessed record (the text followed by its annotations).
TRAIN_DATA[0]

['Afreen Jamadar\nActive member of IIIT Committee in Third year\n\nSangli, Maharashtra - Email me on Indeed: indeed.com/r/Afreen-Jamadar/8baf379b705e37c6\n\nI wish to use my knowledge, skills and conceptual understanding to create excellent team\nenvironments and work consistently achieving organization objectives believes in taking initiative\nand work to excellence in my work.\n\nWORK EXPERIENCE\n\nActive member of IIIT Committee in Third year\n\nCisco Networking -  Kanpur, Uttar Pradesh\n\norganized by Techkriti IIT Kanpur and Azure Skynet.\nPERSONALLITY TRAITS:\n• Quick learning ability\n• hard working\n\nEDUCATION\n\nPG-DAC\n\nCDAC ACTS\n\n2017\n\nBachelor of Engg in Information Technology\n\nShivaji University Kolhapur -  Kolhapur, Maharashtra\n\n2016\n\nSKILLS\n\nDatabase (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT\nACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)\n\nADDITIONAL INFORMATION\n\nTECHNICAL SKILLS:\n\n• Programming

# Training

In [None]:
# Expand base_config.cfg into a complete training config (config.cfg) by
# auto-filling every setting that the base config leaves out.
!python -m spacy init fill-config /content/drive/MyDrive/4.version/trainning/cfg/base_config.cfg /content/drive/MyDrive/4.version/trainning/cfg/config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/content/drive/MyDrive/4.version/trainning/cfg/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
def get_spacy_doc(file, data):
    """Build a spaCy ``DocBin`` from ``(text, {'entities': [...]})`` records.

    Each text is tokenised with a blank English pipeline and every
    ``(start, end, label)`` character span is converted to a token span using
    strict alignment. Spans that overlap an earlier span of the same text are
    skipped; spans that do not align with token boundaries are written to
    ``file`` instead of being added to the document.

    Args:
        file: Writable text file-like object that receives one line,
            ``"[start, end]    <text>"``, per span that could not be aligned.
        data: Iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` is a list of ``(start, end, label)``.

    Returns:
        DocBin: One ``Doc`` per record whose entities could be assigned.
        Records for which assigning the entities fails are skipped with a
        logged warning.
    """
    nlp = spacy.blank("en")
    db = DocBin()

    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)

        ents = []
        # Character positions already claimed by an earlier entity. A set keeps
        # the overlap check proportional to the span length rather than to the
        # total number of characters claimed so far.
        claimed = set()

        for start, end, label in annot['entities']:
            positions = range(start, end)
            if not claimed.isdisjoint(positions):
                continue

            # Claim the positions before aligning, so that a span which fails
            # to align still blocks later overlapping spans (as before).
            claimed.update(positions)
            try:
                span = doc.char_span(start, end, label=label, alignment_mode='strict')
            except Exception:
                # Best effort: skip the span, but leave a trace of the failure.
                logging.warning("Skipping span [%s, %s] (%s): char_span failed",
                                start, end, label, exc_info=True)
                continue

            if span is None:
                err_data = str([start, end]) + "    " + str(text) + '\n'
                file.write(err_data)
            else:
                ents.append(span)

        try:
            doc.ents = ents
            db.add(doc)
        except Exception:
            # Best effort: drop the document, but do not hide the reason.
            logging.warning("Skipping document starting with %r: could not set entities",
                            text[:50], exc_info=True)

    return db

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the records as the dev set; the fixed random_state keeps the
# split identical across runs.
train, dev = train_test_split(TRAIN_DATA, test_size = 0.2, random_state = 42)

In [None]:
# Serialise the train and dev splits to .spacy files. Spans that cannot be
# aligned to token boundaries are listed in train_file.txt. The context manager
# closes that file even if conversion or saving raises.
with open('/content/drive/MyDrive/4.version/trainning/_model/train_file.txt', 'w', encoding='utf-8') as file:
    db = get_spacy_doc(file, train)
    db.to_disk('/content/drive/MyDrive/4.version/trainning/_model/train.spacy')

    db = get_spacy_doc(file, dev)
    db.to_disk('/content/drive/MyDrive/4.version/trainning/_model/dev.spacy')

100%|██████████| 560/560 [00:06<00:00, 80.14it/s]
100%|██████████| 141/141 [00:02<00:00, 54.29it/s]


In [None]:
# Train the pipeline described by config.cfg on GPU 0, reading the train/dev
# DocBins written above and saving the results under _model/output.
!python -m spacy train /content/drive/MyDrive/4.version/trainning/cfg/config.cfg --output /content/drive/MyDrive/4.version/trainning/_model/output --paths.train /content/drive/MyDrive/4.version/trainning/_model/train.spacy --paths.dev /content/drive/MyDrive/4.version/trainning/_model/dev.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/4.version/trainning/_model/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.amp.autocast(self._mixed_precision):
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  with torch.cuda.amp.autocast(self._mixed_precision):
  with torch.cuda.amp.autocast(self._mixed_precision):
  0       0        1195.82  

# Testing

In [2]:
'''# Installing PyMuPDF for getting the text data from the resume pdf'''
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.13


In [31]:
# Path of the resume PDF whose text is extracted and passed to the trained model.
fname = '/content/Alice Clark CV.pdf'

In [32]:
import re
import sys, fitz

# Compiled pattern for page footers of the form "Page <n> of <m>".
pattern = re.compile(r"Page \d+ of \d+")

# Collect the raw text of every page. The context manager closes the PDF even
# if extraction fails part-way through.
with fitz.open(fname) as doc:
    pages_text = [page.get_text() for page in doc]

text = ""
for page in pages_text:
    # Strip the footer BEFORE accumulating, so the combined text is cleaned as
    # well (previously only the printed copy of each page was cleaned).
    page = pattern.sub("", page)
    text = text + page
    print(page)

tx = text.replace("\n", " ")  # flatten newline characters ('\n') into spaces
tx1 = tx.replace("/", "\n")
tx2 = tx1.replace("•", "\n\t")
print(tx2.replace(".", "\n"))

Alice Clark 
AI / Machine Learning 
 
Delhi, India Email me on Indeed 
• 
20+ years of experience in data handling, design, and development 
• 
Data Warehouse: Data analysis, star/snow flake scema data modelling and design specific to 
data warehousing and business intelligence 
• 
Database: Experience in database designing, scalability, back-up and recovery, writing and 
optimizing SQL code and Stored Procedures, creating functions, views, triggers and indexes. 
Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL Azure, 
Stream Analytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure data lake 
analytics(U-SQL) 
Willing to relocate anywhere 
 
WORK EXPERIENCE 
Software Engineer 
Microsoft – Bangalore, Karnataka 
January 2000 to Present 
1. Microsoft Rewards Live dashboards: 
Description: - Microsoft rewards is loyalty program that rewards Users for browsing and shopping 
online. Microsoft Rewards members can earn points when searching with Bing, bro

In [34]:
# Load the trained pipeline once. Reloading the transformer weights for every
# page is slow and yields the same model each time.
model = spacy.load('/content/drive/MyDrive/4.version/trainning/_model/output/model-best')

for page in pages_text:
    doc = model(page)
    store = []  # entity texts already printed for this page
    for ent in doc.ents:
        # Print each distinct entity text only once per page.
        if ent.text in store:
            continue
        print(f'{ent.label_.upper():{30}} >>>>> {ent.text}')
        store.append(ent.text)
    print("-----")

  _torch_pytree._register_pytree_node(
  self._model.load_state_dict(torch.load(filelike, map_location=device))
  # NB: Previously this was torch.cuda.amp.autocast, passing a boolean


NAME                           >>>>> Alice Clark
DESIGNATION                    >>>>> AI / Machine Learning
LOCATION                       >>>>> Delhi
COMPANIES WORKED AT            >>>>> Microsoft
DESIGNATION                    >>>>> Software Engineer
COLLEGE NAME                   >>>>> Indian Institute of Technology
LOCATION                       >>>>> Mumbai
GRADUATION YEAR                >>>>> 2001
SKILLS                         >>>>> Machine Learning, Natural Language Processing, and Big Data Handling
-----


  self._model.load_state_dict(torch.load(filelike, map_location=device))
  # NB: Previously this was torch.cuda.amp.autocast, passing a boolean


-----
