In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import requests
from bs4 import BeautifulSoup # commonly used for parsing HTML content

import re
import spacy
from spacy.tokens import DocBin

In [2]:
!python -m spacy download en_core_web_lg # install the missing model

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Get resume links from link of pages
def get_resume(url_web='https://www.jobspider.com/job/resume-search-results.asp/words_Software%2BEngineer'):
    href_list = []
    print(f'The function takes 30 pages, which is the maximum at the present time of {url_web}')

    for j in range(1, 30):
        url_page = url_web + f'/page_{j}'

        response = requests.get(url_page)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links that we need on the page
        href_locals = soup.find_all('a', attrs={'href': True}, href=lambda value: value.startswith('/job/view-resume-'))
        try:
            for href_local in href_locals:
                href_list.append('https://www.jobspider.com' + href_local['href'])
        except:
            pass

    return href_list

# Convert resume into text
def resume_to_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'lxml')

    # Get job function sought
    resume_title = soup.find_all(["h1"])[0].text.strip()

    # Get job decription
    decriptions = soup.find_all(["td"])
    return url, resume_title, [decription.text.strip() for decription in decriptions]

In [4]:
def preprocess(decription):
    text = "\n".join(decription[19:-3])                                   # omit data after viz
    text = re.sub(r'\n\s*\n', '\n', text)                                 # Replace multiple newlines with one
    text = re.sub(r'\s{2,}', ' ', text)                                   # Replace multiple spaces with one
    text = text.replace('\t', ' ')                                        # Replace tabs with one space
    text = "".join([s for s in text.splitlines(True) if s.strip("\r\n")]) # filters out any empty lines
    text = re.sub('[^A-Za-z0-9\n]+', ' ', text)                           # replace any characters that are not alphanumeric

    return text

In [5]:
href_list = get_resume() # default is https://www.jobspider.com/job/resume-search-results.asp/words_Software%2BEngineer

# List to store information
urls = []
resume_types = []
decriptions = []

data = []
nlp_temp = spacy.load("en_core_web_lg")

for href in tqdm(href_list):
    url, resume_title, description = resume_to_text(href)

    _description = preprocess(description) # preprocessing base on obseravtion
    doc = nlp_temp(_description) # fot getting ner in _description
    row = [url, resume_title, _description, {'entities': [(ent.start_char, ent.end_char, ent.text, ent.label_, str(spacy.explain(ent.label_))) for ent in doc.ents]}]

    data.append(row)

The function takes 30 pages, which is the maximum at the present time of https://www.jobspider.com/job/resume-search-results.asp/words_Software%2BEngineer


100%|██████████| 1450/1450 [09:27<00:00,  2.55it/s]


In [6]:
import pickle
pickle.dump(data, open('scrapping_data_lg_update.json','wb'))

In [7]:
data[0]

['https://www.jobspider.com/job/view-resume-84752.html',
 'Business Administration Resume',
 'Business Administration Resume\nDesired Industry Human Resources\nSpiderID 84752\nDesired Job Location Huntersville North Carolina\nDate Posted 10 4 2024\nType of Position Full Time Permanent\nAvailability Date Immediately\nDesired Wage 60000\nU S Work Authorization Yes\nJob Level Experienced with over 2 years experience\nWilling to Travel Yes Less Than 25 \nHighest Degree Attained Other\nWilling to Relocate Yes\nObjective Analytical detail oriented and results driven professional with hands on experience in managing business operations and leading teams within the retail industry Armed with an in depth understanding of financial management budgeting and accounting principles as well as regulatory compliance and ethical business practices Adaptable individual capable of adjusting to changing market conditions and embracing innovations and strategic initiatives for business growth Equipped with

In [8]:
import pickle

with open("scrapping_data_lg_update.json", "rb") as f:
    data = pickle.load(f)

In [9]:
def get_spacy_doc(file, data):

  nlp = spacy.blank("en")
  db = DocBin()

  for url, resume_tittle, text, annot in tqdm(data):
    doc = nlp.make_doc(text)
    annot = annot['entities']

    ents = []
    entity_indices = []

    for start, end, tex, l, explain in annot:
      label = explain
      skip_entity = False
      for idx in range(start, end):
        if idx in entity_indices:
          skip_entity = True
          break
      if skip_entity == True:
        continue

      entity_indices = entity_indices + list(range(start, end))
      try:
        span = doc.char_span(start, end, label = label, alignment_mode = 'strict')
      except:
        continue

      if span is None:
        err_data = str([start, end]) + "    " + str(text) + '\n'
        file.write(err_data)
      else:
        ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except:
      pass

  return db

In [10]:
from sklearn.model_selection import train_test_split
train, dev = train_test_split(data, test_size = 0.2, random_state = 42)

In [11]:
file = open('train_file.txt','w')

db = get_spacy_doc(file, train)
db.to_disk('train.spacy')

db = get_spacy_doc(file, dev)
db.to_disk('dev.spacy')

file.close()

100%|██████████| 1160/1160 [01:05<00:00, 17.81it/s]
100%|██████████| 290/290 [00:16<00:00, 17.68it/s]


In [12]:
!pip install spacy_transformers

Collecting spacy_transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.37.0,>=3.4.0 (from spacy_transformers)
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers<4.37.0,>=3.4.0->spacy_transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (197 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00

In [13]:
# Initialization setting
!python -m spacy init fill-config base_config.cfg config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
# This took approximately 8 hour for training on google colab pro gpu mode
!python -m spacy train config.cfg  --output generated_model --paths.train train.spacy --paths.dev dev.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: generated_model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 122kB/s]
config.json: 100% 481/481 [00:00<00:00, 2.92MB/s]
vocab.json: 100% 899k/899k [00:00<00:00, 4.49MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 7.82MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 4.19MB/s]
  _torch_pytree._register_pytree_node(
model.safetensors: 100% 499M/499M [00:05<00:00, 85.1MB/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.amp.autocast(self._mixed_precision):
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rat

In [16]:
print('hello')

hello
hello


In [None]:
model = spacy.load('generated_model/model-best')

ValueError: [E002] Can't find factory for 'transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer

In [None]:
# Test example
main_str = ['''Manufacturing Production Manager Resume
Desired Industry Manufacturing
SpiderID 78692
Desired Job Location Windsor Colorado
Date Posted 1 23 2017
Type of Position Full Time Permanent
Availability Date Immediately
Desired Wage 95000
U S Work Authorization Yes
Job Level Management Manager Director
Willing to Travel
Highest Degree Attained Other
Willing to Relocate Yes
Objective Dynamic and growth driven professional offering hands on management experience and comprehensive background in manufacturing and engineering operations within highly competitive setting Adept at reengineering unproductive work processes as well as in planning and implementing various sustainable and cost effective work programs to drive continuous improvement of operations Armed with exceptional organizational and critical problem solving aptitudes to formulate effective solutions on complex production and quality issues Equipped with tactical leadership capabilities in supervising and guiding teams toward the successful and timely completion of projects Proficient with Microsoft Office applications Project Management and Microsoft Visio
Experience Relevant ExperienceVestas Blades America Windsor COProduction Engineer Jan 2014 PresentContribute efforts in achieving production plan budget tooling equipment and bill of material along with quality control and safety Conceptualized new methods to optimize production levels while maintaining production costs yields quality and safety Identified and resolved process problems with effective solutions which decreased downtime and minimized costs Initiated plant trials to measure performance capabilities while ensuring updated documentation of process procedures Leveraged industry expertise in streamlining the manufacturing of turbine blades Production Supervisor Mar 2009 Jan 2014Rendered oversight to more than 70 employees to guarantee accordance of operation with production plan and goals Observed strict compliance with safety and quality guidelines and handled inventory control created schedules delegated work and facilitated training of staff Generated production reports for production and operations managers regarding production areas performance Served as a Shells Lighthouse project member while drafting and modifying all standard operating procedures Functioned as department lead for production quality training and process improvements Closely monitored operations productivity to determine areas for improvement in overall production process Pioneered the development of all shells production process job cards that decreased process times and improved efficiency and quality through changed production flow layout Anheuser Busch Fort Collis COBrew House Control Panel Operator Mar 2007 Mar 2009Efficiently administered beer brewing process from raw material selection and recipe formulation through the use of Siemens software Strictly enforced standard operating procedures and safe working practices Determined and evaluated all critical control points to achieve consistent product quality within allotted schedules Other ExperienceUnited States Airways Denver CoCustomer Service Representative Kroger Co King Soopers Smiths Food Drug Fort Collins CoGrocery Manager Head Clerk Night Crew Manager Front End ManagerInventory Control Manager Warehouse Manager
Education EducationAssociate of Science with emphasis in chemistry and biologyFront Range Community College Fort Collins COPharmacy Pre Requisites for Doctor of Pharmacy Pharm D University of Wyoming Laramie WYPharmacy Pre Requisites for Doctor of Pharmacy Pharm D Western Wyoming Community College Rock Springs WY Deans Honor RollProfessional DevelopmentCertificationsSupervisor Certification Six Sigma Yellow Belt Project ManagementQuality Management Crucial Conversations Fort Lift License Crane License
Affiliations
Skills Manufacturing Production Manager Project Planning Cost Reduction and Budget Optimization Resource Allocation Six Sigma Quality ControlLean Manufacturing Plant Management Manufacturing Inspection Administration Cross functional Team Building
Additional Information TrainingProduction Instructor Coordinator Planner Coordinator Wrote Training Document
Reference Available upon request
Candidate Contact Information
JobSpider com has chosen not to make contact information available on this page Click Contact Candidate to send this candidate a response
Manufacturing Production Manager Resume
Desired Industry Manufacturing
SpiderID 78692
Desired Job Location Windsor Colorado
Date Posted 1 23 2017
Type of Position Full Time Permanent
Availability Date Immediately
Desired Wage 95000
U S Work Authorization Yes
Job Level Management Manager Director
Willing to Travel
Highest Degree Attained Other
Willing to Relocate Yes
Objective Dynamic and growth driven professional offering hands on management experience and comprehensive background in manufacturing and engineering operations within highly competitive setting Adept at reengineering unproductive work processes as well as in planning and implementing various sustainable and cost effective work programs to drive continuous improvement of operations Armed with exceptional organizational and critical problem solving aptitudes to formulate effective solutions on complex production and quality issues Equipped with tactical leadership capabilities in supervising and guiding teams toward the successful and timely completion of projects Proficient with Microsoft Office applications Project Management and Microsoft Visio
Experience Relevant ExperienceVestas Blades America Windsor COProduction Engineer Jan 2014 PresentContribute efforts in achieving production plan budget tooling equipment and bill of material along with quality control and safety Conceptualized new methods to optimize production levels while maintaining production costs yields quality and safety Identified and resolved process problems with effective solutions which decreased downtime and minimized costs Initiated plant trials to measure performance capabilities while ensuring updated documentation of process procedures Leveraged industry expertise in streamlining the manufacturing of turbine blades Production Supervisor Mar 2009 Jan 2014Rendered oversight to more than 70 employees to guarantee accordance of operation with production plan and goals Observed strict compliance with safety and quality guidelines and handled inventory control created schedules delegated work and facilitated training of staff Generated production reports for production and operations managers regarding production areas performance Served as a Shells Lighthouse project member while drafting and modifying all standard operating procedures Functioned as department lead for production quality training and process improvements Closely monitored operations productivity to determine areas for improvement in overall production process Pioneered the development of all shells production process job cards that decreased process times and improved efficiency and quality through changed production flow layout Anheuser Busch Fort Collis COBrew House Control Panel Operator Mar 2007 Mar 2009Efficiently administered beer brewing process from raw material selection and recipe formulation through the use of Siemens software Strictly enforced standard operating procedures and safe working practices Determined and evaluated all critical control points to achieve consistent product quality within allotted schedules Other ExperienceUnited States Airways Denver CoCustomer Service Representative Kroger Co King Soopers Smiths Food Drug Fort Collins CoGrocery Manager Head Clerk Night Crew Manager Front End ManagerInventory Control Manager Warehouse Manager
Education EducationAssociate of Science with emphasis in chemistry and biologyFront Range Community College Fort Collins COPharmacy Pre Requisites for Doctor of Pharmacy Pharm D University of Wyoming Laramie WYPharmacy Pre Requisites for Doctor of Pharmacy Pharm D Western Wyoming Community College Rock Springs WY Deans Honor RollProfessional DevelopmentCertificationsSupervisor Certification Six Sigma Yellow Belt Project ManagementQuality Management Crucial Conversations Fort Lift License Crane License
Affiliations
Skills Manufacturing Production Manager Project Planning Cost Reduction and Budget Optimization Resource Allocation Six Sigma Quality ControlLean Manufacturing Plant Management Manufacturing Inspection Administration Cross functional Team Building
Additional Information TrainingProduction Instructor Coordinator Planner Coordinator Wrote Training Document
Reference Available upon request
Candidate Contact Information
JobSpider com has chosen not to make contact information available on this page Click Contact Candidate to send this candidate a response''' ]


In [None]:
for doc in nlp.pipe(main_str, disable=["tagger", "parser"]):
  for ent in doc.ents:
    print((ent.text,ent.label_))
    # print([(ent.text, ent.label_) for ent in doc.ents])


('Manufacturing Production Manager', 'JOB_TITLE')
('Manager Director', 'JOB_TITLE')
('manufacturing and engineering operations', 'SKILL')
('reengineering unproductive work processes as well as in planning and implementing various sustainable and cost effective work programs to drive continuous improvement of operations', 'SKILL')
('Microsoft Office applications Project Management', 'TOOL')
('Microsoft Visio', 'TOOL')
('ExperienceVestas Blades America', 'ORG')
('COProduction Engineer Jan 2014 PresentContribute efforts in achieving production plan budget tooling equipment and bill of material along with quality control and safety', 'EXPERIENCE')
('EducationAssociate of Science with emphasis in chemistry', 'DEGREE')
('biologyFront Range Community College Fort Collins', 'ORG')
('Doctor of Pharmacy Pharm D', 'DEGREE')
('Doctor of Pharmacy Pharm D', 'DEGREE')
('Manufacturing Production ManagerProject', 'SKILL')
('Manufacturing Production Manager', 'JOB_TITLE')
('Manager Director', 'JOB_TITLE