https://skeptric.com/notebooks/Parsing%20Experience%20from%20Adzuna%20Job%20Ads.html

In [2]:
import re
import pandas as pd
import spacy
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy import displacy
from IPython.display import HTML, display

2022-12-09 00:06:03.287034: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Show the installed spaCy version next to the results that follow.
spacy.__version__

'3.4.2'

In [5]:
# Read the job-ads CSV once and stack one labelled copy per split.
# NOTE(review): every split is built from the SAME file, so `df` holds three
# identical copies of the data that differ only in the `split` column. This
# looks like a stand-in for real per-split files -- confirm before using
# `split` for any train/validation/test comparison.
jobs_raw = pd.read_csv("../data/project_data/data-jobs-20221123.csv")
df = pd.concat(
    [jobs_raw.assign(split=split) for split in ['Train', 'Valid', 'Test']],
    sort=False,
    ignore_index=True,  # gives a fresh 0..n-1 index over the stacked rows
)
del jobs_raw

In [6]:
# Number of rows carrying each split label.
df.split.value_counts()

Train    452
Valid    452
Test     452
Name: split, dtype: int64

In [9]:
# Ad texts as a plain list, in the same order as the rows of `df`.
ads = list(df.description)

# Load the small English pipeline; fetch it only when it is not installed yet,
# so re-running the notebook does not reinstall the model every time.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:  # raised by spacy.load when the model package cannot be found
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 26.6 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
def highlight_terms(terms, texts):
    """Display each sentence of `texts` that contains one of `terms`, with the terms in bold.

    Parameters
    ----------
    terms : iterable of str
        Terms to look for, matched case-insensitively. A sentence is selected
        when one of its lowercased tokens equals a term, so only single-token
        terms can select a sentence.
    texts : iterable of str
        Documents to run through the module-level `nlp` pipeline.

    Returns
    -------
    None
        Each selected sentence is shown via `display(HTML(...))` with every
        whole-word occurrence of a term wrapped in ``<strong>``.
    """
    terms = list(terms)
    if not terms:
        # Nothing to look for; also avoids building an empty alternation.
        return
    wanted = {term.lower() for term in terms}
    # Escape the terms so regex metacharacters in them are matched literally,
    # and compile the pattern once instead of once per sentence.
    alternation = "|".join(re.escape(term) for term in terms)
    term_re = re.compile(fr'(?i)\b({alternation})\b')
    for doc in nlp.pipe(texts):
        # Walk sentences in document order so the display order is reproducible.
        for sentence in doc.sents:
            if not any(tok.lower_ in wanted for tok in sentence):
                continue
            text = sentence.text.strip()
            # NOTE(review): the sentence text goes into the HTML unescaped, so any
            # markup present in the source text will be interpreted by the browser.
            markup = term_re.sub(r'<strong>\1</strong>', text)
            display(HTML(markup))

In [13]:
# Show the sentences mentioning "experience" in the first ten ads.
highlight_terms(['experience'], ads[:10])

In [15]:
# Hand-written sample statements used to try the extractors on.
# NOTE(review): none of these strings contains the word "experience", while
# show_extraction only renders sentences that have an "experience" token --
# confirm these are the intended examples.
examples = [
    'Knowledge of access authentication methods',
    'Knowledge of authentication, authorization, and access control methods',
    'Knowledge of network access, identity, and access management (e.g., public key infrastructure, Oauth, OpenID, SAML, SPML)',
    'Knowledge of organizational information technology (IT) user security policies (e.g., account creation, password rules, access control)',
    'Knowledge of policy-based and risk adaptive access controls',
    'Skill in developing and applying security system access controls',
    'Skill in maintaining directory services. (e.g., Microsoft Active Directory, LDAP, etc.)']

In [19]:
def show_extraction(examples, *extractors):
    """Render every distinct sentence mentioning 'experience' with extracted spans highlighted.

    Parameters
    ----------
    examples : iterable of str
        Texts to run through the module-level `nlp` pipeline.
    *extractors : callables
        Each is called with a doc and must yield ``(label, start, end)`` triples;
        the triples become spans, and overlapping spans are resolved with
        `filter_spans` before being stored as the doc's entities.

    A sentence whose text was already rendered is skipped. If a sentence has no
    entity at all, its 'experience' token is marked with the label 'MISSING'.
    """
    render_options = {'colors': {'MISSING': 'pink',
                                 'EXPERIENCE': 'lightgreen'}}
    rendered_texts = set()
    for doc in nlp.pipe(examples):
        candidates = []
        for extractor in extractors:
            for label, start, end in extractor(doc):
                candidates.append(Span(doc, start, end, label))
        # Replaces whatever entities the pipeline had set on the doc.
        doc.ents = filter_spans(candidates)
        for tok in doc:
            if tok.lower_ != 'experience':
                continue
            sentence = tok.sent
            if sentence.text in rendered_texts:
                continue
            rendered_texts.add(sentence.text)
            if not sentence.ents:
                # Flag the bare token so the sentence still shows a highlight.
                doc.ents = list(doc.ents) + [Span(doc, tok.i, tok.i+1, 'MISSING')]
            displacy.render(sentence, style='ent', options=render_options)
                                                                            
# Token-pattern matcher with two rules around the word "experience".
matcher = Matcher(nlp.vocab)
# Rule 1: one or more NOUN tokens directly followed by "experience" (any casing).
pattern = [{'POS': 'NOUN', 'OP': '+'}, {'LOWER': 'experience'}]
matcher.add('experience_noun', [pattern])

# Rule 2: "experience", then an adposition, then one or more DET/NOUN/PROPN tokens.
pattern = [{'LOWER': 'experience'}, {'POS': 'ADP'}, {'POS': {'IN': ('DET', 'NOUN', 'PROPN')}, 'OP': '+'}]
matcher.add('experience_adp', [pattern])

# Quick run on the first example. The result is not assigned, and since this is
# not the last expression of the cell it is not displayed either.
doc = nlp(examples[0])
matcher(doc)

# The matcher is passed as an extractor: show_extraction unpacks each match as
# (label, start, end).
show_extraction(examples, matcher)

In [20]:
def get_extractions(examples, *extractors):
    """Yield one flat record per non-overlapping span extracted from `examples`.

    Parameters
    ----------
    examples : iterable of str
        Texts to process with the module-level `nlp` pipeline (batches of 100,
        with the 'ner' component disabled).
    *extractors : callables
        Each is called with a doc and must yield ``(label, start, end)`` triples.

    Yields
    ------
    tuple
        ``(text, idx, start, end, label, sent_start, sent_end)`` where `idx` is
        the position of the text in `examples`, `start`/`end` are the span's
        token offsets, and `sent_start`/`sent_end` are the token offsets of the
        sentence containing the span's root token.
    """
    # The position in `examples` doubles as the document id.
    docs = nlp.pipe(examples, batch_size=100, disable=['ner'])
    for idx, doc in enumerate(docs):
        spans = []
        for extractor in extractors:
            for label, start, end in extractor(doc):
                spans.append(Span(doc, start, end, label))
        for ent in filter_spans(spans):
            sent = ent.root.sent
            yield ent.text, idx, ent.start, ent.end, ent.label_, sent.start, sent.end

# Materialise the extraction records for the first three ads to inspect them.
list(get_extractions(ads[:3], matcher))

[('experience as Business Intelligence Engineer',
  0,
  6,
  11,
  'experience_adp',
  0,
  24),
 ('Experience with data visualization', 0, 35, 39, 'experience_adp', 24, 126),
 ('seller experience', 0, 522, 524, 'experience_noun', 501, 550),
 ('experience as a Business Intelligence Engineer',
  0,
  670,
  676,
  'experience_adp',
  635,
  689),
 ('Experience in data mining', 0, 704, 708, 'experience_adp', 703, 730),
 ('experience in technologies', 1, 138, 141, 'experience_adp', 118, 143),
 ('o Experience', 1, 275, 277, 'experience_noun', 258, 384),
 ('Experience with JIRA', 1, 440, 443, 'experience_adp', 384, 623),
 ('experience in IT', 2, 530, 533, 'experience_adp', 516, 546),
 ('experience in systems integration',
  2,
  552,
  556,
  'experience_adp',
  546,
  571),
 ('administration experience', 2, 598, 600, 'experience_noun', 571, 629),
 ('management experience', 2, 640, 642, 'experience_noun', 629, 740),
 ('system administration experience',
  2,
  664,
  667,
  'experience_nou

In [25]:
def extract_df(*extractors, n_max=None, **kwargs):
    """Run the extractors over the ad descriptions and return the spans joined to their ads.

    Parameters
    ----------
    *extractors : callables
        Forwarded to `get_extractions`.
    n_max : int, optional
        Only the first `n_max` rows of the module-level `df` are processed;
        all rows when None.
    **kwargs
        Accepted but not used.

    Returns
    -------
    pandas.DataFrame
        One row per extracted span (columns text, docidx, start, end, label,
        sent_start, sent_end), left-joined to `df` by matching `docidx` against
        the index of `df`.
    """
    limit = len(df) if n_max is None else n_max
    records = list(get_extractions(df[:limit].description, *extractors))
    span_columns = ['text', 'docidx', 'start', 'end', 'label', 'sent_start', 'sent_end']
    ent_df = pd.DataFrame(records, columns=span_columns)
    return ent_df.merge(df, how='left', left_on='docidx', right_index=True)

%time

ent_df = extract_df(matcher, n_max=1000)
ent_df.head()

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.72 µs


Unnamed: 0,text,docidx,start,end,label,sent_start,sent_end,job_title,role,description,split
0,experience as Business Intelligence Engineer,0,6,11,experience_adp,0,24,"Business Intelligence Engineer, Identity and A...",engineer,· 3+ years in relevant experience as Business ...,Train
1,Experience with data visualization,0,35,39,experience_adp,24,126,"Business Intelligence Engineer, Identity and A...",engineer,· 3+ years in relevant experience as Business ...,Train
2,seller experience,0,522,524,experience_noun,501,550,"Business Intelligence Engineer, Identity and A...",engineer,· 3+ years in relevant experience as Business ...,Train
3,experience as a Business Intelligence Engineer,0,670,676,experience_adp,635,689,"Business Intelligence Engineer, Identity and A...",engineer,· 3+ years in relevant experience as Business ...,Train
4,Experience in data mining,0,704,708,experience_adp,703,730,"Business Intelligence Engineer, Identity and A...",engineer,· 3+ years in relevant experience as Business ...,Train


In [30]:
def showent(docidx, start, end, label, sent_start, sent_end, **kwargs):
    """Render one extracted span highlighted inside its sentence.

    Parameters
    ----------
    docidx : int
        Position of the ad in the module-level `ads` list.
    start, end : int
        Token offsets of the span to highlight.
    label : str
        Label given to the highlighted span.
    sent_start, sent_end : int
        Token offsets of the slice of the document to render.
    **kwargs
        Ignored; lets a whole DataFrame row be passed as keyword arguments.
    """
    # Only the tokenizer runs here; no other pipeline component is needed
    # to rebuild the span and the slice.
    doc = nlp.make_doc(ads[docidx])
    highlighted = Span(doc, start, end, label)
    doc.ents = [highlighted]
    displacy.render(doc[sent_start:sent_end], style='ent')
    
def showent_df(df):
    """Render every row of `df` with `showent`, passing the row's columns as keyword arguments."""
    for _, record in df.iterrows():
        showent(**record)

# Inspect a few extractions whose text is exactly "experience in a".
showent_df(ent_df.query('text == "experience in a"').head())

In [33]:
def extract_noun_phrase_experience(doc):
    """Yield an 'EXPERIENCE' triple for each multi-token noun chunk ending in 'experience'.

    Parameters
    ----------
    doc : object exposing `noun_chunks`
        Each chunk must support `len()` and indexing, and its tokens must
        expose `lower_` and `i`.

    Yields
    ------
    tuple
        ``('EXPERIENCE', first_index, last_index)`` where `first_index` is the
        index of the chunk's first token and `last_index` is the index of the
        final 'experience' token itself. Callers that treat the last value as an
        exclusive end therefore get the words before 'experience'.
    """
    for chunk in doc.noun_chunks:
        last_token = chunk[-1]
        # Skip chunks that do not end in "experience" or consist of that word alone.
        if last_token.lower_ != 'experience' or len(chunk) <= 1:
            continue
        yield 'EXPERIENCE', chunk[0].i, last_token.i

# Try the noun-chunk extractor on the hand-written examples ...
show_extraction(examples, extract_noun_phrase_experience)

# ... and on the first ten ads.
show_extraction(ads[:10], extract_noun_phrase_experience)

In [35]:
# Inspect up to five extractions whose text is exactly "Skills".
showent_df(ent_df.query("text=='Skills'").head(5))

# Words and short phrases collected as qualifiers of "experience"; they are
# joined into `experience_qualifier_pattern` further down.
experience_qualifiers = ['previous', 'prior', 'following', 'recent', 'the above', 'past',
                         
                         'proven', 'demonstrable', 'demonstrated', 'relevant', 'significant', 'practical',
                         'essential', 'equivalent', 'desirable', 'required', 'considerable', 'similar',
                         'working', 'specific', 'qualified', 'direct', 'hands on', 'handson', 
                         
                         'strong', 'solid', 'good', 'substantial', 'excellent', 'the right', 'valuable', 'invaluable',
                         
                         'some', 'any', 'none', 'much', 'extensive', 'no', 'more',
                         'your', 'their',
                         'years', 'months',
                         'uk',
                        ]

# Tokens listed as stopwords; not referenced by any code shown up to this point.
stopwords = ['a', 'an', '*', '**', '•', 'this', 'the', ':', 'Skills']

# Regex alternation over all qualifiers, delimited by word boundaries.
# The entries are joined as-is (no escaping) and the pattern carries no
# case-insensitivity flag of its own.
experience_qualifier_pattern = rf'\b(?:{"|".join(experience_qualifiers)})\b'

# Display the assembled pattern.
experience_qualifier_pattern

'\\b(?:previous|prior|following|recent|the above|past|proven|demonstrable|demonstrated|relevant|significant|practical|essential|equivalent|desirable|required|considerable|similar|working|specific|qualified|direct|hands on|handson|strong|solid|good|substantial|excellent|the right|valuable|invaluable|some|any|none|much|extensive|no|more|your|their|years|months|uk)\\b'

KeyError: "Column(s) ['Company', 'Id', 'SourceName'] do not exist"