In [1]:
import json
import os
import re

# Data

In [2]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# lookups into the loaded data are the same on every run and every machine.
files = sorted(os.listdir("../software-jobs/"))
print("Number of files: ", len(files))

Number of files:  5351


In [3]:
# Read the 'content' field of every job file, in the order given by `files`.
# The files are opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.
data = []
for file in files:
    with open(os.path.join("../software-jobs/", file), "r", encoding="utf-8") as f:
        d = json.load(f)
        data.append(d['content'])

In [5]:
# Load the keyword lists used for annotation: a mapping of entity label ->
# list of keyword strings. Opened as UTF-8 so decoding is platform-independent.
with open("./keywords.json", "r", encoding="utf-8") as f:
    keywords = json.load(f)

In [6]:
def annotate_job_description(job_description, keyword_map=None):
    """Tag every keyword occurrence in a job description with its entity label.

    Matching is case-insensitive and restricted to whole keywords: a keyword
    only matches when it is not directly preceded or followed by a word
    character.

    Args:
        job_description: The text to annotate. ``None`` is treated as an
            empty string.
        keyword_map: Optional mapping of entity label -> iterable of keyword
            strings. When omitted, the notebook-level ``keywords`` mapping
            is used.

    Returns:
        A dict ``{'text': <text>, 'entities': [(start, end, label), ...]}``
        where ``start``/``end`` are character offsets into ``text``.
        Entities appear in label/keyword iteration order and may overlap.
    """
    if keyword_map is None:
        keyword_map = keywords
    text = job_description or ''
    annotation = {'text': text, 'entities': []}

    for label, keyword_list in keyword_map.items():
        for keyword in keyword_list:
            if not keyword:
                # An empty keyword would only produce zero-length matches.
                continue
            # Lookarounds instead of \b: \b requires a word character on one
            # side, so r'\bC\+\+\b' can never match "C++ " (same for "C#" or
            # ".NET"). For keywords that start and end with a word character
            # the two forms are equivalent.
            pattern = re.compile(
                r'(?<!\w)' + re.escape(keyword) + r'(?!\w)', re.IGNORECASE
            )
            for match in pattern.finditer(text):
                annotation['entities'].append((match.start(), match.end(), label))
    return annotation

In [7]:
# Spot-check the annotator on the first loaded job description.
annotate_job_description(data[0])

{'text': 'This is an exciting opportunity for a software engineer passionate about open source software, Linux, containers, virtualization and Ubuntu Server. Come build a rewarding, meaningful career working with the best and brightest people in technology at Canonical, a growing international software company.\nUbuntu Server is a platform that enables amazing technology the whole world over. Applicants to this role will have the opportunity to bolster the Ubuntu Server as a platform for containers and virtual machines. They will get to work with our experienced team and have the benefits of learning and growing alongside the best engineers in the business.\nWhat you’ll do\nCollaborate proactively with a distributed team\nWrite high quality code to create new features\nDebug issues and produce high quality code to fix them\nReview code produced by other engineers\nDiscuss ideas and collaborate on finding good solutions\nWork from home with global travel 2 to 4 weeks for internal and ex

In [8]:
# Annotate every job description; the result is index-aligned with `data`.
training_data = [annotate_job_description(text) for text in data]

In [13]:
import spacy


_BLANK_EN_NLP = None  # lazily created blank English pipeline, shared by all calls


def _get_blank_en_nlp():
    """Return the cached blank English pipeline, creating it on first use."""
    global _BLANK_EN_NLP
    if _BLANK_EN_NLP is None:
        _BLANK_EN_NLP = spacy.blank("en")
    return _BLANK_EN_NLP


def convert_to_doc(job_data, nlp=None):
    """Build a spaCy ``Doc`` whose entities come from a keyword annotation.

    Args:
        job_data: Dict with a ``'text'`` string and an ``'entities'`` list of
            ``(start, end, label)`` character-offset tuples.
        nlp: Optional spaCy pipeline used to tokenize the text. When omitted,
            a single blank English pipeline is created once and reused, so
            converting many documents does not build a new pipeline each time.

    Returns:
        The tokenized ``Doc`` with ``doc.ents`` set to the non-overlapping
        subset of the annotated spans chosen by ``spacy.util.filter_spans``.
        Offsets that cannot be aligned to token boundaries are dropped.
    """
    if nlp is None:
        nlp = _get_blank_en_nlp()

    doc = nlp.make_doc(job_data['text'])
    ents = []
    for start, end, label in job_data['entities']:
        # "contract" snaps misaligned offsets inward to token boundaries;
        # char_span returns None when no whole token fits inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
    # Resolve overlaps once, after all spans are collected, instead of
    # re-filtering the growing list on every loop iteration.
    doc.ents = spacy.util.filter_spans(ents)
    return doc


# Visual sanity check: render the entities of the third annotated job inline.
spacy.displacy.render(convert_to_doc(training_data[2]), style="ent", jupyter=True)

In [14]:
# One spaCy Doc per annotated job, index-aligned with `training_data`.
training_docs = [convert_to_doc(job_data) for job_data in training_data]

# Convert to Label Studio Format

In [15]:
from itertools import groupby


def doc_to_spans(_doc):
    """Convert a doc's entities into Label Studio ``labels`` result items.

    The entity spans in ``_doc.ents`` are used directly, so two adjacent
    entities that share a label stay separate, and the reported ``text`` is
    the span's own text rather than its tokens re-joined with spaces.

    Args:
        _doc: A spaCy ``Doc`` (or any object exposing ``ents`` whose items
            have ``start_char``, ``end_char``, ``text`` and ``label_``).

    Returns:
        A ``(results, entities)`` tuple: ``results`` is a list with one
        Label Studio result dict per entity (character offsets in
        ``value['start']``/``value['end']``), and ``entities`` is the set of
        distinct labels seen.
    """
    results = []
    entities = set()
    for ent in _doc.ents:
        label = ent.label_
        if not label:
            # Unlabelled spans carry nothing to pre-annotate.
            continue
        results.append({
            'from_name': 'label',
            'to_name': 'text',
            'type': 'labels',
            'value': {
                'start': ent.start_char,
                'end': ent.end_char,
                'text': ent.text,
                'labels': [label]
            }
        })
        entities.add(label)

    return results, entities

In [16]:
# Build one Label Studio task per job: the raw text plus a single
# pre-annotation holding the entity spans of the matching doc.
# NOTE(review): 'model_version' names en_core_web_md, but the docs visible in
# this notebook are built from keyword matches on a blank pipeline - confirm.
entities = set()
tasks = []
for raw_text, doc in zip(data, training_docs):
    spans, doc_labels = doc_to_spans(doc)
    entities.update(doc_labels)
    tasks.append({
        'data': {'text': raw_text or ''},
        'predictions': [{
            'model_version': 'en_core_web_md',
            'result': spans,
        }],
    })

In [17]:
# Write all tasks as one pretty-printed JSON array for import into Label Studio.
tasks_path = '../label-studio-files/v1/tasks.json'
print(f'Save {len(tasks)} tasks to "tasks.json"')
with open(tasks_path, mode='w') as tasks_file:
    json.dump(tasks, tasks_file, indent=2)

Save 5351 tasks to "tasks.json"


In [18]:
# Save the class labels as a txt file: one label per line, sorted, with no
# trailing newline. Labels are written verbatim, so the encoding is pinned to
# UTF-8 rather than left to the platform default.
print('Named entities are saved to "named_entities.txt"')
with open('../label-studio-files/v1/named_entities.txt', mode='w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(entities)))

Named entities are saved to "named_entities.txt"


### Fix the annotations by first importing them into Label Studio and then exporting them again.