In [1]:
import spacy
import skweak
import os
import re

In [2]:
# Normalize the skill gazetteer: read skill.json, lower-case every keyword,
# drop duplicates, and save the result to keywords.json.
import json

with open('skill.json', 'r', encoding='utf-8') as skills_file:
    data = json.load(skills_file)

# Lower-case BEFORE de-duplicating so entries that differ only in case
# collapse into one; sort so the written file is identical on every run
# (set iteration order is not stable between interpreter runs).
for key in data:
    data[key] = sorted({keyword.lower() for keyword in data[key]})

# Write only after the input handle is closed, under its own name so the
# output handle does not shadow the input one.
with open('keywords.json', 'w', encoding='utf-8') as keywords_file:
    json.dump(data, keywords_file)

# Materialize the class names as a plain list rather than a live dict view.
features = list(data)

In [3]:
# Load spaCy's 'en_core_web_md' pipeline with the 'ner' and 'lemmatizer' components disabled.
nlp = spacy.load('en_core_web_md', disable=['ner', 'lemmatizer'])

In [4]:
# Example job posting, kept verbatim (line breaks and repeated spaces included).
_raw_posting = """

Airbnb was born in 2007 when two Hosts welcomed three guests to their San Francisco home, and has since grown to over 4 million Hosts who have welcomed more than 1 billion guest arrivals in almost every country across the globe. Every day, Hosts offer unique stays and experiences that make it possible for guests to connect with communities in a more authentic way.The Community You Will Join: 
The mission of the Airbnb Global Safety  Security (GSS) team is to ensure the safety and security of our Airfam as well as provide peace of mind for Airbnbs global community through proactive security risk management.  You will work with seasoned professionals who have extensive experience developing, implementing and leading world class security programs, to include Global Security Operations, Global Security Intelligence Center, Law Enforcement Operations, Risk Intelligence, Trust  Safety Advisory Coalition, Executive Protection, Global Travel Risk Management, Global Special Events, and Global Threat Assessment Program.

Understand data needs by interfacing with fellow Analytics Engineers, Data Scientists, Data Engineers, and Business Partners
Architect, build, and launch efficient  reliable data models and pipelines in partnership with Data Engineering
Design and implement metrics and dimensions to enable analysis and predictive modeling
Design and develop data resources to enable self-serve data consumption
Build tools for auditing, error logging, and validating data tables
Define logging needs in partnership with Data Engineering
Define and share best practices on metric, dimension, and data model development for analytics use
Build and improve data tooling in partnership with Data Platform teams
Be a technical expert on data model usage
Own and review code changes to certified metric and dimension definitions
Manage communication of data model updates and changes across organization
Ensure data models are fully documented, and metrics and dimensions have clear descriptions and metadata

Minimum Qualifications:

Passion for high data quality and scaling data science work
6 years of relevant industry experience
Strong skills in SQL and distributed system optimization (e.g. Spark, Presto, Hive)
Experience in schema design and dimensional data modeling
Experience in at least one programming language for data analysis (e.g. Python, R)
Proven ability to succeed in both collaborative and independent work environments
Detail-oriented and excited to learn new skills and tools
Strong influence and relationship management skills

Preferred Qualifications:

Experience with an ETL framework like Airflow
Python, Scala, Superset preferred.
Effective story-telling  articulation skills  ability to convert analytical output into clear, concise, and persuasive insights  recommendations for technical  non-technical audience
An eye for design when it comes to dashboards and visualization tools
Familiarity with experimentation and machine learning technique
"""
# Collapse every run of whitespace (newlines included) into a single space.
_whitespace_run = re.compile(r'\s+')
sample_text = _whitespace_run.sub(' ', _raw_posting)

In [5]:
# Run the spaCy pipeline on the sample posting.
doc = nlp(sample_text)

In [6]:
# skweak annotator named "spacy_trf" wrapping the 'en_core_web_trf' model.
# NOTE(review): presumably its predictions are stored under the "spacy_trf" name on the doc — confirm.
annotator = skweak.spacy.ModelAnnotator("spacy_trf", "en_core_web_trf")

In [7]:
# Apply the model annotator; `doc` is rebound to whatever the annotator returns.
doc = annotator(doc)

In [8]:
# Display the entities recorded for the "spacy_trf" layer of the sample document.
skweak.utils.display_entities(doc, "spacy_trf")

In [13]:
# Load the 'content' field of every .json job posting in ../software-jobs/.
jobs_dir = "../software-jobs/"
# os.listdir returns entries in arbitrary order; sort so that `texts` (and any
# later index or slice into it) is the same on every run.
files = sorted(os.listdir(jobs_dir))
texts = []
for file in files:
    if file.endswith(".json"):
        with open(os.path.join(jobs_dir, file), "r", encoding="utf-8") as f:
            # Dedicated name: `data` already holds the keyword dictionary
            # built earlier in the notebook and should not be clobbered here.
            posting = json.load(f)
        texts.append(posting['content'])

In [14]:
# Number of job postings loaded.
len(texts)

5407

## Gazetteer

In [15]:
# Pick one job posting at random and run it through the spaCy pipeline.
import random

# random.randint(a, b) includes b, so randint(0, len(texts)) could return
# len(texts) and raise IndexError; randrange excludes the upper bound.
random_index = random.randrange(len(texts))
sample_text = texts[random_index]
doc = nlp(sample_text)

In [17]:
# Build the gazetteer tries from keywords.json (one per class, per the log output below).
tries = skweak.gazetteers.extract_json_data("keywords.json", spacy_model='en_core_web_trf')

Extracting data from keywords.json
Populating trie for class PROGRAMMING_LANGUAGES (number: 64)
Populating trie for class FRAMEWORKS&TOOLS (number: 42)
Populating trie for class DATABASES (number: 28)
Populating trie for class ROLES (number: 30)
Populating trie for class OTHERS (number: 30)
Populating trie for class PLATFORMS (number: 30)


In [18]:
# Gazetteer-based annotator named "keywords", backed by the tries built from keywords.json.
gazetteer_annotator = skweak.gazetteers.GazetteerAnnotator("keywords", tries)

In [19]:
# Apply the gazetteer annotator to the randomly chosen posting.
doc = gazetteer_annotator(doc)

In [20]:
# Display the matches recorded for the "keywords" layer.
skweak.utils.display_entities(doc, "keywords")

## Pipeline

In [21]:
# Annotate the first ten postings with both labelling sources.
docs = []
for text in texts[:10]:
    # Parse with spaCy, then layer the model-based and gazetteer annotations on top.
    doc = gazetteer_annotator(annotator(nlp(text)))
    docs.append(doc)

In [22]:
# Sanity check: number of annotated documents vs. total postings loaded.
print(len(docs), len(texts))

10 5407


## Hidden Markov Model

In [23]:
# skweak HMM model named "hmm", constructed with `features` as its labels.
# NOTE(review): `features` holds the top-level keys of skill.json — confirm these
# are the intended output labels.
hmm = skweak.generative.HMM("hmm", features)

In [24]:
# Fit the HMM on the annotated documents.
hmm.fit(docs)

Starting iteration 1
Finished E-step with 10 documents
Starting iteration 2
Finished E-step with 10 documents
Starting iteration 3
Finished E-step with 10 documents
Starting iteration 4
Finished E-step with 10 documents


         1    -115.59210867             +nan
         2    -110.39962373      +5.19248494
         3    -108.88601088      +1.51361285
         4    -108.67987701      +0.20613386


In [25]:
# Apply the fitted HMM to the first document and display its "hmm" layer.
skweak.utils.display_entities(hmm(docs[0]), "hmm")

In [26]:
# Run the fitted HMM over every annotated document, then promote each
# document's "hmm" spans to its entities.
hmm_docs = [hmm(doc) for doc in docs]
for hmm_doc in hmm_docs:
    hmm_doc.ents = hmm_doc.spans["hmm"]

In [27]:
# Number of documents labelled by the HMM.
len(hmm_docs)

10

In [28]:
# Serialize the HMM-labelled documents to the .spacy file used for training below.
skweak.utils.docbin_writer(hmm_docs, "../training-data/hmm_training_data.spacy")

Write to ../training-data/hmm_training_data.spacy...done


In [32]:
# Generate an accuracy-optimized NER config and pipe it directly into `spacy train`.
# NOTE(review): --paths.train and --paths.dev point at the same file, so the
# reported dev scores are measured on the training data itself.
!spacy init config - --lang en --pipeline ner --optimize accuracy | \
spacy train - --paths.train ../training-data/hmm_training_data.spacy  --paths.dev ../training-data/hmm_training_data.spacy \
--initialize.vectors en_core_web_md --output ../training-output/v1 --gpu-id 0

[38;5;4mℹ Saving to output directory: ../training-output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    412.40    0.00    0.00    0.00    0.00
 20     200       4118.88   4858.35   50.00   50.00   50.00    0.50
 40     400       1384.14     95.11  100.00  100.00  100.00    1.00
 60     600          0.00      0.00  100.00  100.00  100.00    1.00
 80     800          0.00      0.00  100.00  100.00  100.00    1.00
100    1000          0.00      0.00  100.00  100.00  100.00    1.00
120    1200          0.00      0.00  100.00  100.00  100.00    1.00
140    1400          0.00      0.00  100.00  100.00  100.00    1.00
160    1600          0.00      0.00  100.00  100.00  100.00    1.00
180    1800  