In [1]:
import spacy
import skweak
import os
import re

In [2]:
## Load the dataset
# Job-posting texts live in ../data relative to the notebook's working directory.
path = os.getcwd()
data_dir = os.path.join(path, '..', 'data')
# os.listdir returns entries in arbitrary order; sort them so that `texts`
# (and any slice of it taken later) is identical on every run.
files = sorted(os.listdir(data_dir))
files_txt = [name for name in files if name.endswith('.txt')]
# Read every .txt file, collapsing each run of whitespace into a single space.
texts = []
for file in files_txt:
    with open(os.path.join(data_dir, file), 'r') as f:
        text = re.sub('\\s+', ' ', f.read())
        texts.append(text)

In [3]:
# Base spaCy pipeline applied to every posting; the 'ner' and 'lemmatizer'
# components are switched off at load time.
nlp = spacy.load(
    'en_core_web_md',
    disable=['ner', 'lemmatizer'],
)

In [4]:
# A single job posting, kept inline so the annotators below can be tried out
# on one document before running them over `texts`.
sample_text = """

Airbnb was born in 2007 when two Hosts welcomed three guests to their San Francisco home, and has since grown to over 4 million Hosts who have welcomed more than 1 billion guest arrivals in almost every country across the globe. Every day, Hosts offer unique stays and experiences that make it possible for guests to connect with communities in a more authentic way.The Community You Will Join: 
The mission of the Airbnb Global Safety  Security (GSS) team is to ensure the safety and security of our Airfam as well as provide peace of mind for Airbnbs global community through proactive security risk management.  You will work with seasoned professionals who have extensive experience developing, implementing and leading world class security programs, to include Global Security Operations, Global Security Intelligence Center, Law Enforcement Operations, Risk Intelligence, Trust  Safety Advisory Coalition, Executive Protection, Global Travel Risk Management, Global Special Events, and Global Threat Assessment Program.

Understand data needs by interfacing with fellow Analytics Engineers, Data Scientists, Data Engineers, and Business Partners
Architect, build, and launch efficient  reliable data models and pipelines in partnership with Data Engineering
Design and implement metrics and dimensions to enable analysis and predictive modeling
Design and develop data resources to enable self-serve data consumption
Build tools for auditing, error logging, and validating data tables
Define logging needs in partnership with Data Engineering
Define and share best practices on metric, dimension, and data model development for analytics use
Build and improve data tooling in partnership with Data Platform teams
Be a technical expert on data model usage
Own and review code changes to certified metric and dimension definitions
Manage communication of data model updates and changes across organization
Ensure data models are fully documented, and metrics and dimensions have clear descriptions and metadata

Minimum Qualifications:

Passion for high data quality and scaling data science work
6 years of relevant industry experience
Strong skills in SQL and distributed system optimization (e.g. Spark, Presto, Hive)
Experience in schema design and dimensional data modeling
Experience in at least one programming language for data analysis (e.g. Python, R)
Proven ability to succeed in both collaborative and independent work environments
Detail-oriented and excited to learn new skills and tools
Strong influence and relationship management skills

Preferred Qualifications:

Experience with an ETL framework like Airflow
Python, Scala, Superset preferred.
Effective story-telling  articulation skills  ability to convert analytical output into clear, concise, and persuasive insights  recommendations for technical  non-technical audience
An eye for design when it comes to dashboards and visualization tools
Familiarity with experimentation and machine learning technique
"""
# Same whitespace normalisation as applied to the files in `texts`: every run
# of whitespace (including the newlines above) becomes one space.
sample_text = re.sub('\\s+', ' ', sample_text)

In [5]:
# Run the spaCy pipeline loaded above on the sample posting.
doc = nlp(sample_text)

In [6]:
# skweak annotator named "spacy_trf", built from the en_core_web_trf model.
# Its annotations are looked up under the "spacy_trf" name when displayed below.
annotator = skweak.spacy.ModelAnnotator("spacy_trf", "en_core_web_trf")

In [7]:
# Apply the "spacy_trf" annotator to the sample document.
# NOTE(review): `doc` is rebound to the return value; presumably the same Doc
# object with the new annotations attached - confirm against skweak docs.
doc = annotator(doc)

In [8]:
# Render the entities recorded under the "spacy_trf" annotator name.
skweak.utils.display_entities(doc, "spacy_trf")

## Gazetteer

In [9]:
# Build one trie per class found in skills.json (path is relative to the
# working directory).
# NOTE(review): the gazetteer entries are processed with en_core_web_trf while
# the documents are processed with en_core_web_md - confirm both tokenise
# text identically, otherwise some entries may never match.
tries = skweak.gazetteers.extract_json_data("skills.json", spacy_model='en_core_web_trf')

Extracting data from skills.json
Populating trie for class PROGRAMMING_LANGUAGES (number: 62)
Populating trie for class SOFT_SKILLS (number: 16)
Populating trie for class TECH_SKILLS (number: 49)


In [10]:
# Gazetteer annotator named "skills", backed by the tries built from skills.json.
gazetteer_annotator = skweak.gazetteers.GazetteerAnnotator("skills", tries)

In [11]:
# Apply the "skills" gazetteer to the sample document (already carrying the
# "spacy_trf" annotations from the earlier cell).
doc = gazetteer_annotator(doc)

In [12]:
# Render the entities recorded under the "skills" annotator name.
skweak.utils.display_entities(doc, "skills")

## Pipeline

In [13]:
# Number of postings to run through the full annotation pipeline. Only a small
# prefix of `texts` is processed here; raise this value to annotate more.
N_DOCS = 10

# For each selected posting: spaCy pipeline -> "spacy_trf" annotator ->
# "skills" gazetteer. A comprehension is used so that no loop variable leaks
# into the notebook namespace; the previous loop rebound `doc` and `text`,
# which silently replaced the sample document used by the cells above.
docs = [
    gazetteer_annotator(annotator(nlp(posting)))
    for posting in texts[:N_DOCS]
]

In [14]:
# Annotated-document count followed by the total number of loaded texts.
print(f"{len(docs)} {len(texts)}")

10 10230


## Hidden Markov Model

In [15]:
# HMM aggregator named "hmm" over four output labels. Three of them match the
# classes reported when skills.json was loaded (TECH_SKILLS, SOFT_SKILLS,
# PROGRAMMING_LANGUAGES).
# NOTE(review): PRODUCT is not a gazetteer class; presumably it is produced by
# the "spacy_trf" annotator - confirm.
hmm = skweak.generative.HMM("hmm", ["TECH_SKILLS", "SOFT_SKILLS", "PROGRAMMING_LANGUAGES", "PRODUCT"])

In [16]:
# Fit the HMM on the annotated documents. `docs` only holds the small prefix
# of `texts` annotated above, so the model is estimated from very little data.
hmm.fit(docs)

Starting iteration 1
Finished E-step with 10 documents
Starting iteration 2
Finished E-step with 10 documents
Starting iteration 3
Finished E-step with 10 documents
Starting iteration 4
Finished E-step with 10 documents


         1    -507.06148181             +nan
         2    -500.43958387      +6.62189794
         3    -497.26634884      +3.17323503
         4    -495.44290048      +1.82344836


In [17]:
# Apply the fitted HMM to the first annotated document and render the
# entities recorded under the "hmm" name.
skweak.utils.display_entities(hmm(docs[0]), "hmm")

In [18]:
# Run the fitted HMM over every annotated document.
hmm_docs = [hmm(annotated) for annotated in docs]

# Promote the spans stored under "hmm" to the document's entities, so they are
# what gets serialised as the NER annotation.
for hmm_doc in hmm_docs:
    hmm_doc.ents = hmm_doc.spans["hmm"]

In [19]:
# Sanity check: one HMM-labelled document per entry in `docs`.
len(hmm_docs)

10

In [20]:
# Make sure the output directory exists before writing, so the cell also works
# on a fresh checkout (no-op when the directory is already there).
os.makedirs("../training_data", exist_ok=True)
# Serialise the HMM-labelled documents to a .spacy file for `spacy train`.
skweak.utils.docbin_writer(hmm_docs, "../training_data/hmm_training_data.spacy")

Write to ../training_data/hmm_training_data.spacy...done


In [21]:
# Generate an NER training config on stdout and pipe it straight into
# `spacy train`, using en_core_web_md for the initial vectors and writing the
# trained pipeline to ./training_output.
# NOTE(review): --paths.train and --paths.dev point at the same file, so the
# ENTS_F / ENTS_P / ENTS_R columns are measured on the training data itself and
# say nothing about generalisation. A held-out dev file is needed for that.
!spacy init config - --lang en --pipeline ner --optimize accuracy | \
spacy train - --paths.train ../training_data/hmm_training_data.spacy  --paths.dev ../training_data/hmm_training_data.spacy \
--initialize.vectors en_core_web_md --output ./training_output

[38;5;4mℹ Saving to output directory: training_output[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    301.67    0.00    0.00    0.00    0.00
^C
