# Spark NER with John Snow Labs
Author: John Bonfardeci<br/>
Last Modified: 2021-05-25<br/>
This notebook documents the experiments and results of various methods for named entity recognition (NER) and the spellchecking of entities identified in a sample DPAA document.

In [5]:
import os
import re
import json
import numpy as np
import pandas as pd
import pickle

from sparknlp.base import *
from sparknlp.annotator import *              
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.common import *
from sparknlp.base import *
from sparknlp import Finisher
import sparknlp

from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Create (or reuse) the Spark session for this notebook:
#   - runs locally on all available cores ("local[*]");
#   - gives the driver 32G of memory and sets maxResultSize to "0"
#     (per the Spark configuration docs, 0 means unlimited);
#   - raises the Kryo serializer buffer ceiling to 2000M;
#   - pulls the Spark NLP 3.0.3 jar (Scala 2.12 build) via spark.jars.packages.
spark = SparkSession.builder \
    .appName("DPAA NLP") \
    .master("local[*]") \
    .config("spark.driver.memory","32G") \
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.0.3") \
    .getOrCreate()

#sql_ctx = SQLContext(spark.sparkContext)

# NOTE(review): the return value is discarded; presumably this reuses the
# session created above rather than building a second one — confirm.
sparknlp.start()

In [6]:
# Display the installed Spark NLP version so it can be checked against the
# 3.0.3 jar requested in the Spark session configuration.
sparknlp.version()

'3.0.3'

## Spark OCR
https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/5.Spark_OCR.ipynb

In [None]:
with open('../spark_ocr.json') as f:
    license_keys = json.load(f)

os.environ['JSL_OCR_LICENSE'] = license_keys['SPARK_OCR_LICENSE']
ocr_license = license_keys['SPARK_OCR_LICENSE']
ocr_secret = license_keys['JSL_OCR_SECRET'] 
ocr_version = license_keys['OCR_VERSION']

! pip install --upgrade spark-ocr==$ocr_version --user --extra-index-url=https://pypi.johnsnowlabs.com/$ocr_license

In [None]:
# Necessary imports from Spark OCR library
from sparkocr import start
from sparkocr.transformers import *
from sparkocr.enums import *
from sparkocr.utils import display_image, to_pil_image
from sparkocr.metrics import score
import pkg_resources

## Import and Clean Sample Document

In [360]:
SAMPLE_DATA_PATH = './ner/REDACTED.txt'

# Read the whole sample document; the context manager guarantees the file
# handle is closed (the bare open(...).read() left it to the garbage collector).
with open(SAMPLE_DATA_PATH, 'r') as sample_file:
    sample_text = sample_file.read()

# Normalise the raw text into a single line before handing it to Spark:
# 1. turn every line break into a space;
sample_text = re.sub(r'(\r\n|\n)', ' ', sample_text)
# 2. replace commas with '; ' (NOTE(review): rationale is not visible here —
#    presumably to keep the text CSV-friendly; confirm);
sample_text = re.sub(r',', '; ', sample_text)
# 3. collapse runs of whitespace into one space;
sample_text = re.sub(r'\s+', ' ', sample_text)
# 4. drop a hyphen (and any whitespace after it) when the characters on both
#    sides of the match are non-digits, e.g. words split by hyphenation.
sample_text = re.sub(r'(?<=[^\d])\-(\s+|)(?=[^\d])', '', sample_text)

# One-row frame with a single 'text' column, the input the pipelines expect.
df= pd.DataFrame({'text': sample_text}, index=[0])
spark_df = spark.createDataFrame(df)
spark_df.show()

+--------------------+
|                text|
+--------------------+
|searchable_pdf/do...|
+--------------------+



In [None]:
class Exploder(Transformer):
    """
    A custom transformer that inherits the Transformer class.
    This class explodes an array column of a Spark dataframe into one row
    per array element.

    Parameters
    ----------
    inputCol : str
        Name of the array column to explode.
    outputCol : str
        Name of the column that receives the exploded elements.
    """
    # Class-level defaults; __init__ overrides them on each instance.
    inputCol=None
    outputCol=None

    def __init__(self, inputCol=None, outputCol=None):
        # `super(Transformer).__init__()` only (re)initialised an unbound
        # `super` object and never ran the base-class initialiser.
        # The zero-argument form calls it correctly.
        super().__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol

    def _transform(self, spark_frame):
        """
        Keep only the input column and add the output column holding the
        exploded elements of that input column.
        """
        col_name = self.inputCol
        out_col = self.outputCol
        return spark_frame.select([col_name])\
            .withColumn(out_col, F.explode(col_name))

## Language Detection
https://nlp.johnsnowlabs.com/2020/12/05/detect_language_21_xx.html

In [450]:
# Pretrained language-identification pipeline ("detect_language_21", lang "xx").
# filter_results calls it on each entity and spellchecks only those detected
# as 'en'.
language_detector = PretrainedPipeline("detect_language_21", lang="xx")

detect_language_21 download started this may take some time.
Approx size to download 7.7 MB
[OK!]


## Spellchecker Pipeline
https://nlp.johnsnowlabs.com/2021/03/28/spellcheck_dl_en.html<br />
https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/7.Context_Spell_Checker.ipynb
<br />
While the default pipeline performs well on common English words, it requires additional training on foreign names and locations. For example, it mistakenly "corrects" 'Bataan' (Philippines) to 'Batman'.

In [472]:
# Experimental training terms: extra vocabulary for the spellchecker, keyed by
# the class label handed to updateVocabClass. get_spellcheck_pipeline only
# consumes the '_NAME_' and '_LOC_' entries; the other keys are empty
# placeholders.
training_terms = {
    '_NAME_': [
        'time', 'bataan', 'war', 'USAF', 'march', 'death', 'Tacloben',
        'tooth', 'nicanor', 'descendent', 'memorial', 'division', 'memorial division'
    ],
    '_AGE_': [],
    '_LOC_': ['philippine', 'cemetery', 'leyte'],
    '_DATE_': [],
    '_NUM_': []
}

def get_spellcheck_pipeline(model_name='spellcheck_dl', training_terms=None):
    """
    Build an (unfitted) context-aware spellchecking pipeline.

    Stages: DocumentAssembler -> RecursiveTokenizer -> ContextSpellCheckerModel
    -> Finisher. The input column is ``text``.

    Parameters
    ----------
    model_name : str
        Name of the pretrained ContextSpellCheckerModel to load.
    training_terms : dict[str, list[str]] or None
        Extra vocabulary keyed by class label. Only labels that start with
        ``_NAME_`` or ``_LOC_`` are added to the model. ``None`` (the default)
        or any empty value adds nothing.

    Returns
    -------
    pyspark.ml.Pipeline
        The assembled pipeline; the caller must call ``fit`` on it.
    """
    # The default used to be a mutable list although the body indexes the
    # argument like a dict. Normalise every empty value to an empty dict.
    training_terms = training_terms or {}

    documentAssembler = DocumentAssembler()\
        .setInputCol("text")\
        .setOutputCol("document")

    tokenizer = RecursiveTokenizer()\
        .setInputCols(["document"])\
        .setOutputCol("token")\
        .setPrefixes(["\"", "(", "[", "\n"])\
        .setSuffixes([".", ",", "?", ")","!", "'s"])

    spellModel = ContextSpellCheckerModel\
        .pretrained(model_name)\
        .setInputCols("token")\
        .setOutputCol("checked")\
        .setErrorThreshold(4.0)\
        .setTradeoff(6.0)

    # Extend vocabulary of spellchecker (names and locations only).
    for en_type in training_terms:
        if re.match(r'(_NAME_|_LOC_)', en_type):
            spellModel.updateVocabClass(en_type, training_terms[en_type], True)

    finisher = Finisher()\
        .setInputCols("checked")

    return Pipeline(stages = [
        documentAssembler,
        tokenizer,
        spellModel,
        finisher
    ])
    
# Fit the spellcheck pipeline on a one-row frame holding an empty string in
# its 'text' column, then wrap the fitted model in a LightPipeline so
# individual strings can be annotated directly.
empty_ds = spark.createDataFrame([[""]]).toDF("text")
spellcheck = LightPipeline(get_spellcheck_pipeline('spellcheck_dl', training_terms)\
                           .fit(empty_ds))

spellcheck_dl download started this may take some time.
Approximate size to download 111.4 MB
[OK!]


In [473]:
# Test: 'bataan' is in the added _NAME_ vocabulary; the recorded output
# leaves "Bataan" unchanged.
spellcheck.annotate("Bataan death march.")

{'checked': ['Bataan', 'death', 'march', '.']}

In [456]:
# Misspelled input: the recorded output corrects 'Wary' to 'War'.
spellcheck.annotate("The Wary Department.")

{'checked': ['The', 'War', 'Department', '.']}

In [457]:
# Misspelled input: the recorded output corrects 'Tine' to 'Time'.
spellcheck.annotate("Status at Tine of Death Phil Army")

{'checked': ['Status', 'at', 'Time', 'of', 'Death', 'Phil', 'Army']}

In [458]:
# Two misspelled place names: the recorded output fixes 'Leyto' -> 'Leyte'
# but only turns 'Philippne' into 'Philipine' (still misspelled).
spellcheck.annotate('Leyto Island Philippne Islands')

{'checked': ['Leyte', 'Island', 'Philipine', 'Islands']}

In [459]:
# Control case: the recorded output leaves every token unchanged.
spellcheck.annotate('Tacloben Leyte Island Philippine Islands')

{'checked': ['Tacloben', 'Leyte', 'Island', 'Philippine', 'Islands']}

In [460]:
# Acronym listed in the added _NAME_ vocabulary; the recorded output keeps it.
spellcheck.annotate('USAF')

{'checked': ['USAF']}

In [461]:
# NOTE(review): the recorded output rewrites 'decedent' to 'descendent', a
# term from the added _NAME_ vocabulary — likely an unwanted side effect of
# the vocabulary extension; confirm.
spellcheck.annotate('John is a decedent of Tom.')

{'checked': ['John', 'is', 'a', 'descendent', 'of', 'Tom', '.']}

In [474]:
# NOTE(review): the recorded output lower-cases 'Nicanor' and replaces the
# initial 'F' with '.', so name initials are not preserved.
spellcheck.annotate('Nicanor F')

{'checked': ['nicanor', '.']}

In [471]:
# NOTE(review): the recorded output yields 'VECTORIAL' rather than the
# presumably intended 'MEMORIAL', despite 'memorial' being in the added
# vocabulary.
spellcheck.annotate('LELIORIAL DIVISION')

{'checked': ['VECTORIAL', 'DIVISION']}

## Entity Filter
Used to filter out undesired entity types and apply spellchecking to the remaining entities.

In [466]:
def filter_results(ner_results):
    """
    Keep the useful entities from collected NER rows and spellcheck them.

    Each row must expose parallel ``metadata`` and ``result`` sequences. An
    entity is kept when its metadata carries an ``entity`` label that does
    not start with CARDINAL, TIME, QUANTITY, DATE or ORDINAL and its text is
    longer than two characters. Entities detected as English are run through
    the module-level ``spellcheck`` pipeline (tokens that begin with
    punctuation are dropped before re-joining). Other languages are passed
    through untouched.

    Returns a list of dicts with the keys ``type``, ``original``, ``checked``
    and ``language``.
    """
    skipped_types = r'(CARDINAL|TIME|QUANTITY|DATE|ORDINAL)'
    punctuation = r'[\,\.\!\?\"\']'

    kept = []
    for record in ner_results:
        for info, text in zip(record.metadata, record.result):
            # Guard clauses, evaluated in the same order as the original test.
            if 'entity' not in info:
                continue
            if re.match(skipped_types, info['entity']):
                continue
            if len(text) <= 2:
                continue

            label = info['entity']
            lang = language_detector.annotate(text)['language'][0]

            if lang != 'en':
                corrected = text
            else:
                tokens = spellcheck.annotate(text)['checked']
                corrected = " ".join(
                    tok for tok in tokens if not re.match(punctuation, tok)
                )

            kept.append(dict(
                type=label,
                original=text,
                checked=corrected,
                language=lang,
            ))
    return kept

## NER Entity Meanings
https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/NER_EN.ipynb<br />
<table>
    <thead><tr><th>NER</th><th>Meaning</th></tr></thead>
    <tbody>
        <tr><td>PERSON</td><td>People, including fictional.</td></tr>
        <tr><td>GPE</td><td>Countries, cities, states</td></tr>
        <tr><td>NORP</td><td>Nationalities or religious or political groups</td></tr>
        <tr><td>DATE</td><td>Absolute or relative dates or periods</td></tr>
        <tr><td>ORG</td><td>Companies, agencies, institutions, etc</td></tr>
        <tr><td>CARDINAL</td><td>Numerals that do not fall under another type</td></tr>
    </tbody>
</table>

In [398]:
def get_ner_pipeline(model_name:str='onto_100') -> PipelineModel:
    """
    Build and fit an NER pipeline around a pretrained NerDLModel.

    Stages: DocumentAssembler -> Tokenizer -> embeddings -> NerDLModel ->
    NerConverter. The input column is ``text``; recognised chunks are written
    to ``ner_chunk``.

    Parameters
    ----------
    model_name : str
        Pretrained NER model to load. Supported values are ``'onto_100'`` and
        ``'ner_dl'`` (paired with ``glove_100d`` embeddings) and
        ``'ner_dl_bert'`` (paired with ``bert_base_cased`` embeddings).

    Returns
    -------
    PipelineModel
        The pipeline fitted on an empty one-row frame.

    Raises
    ------
    ValueError
        If ``model_name`` has no known matching embeddings.
    """
    glove_models = ('ner_dl', 'onto_100')
    bert_models = ('ner_dl_bert',)

    # Fail fast, before any download: an unsupported name used to leave
    # `embeddings` as None, which was then placed into the stage list.
    if model_name not in glove_models + bert_models:
        raise ValueError(
            "Unsupported NER model %r; expected one of %s"
            % (model_name, ", ".join(glove_models + bert_models))
        )

    documentAssembler = DocumentAssembler() \
        .setInputCol('text') \
        .setOutputCol('document')

    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')

    # ner_dl and onto_100 model are trained with glove_100d, so the embeddings in
    # the pipeline should match
    if model_name in glove_models:
        embeddings = WordEmbeddingsModel.pretrained('glove_100d') \
            .setInputCols(["document", 'token']) \
            .setOutputCol("embeddings")

    # Bert model uses Bert embeddings
    else:
        embeddings = BertEmbeddings.pretrained(name='bert_base_cased', lang='en') \
            .setInputCols(['document', 'token']) \
            .setOutputCol('embeddings')

    ner_model = NerDLModel.pretrained(model_name, 'en') \
        .setInputCols(['document', 'token', 'embeddings']) \
        .setOutputCol('ner')

    # Merge the per-token tags in 'ner' into entity chunks.
    ner_converter = NerConverter() \
        .setInputCols(['document', 'token', 'ner']) \
        .setOutputCol('ner_chunk')

    pipeline = Pipeline(stages=[
        documentAssembler,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter
    ])

    # Fit on a one-row frame holding an empty string to obtain a PipelineModel.
    empty_ds = spark.createDataFrame([[""]]).toDF("text")
    return pipeline.fit(empty_ds)

## Conclusion
<p>From the NER experiments below, ONTO 100 seems to produce the best balance of useful entities with the least noise.</p>
<p>Methods:</p>
<ol>
    <li>ONTO 100</li>
    <li>NER DL</li>
    <li>Electra</li>
    <li>BERT</li>
    <li>Recognize Entities DL</li>
    <li>Stanford NER (NLTK)</li>
</ol>

## ONTO 100
ONTO 100 seems to produce just enough entities and very little noise. It produces about twice as many entities as Electra and Stanford NER (133 vs 65 & 79) when applying filters.

In [452]:
onto_pipeline_path = 'ner/onto_100_pipeline'
onto_pipeline:PipelineModel = None

# Reuse the pipeline saved by an earlier run; otherwise build it and cache it
# on disk so the pretrained models are not fetched again.
if os.path.exists(onto_pipeline_path):
    onto_pipeline = PretrainedPipeline.from_disk(onto_pipeline_path)
    print('Loaded pipeline from', onto_pipeline_path)
else:
    onto_pipeline = get_ner_pipeline('onto_100')
    onto_pipeline.save(onto_pipeline_path)
    print('Pipeline saved to', onto_pipeline_path)

# Use the pipeline loaded/built above. The previous `nlp_model` is not defined
# anywhere in this notebook and only worked through leftover kernel state.
onto_df = onto_pipeline.transform(spark_df)

Loaded pipeline from ner/onto_100_pipeline


In [467]:
# Collect the chunk metadata and text to the driver, filter/spellcheck the
# entities with filter_results, and write them to CSV.
onto_results = onto_df.select('ner_chunk.metadata', 'ner_chunk.result').collect()
pd.DataFrame(filter_results(onto_results)).to_csv('ner/onto_ner_results.csv')

## NER DL
Produces 208 entities with more noise than ONTO 100.

In [444]:
ner_dl_pipeline_path = 'ner/ner_dl_pipeline'
ner_dl_pipeline:PipelineModel = None
    
# Reuse the pipeline saved by an earlier run; otherwise build it and cache it
# on disk.
if os.path.exists(ner_dl_pipeline_path):
    ner_dl_pipeline = PretrainedPipeline.from_disk(ner_dl_pipeline_path)
    print('Loaded pipeline from', ner_dl_pipeline_path)
else:
    ner_dl_pipeline = get_ner_pipeline('ner_dl')
    ner_dl_pipeline.save(ner_dl_pipeline_path)
    print('Pipeline saved to', ner_dl_pipeline_path)
    
# Annotate the sample document, collect the chunk metadata and text to the
# driver, then filter/spellcheck the entities and write them to CSV.
ner_dl_df = ner_dl_pipeline.transform(spark_df)
ner_dl_results = ner_dl_df.select('ner_chunk.metadata', 'ner_chunk.result').collect()
pd.DataFrame(filter_results(ner_dl_results)).to_csv('ner/ner_dl_results.csv')

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]
Pipeline saved to ner/ner_dl_pipeline


## NER Visualizer

In [None]:
from sparknlp_display import NerVisualizer

# Render the entities found in the (single-row) sample document.
# `ner_df` was never defined in this notebook, so the cell failed on a fresh
# kernel. It now visualises the NER DL result computed in the preceding
# section; substitute `onto_df` to view the ONTO 100 entities instead.
NerVisualizer().display(
    result = ner_dl_df.collect()[0],
    label_col = 'ner_chunk',
    document_col = 'document'
)

## ELECTRA
https://nlp.johnsnowlabs.com/2021/03/23/onto_recognize_entities_electra_large_en.html<br />
ELECTRA is the most minimalistic of the pretrained pipelines, producing very little noise but perhaps not enough entities.

In [None]:
# Load the pretrained 'onto_recognize_entities_electra_large' English pipeline.
electra = PretrainedPipeline('onto_recognize_entities_electra_large', lang='en')

In [412]:
# Annotate the sample document, reading the input from the 'text' column.
annotations = electra.fullAnnotate(spark_df, column='text')

In [None]:
# This pipeline's chunks are read from the 'entities' column; collect them,
# filter/spellcheck with filter_results, and write them to CSV.
electra_results = annotations.select('entities.metadata', 'entities.result').collect()
pd.DataFrame(filter_results(electra_results)).to_csv('ner/electra_results.csv')

## BERT
https://nlp.johnsnowlabs.com/2021/03/23/recognize_entities_bert_en.html<br/>
BERT produces the most entities (325 after filtering) but a lot of noise/non-entities.

In [419]:
# Load the pretrained 'recognize_entities_bert' English pipeline and annotate
# the sample document, reading the input from the 'text' column.
bert = PretrainedPipeline('recognize_entities_bert', lang='en')
bert_annotations = bert.fullAnnotate(spark_df, column='text')

recognize_entities_bert download started this may take some time.
Approx size to download 404.6 MB
[OK!]


In [None]:
# Collect the 'entities' metadata and text, filter/spellcheck with
# filter_results, and write the result to CSV.
bert_results = bert_annotations.select('entities.metadata', 'entities.result').collect()
pd.DataFrame(filter_results(bert_results)).to_csv('ner/bert_results.csv')

### Recognize Entities DL
https://nlp.johnsnowlabs.com/2021/03/23/recognize_entities_dl_en.html<br />
Identified 156 entities with filtering. Produces more noise than ONTO 100 but much less than BERT.

In [421]:
# Load the pretrained 'recognize_entities_dl' English pipeline and annotate
# the sample document, reading the input from the 'text' column.
recognize = PretrainedPipeline('recognize_entities_dl', lang='en')
recognize_annotations = recognize.fullAnnotate(spark_df, column='text')

recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [None]:
# Collect the 'entities' metadata and text, filter/spellcheck with
# filter_results, and write the result to CSV.
recognize_results = recognize_annotations.select('entities.metadata', 'entities.result').collect()
pd.DataFrame(filter_results(recognize_results)).to_csv('ner/recognize_entities.csv')

### Stanford NER with NLTK
https://www.pythonprogramming.net/named-entity-recognition-stanford-ner-tagger/<br />
Produced 79 entities after filtering out type 'O' entities. It produced only slightly more than Electra but far fewer than ONTO 100, so it seems too minimalistic; however, it produces very little noise when type 'O' objects are ignored.

In [446]:
import nltk
from itertools import groupby
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize, sent_tokenize

stanford_root = '../stanford-ner-4.2.0/stanford-ner-2020-11-17'

# Stanford 3-class CRF tagger, run through the bundled jar.
st = StanfordNERTagger(stanford_root+'/classifiers/english.all.3class.distsim.crf.ser.gz',
                        stanford_root+'/stanford-ner-4.2.0.jar',
                        encoding='utf-8')

# Tokenise the cleaned sample text into sentences, then words, and tag every
# sentence in a single call.
sent_tokens = sent_tokenize(sample_text)
sents = [word_tokenize(sent) for sent in sent_tokens]
sent_tags = st.tag_sents(sents)

nltk_entities = []
for sent in sent_tags:
    # Consecutive tokens sharing a tag are merged into one chunk.
    for tag, chunk in groupby(sent, lambda x: x[1]):
        # Skip non-entities before doing any work; previously every 'O' chunk
        # was spellchecked and then thrown away.
        if tag == 'O':
            continue
        en = " ".join(w for w, t in chunk)
        # annotate() returns a dict keyed by output column. Joining the dict
        # itself joined its keys, so 'checked' always held the literal string
        # "checked". Join the corrected tokens instead.
        checked = " ".join(spellcheck.annotate(en)['checked'])
        nltk_entities.append({
            'type': tag,
            'original': en,
            'checked': checked
        })

pd.DataFrame(nltk_entities).to_csv('ner/nltk_results.csv')