# Named Entity Recognition (NER) on Twitter

In these notebooks, I will use five approaches to solve a custom Named Entity Recognition (NER) problem on Twitter data. NER is a task that seeks to locate and classify named entities mentioned in unstructured text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.

In this dataset, we have 21 different tags that label the tokens of each sentence.

tags = ['O', 'B-musicartist', 'I-musicartist', 'B-product', 'I-product', 'B-company', 'B-person', 'B-other', 'I-other', 'B-facility',
    'I-facility', 'B-sportsteam', 'B-geo-loc', 'I-geo-loc', 'I-company', 'I-person', 'B-movie', 'I-movie', 'B-tvshow', 'I-tvshow',
    'I-sportsteam'],

where the 'B-' and 'I-' prefixes mark the beginning and the inside of an entity, and 'O' marks a token that is outside any entity (no tag).



### Models

In the following three notebooks, we will use five ways to examine the dataset.

- Naive Bayes multinomial model
- Conditional Random Fields (CRFs)
- Custom SpaCy
- <mark>BERT in Spark NLP</mark>
- Simple Transformer 

In this notebook we will discuss BERT in Spark NLP. Tokens are embedded with 'bert_base_cased'; the neural network architecture behind NerDLApproach is Char CNNs - BiLSTM - CRF.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Print the Python interpreter version of this runtime (IPython shell escape).
!python -V

#### Installation (for Google Colab)

In [None]:
import os

# Install a headless Java 8 JDK (apt output is discarded), then point JAVA_HOME
# at it and put its bin/ directory first on PATH so `java` resolves to this JDK.
# The two environment assignments must stay in this order: PATH is built from
# the JAVA_HOME value set on the line before it.
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# Print the Java version that is now first on PATH.
! java -version

# Install a pinned PySpark release.
! pip install --ignore-installed pyspark==2.4.4

# Install a pinned Spark NLP Python package.
# NOTE(review): keep this version in sync with the spark-nlp jar version
# requested when the Spark session is built.
! pip install --ignore-installed spark-nlp==2.5.4

#### Import libraries and datasets

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [None]:
# Start a Spark session through Spark NLP's own helper, passing gpu=True.
spark = sparknlp.start(gpu=True)
# Report the Spark NLP and Apache Spark versions in use for this run.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

In [None]:
def start(gpu=False, spark_nlp_version="2.5.4"):
    """Build (or fetch) a local Spark session configured for Spark NLP.

    Args:
        gpu: When true, request the ``spark-nlp-gpu`` jar; otherwise request
            the regular ``spark-nlp`` jar.
        spark_nlp_version: Version of the Spark NLP jar to request. It
            defaults to the version of the ``spark-nlp`` Python package
            installed by this notebook, so the jar and the Python wrapper
            agree (the jar was previously pinned to 2.5.1 while 2.5.4 was
            installed).

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.

    NOTE(review): if a Spark session already exists in this process,
    ``getOrCreate()`` presumably returns it and some of these settings may
    not be applied -- confirm when calling this after another session start.
    """
    artifact = "spark-nlp-gpu_2.11" if gpu else "spark-nlp_2.11"
    package = "com.johnsnowlabs.nlp:{}:{}".format(artifact, spark_nlp_version)

    builder = (
        SparkSession.builder
        .appName("Spark NLP")
        .master("local[*]")
        .config("spark.driver.memory", "8G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1000M")
        .config("spark.jars.packages", package)
    )
    return builder.getOrCreate()

  
# Rebind `spark` to the session produced by the local start() helper.
# NOTE(review): `spark` was already created by sparknlp.start() in an earlier
# cell; getOrCreate() inside start() presumably returns that existing session,
# in which case the settings configured in start() may not take effect -- confirm.
spark = start(gpu=True)

To match the CoNLL dataset format, we add a new column "pos" to "train.txt" and "test.txt". All cells in the "pos" column are "NNP" (we won't use this column later).

In [None]:
from sparknlp.training import CoNLL

# Read the CoNLL-formatted training file into a Spark DataFrame.
# NOTE(review): the path is relative and lives under "./drive/My Drive";
# presumably Google Drive must be mounted first -- confirm.
train_data = CoNLL().readDataset(spark, './drive/My Drive/NER/data/train1.txt')
# Preview the first row.
train_data.show(1)

In [None]:
# Row count of the training DataFrame (shown as the cell output).
train_data.count()

#### Loading Bert with poolingLayer -2
- setPoolingLayer(-2) is better than setPoolingLayer(0)

In [None]:
# BERT embeddings annotator: reads the "sentence" and "token" columns and
# writes the embeddings to the "bert" column, using pooling layer -2.
# The checkpoint is the *cased* BERT model, so case sensitivity is enabled:
# with it disabled, the input would presumably be lower-cased before lookup
# and no longer match the model's cased vocabulary (the previous setting was
# False) -- verify against the BertEmbeddings documentation.
bert_annotator = BertEmbeddings.pretrained('bert_base_cased', 'en') \
 .setInputCols(["sentence",'token'])\
 .setOutputCol("bert")\
 .setCaseSensitive(True)\
 .setPoolingLayer(-2)

Transform test_data 

In [None]:
from sparknlp.training import CoNLL

# Read the CoNLL-formatted test file into a Spark DataFrame.
test_data = CoNLL().readDataset(spark, './drive/My Drive/NER/data/test1.txt')

# Run the BERT annotator over the test set so it carries the "bert" column.
test_data = bert_annotator.transform(test_data)

# Preview the first three rows.
test_data.show(3)

In [None]:
# Row count of the test DataFrame (shown as the cell output).
test_data.count()

In [None]:
# Persist the test set (including its "bert" embeddings column) as parquet.
# Overwrite any previous export so that re-running this cell does not fail
# because the output path already exists.
test_data.write.mode("overwrite").parquet("test_withEmbeds.parquet")

#### Build Model

In [None]:
# Configure the NerDLApproach trainer.
#   - Inputs: the "sentence", "token" and "bert" columns; gold tags come from
#     the "label" column; predictions are written to the "ner" column.
#   - Training: at most 15 epochs, learning rate 0.001, batch size 8,
#     random seed 0, and a 0.2 validation split.
#   - setPo(0.005): presumably the learning-rate decay coefficient --
#     confirm against the NerDLApproach documentation.
#   - Logging: verbose level 1 with extended evaluation logs and output logs
#     enabled; confidence scores are included in the output.
#   - Test dataset: the parquet path "test_withEmbeds.parquet", i.e. the same
#     file name the embedded test set is written to elsewhere in this notebook.
# (Comments are kept above the chain because a backslash-continued line
# cannot carry an inline comment.)
nerTagger = NerDLApproach()\
  .setInputCols(["sentence", "token", "bert"])\
  .setLabelColumn("label")\
  .setOutputCol("ner")\
  .setMaxEpochs(15)\
  .setLr(0.001)\
  .setPo(0.005)\
  .setBatchSize(8)\
  .setRandomSeed(0)\
  .setVerbose(1)\
  .setValidationSplit(0.2)\
  .setEvaluationLogExtended(True) \
  .setEnableOutputLogs(True)\
  .setIncludeConfidence(True)\
  .setTestDataset("test_withEmbeds.parquet")

In [None]:
# Two-stage pipeline: the BERT embeddings annotator runs first, then the
# NER trainer that consumes its output.
pipeline = Pipeline(stages=[bert_annotator, nerTagger])

#### Train Model

In [None]:
import time

# Fit the pipeline on the training data and print the elapsed wall-clock
# seconds. The timestamp uses its own name: binding it to `start` would
# shadow the start() session helper defined earlier in this notebook, and
# re-running that cell afterwards would fail.
fit_started_at = time.time()
ner_model = pipeline.fit(train_data)
print(time.time() - fit_started_at)

#### Evaluation

In [None]:
import pyspark.sql.functions as F

# Time the training-set prediction and its conversion to pandas. The
# timestamp uses its own name: binding it to `start` would shadow the
# start() session helper defined earlier in this notebook.
eval_started_at = time.time()
predictions_train = ner_model.transform(train_data)

# Zip the per-sentence token / gold-label / predicted-label arrays together,
# explode them to one row per token, and collect the result as a pandas
# DataFrame with columns "token", "ground_truth" and "prediction".
df_train = predictions_train.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction")).toPandas()
print(time.time() - eval_started_at)

In [None]:
# Run the fitted pipeline over the test set.
# NOTE(review): test_data already carries a "bert" column from the earlier
# bert_annotator.transform() call, and the pipeline's first stage produces
# that column again -- confirm this re-computation is intended.
predictions_test = ner_model.transform(test_data)

import pyspark.sql.functions as F

# Zip the per-sentence token / gold-label / predicted-label arrays together,
# explode them to one row per token, and collect the result as a pandas
# DataFrame with columns "token", "ground_truth" and "prediction".
df_test = predictions_test.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction")).toPandas()

In [None]:
from sklearn.metrics import classification_report
import numpy as np

In [None]:
# Per-token gold labels and predictions for the training set, as arrays.
y_train = df_train["ground_truth"].values
y_train_pred = df_train["prediction"].values

# Sorted list of the distinct gold labels present in the training set.
classes = np.unique(y_train).tolist()

In [None]:
# Per-token gold labels and predictions for the test set, as arrays.
y_test = df_test["ground_truth"].values
y_test_pred = df_test["prediction"].values

In [None]:
# Entity labels only: drop the 'O' (no-entity) tag by name. Slicing off the
# last element only worked because 'O' happens to sort after every 'B-...'
# and 'I-...' label; filtering by name stays correct even if 'O' is absent
# or the ordering of `classes` changes.
new_classes = [label for label in classes if label != 'O']
new_classes

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 over every label in `classes`, for train and test.
# NOTE(review): `classes` still contains the 'O' tag, so these scores include
# non-entity tokens, unlike reports that use `new_classes` -- confirm that
# this is the intended metric.
print('-' * 20 + ' Train set quality: ' + '-' * 20)
print(f1_score(y_train, y_train_pred, labels=classes, average='micro'))
print('-' * 20 + ' Test set quality: ' + '-' * 20)
print(f1_score(y_test, y_test_pred, labels=classes, average='micro'))

In [None]:
# Per-label precision / recall / F1 tables restricted to the labels in
# `new_classes`, first for the training set and then for the test set.
print('-' * 20 + ' Train set quality: ' + '-' * 20)
print(classification_report(y_train, y_train_pred, labels=new_classes))
print('-' * 20 + ' Test set quality: ' + '-' * 20)
print(classification_report(y_test, y_test_pred, labels=new_classes))