In [1]:
! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash

Installing PySpark 3.2.3 and Spark NLP 5.4.1
setup Colab for PySpark 3.2.3 and Spark NLP 5.4.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.5/281.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.2/579.2 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:
import sparknlp
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml import Pipeline

# Create the Spark session through Spark NLP, requesting the GPU build.
spark = sparknlp.start(gpu=True)

# Print the library versions in use so the run can be reproduced later.
for label, value in (
    ("Spark NLP version", sparknlp.version()),
    ("Apache Spark version:", spark.version),
):
    print(label, value)

Spark NLP version 5.4.1
Apache Spark version: 3.2.3


In [4]:
#download training data
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.train
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/conll2003/eng.testa

In [5]:
from sparknlp.training import CoNLL

# Parse the downloaded CoNLL files into Spark DataFrames; the same reader
# is used for both splits.
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, './eng.train')
testing_data = conll_reader.readDataset(spark, './eng.testa')

In [6]:
# Row counts of each split (each count triggers a Spark job).
n_train_rows = training_data.count()
n_test_rows = testing_data.count()
print(f"(Train count: {n_train_rows} Test count: {n_test_rows})")

(Train count: 14041 Test count: 3250)


In [7]:
# Preview the training data at token level: zip the per-row annotation
# arrays position by position, then explode to one output row per token.
zipped_train_cols = F.explode(F.arrays_zip('token', 'pos', 'label')).alias("cols")

train_token_view = training_data.select(zipped_train_cols).select(
    F.col("cols.token.result").alias("token"),
    F.col("cols.pos.result").alias("pos"),
    F.col("cols.label.result").alias("ner_label"),
)
train_token_view.show(truncate=50)

+----------+---+---------+
|     token|pos|ner_label|
+----------+---+---------+
|        EU|NNP|    B-ORG|
|   rejects|VBZ|        O|
|    German| JJ|   B-MISC|
|      call| NN|        O|
|        to| TO|        O|
|   boycott| VB|        O|
|   British| JJ|   B-MISC|
|      lamb| NN|        O|
|         .|  .|        O|
|     Peter|NNP|    B-PER|
| Blackburn|NNP|    I-PER|
|  BRUSSELS|NNP|    B-LOC|
|1996-08-22| CD|        O|
|       The| DT|        O|
|  European|NNP|    B-ORG|
|Commission|NNP|    I-ORG|
|      said|VBD|        O|
|        on| IN|        O|
|  Thursday|NNP|        O|
|        it|PRP|        O|
+----------+---+---------+
only showing top 20 rows



### Training

In [8]:
# Pretrained English DeBERTa v3 (base) embeddings; reads the "document" and
# "token" columns and writes the vectors to "embeddings".
embeddings = (
    DeBertaEmbeddings.pretrained("deberta_v3_base", "en")
    .setInputCols("document", "token")
    .setOutputCol("embeddings")
)

deberta_v3_base download started this may take some time.
Approximate size to download 415 MB
[OK!]


In [9]:
# Trainable NER tagger fed by the sentence, token and embedding columns.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")       # column holding the gold NER tags
    .setOutputCol("ner")
    .setMaxEpochs(2)               # short run, kept small for the demo
    .setLr(0.002)
    .setBatchSize(16)
    .setRandomSeed(0)              # fixed seed for repeatable training
    .setVerbose(1)
    .setValidationSplit(0.15)      # fraction of training rows held out for validation
)

In [10]:
# Converts the per-token tags in "ner" into entity chunks in "ner_chunk".
ner_converter = (
    NerConverter()
    .setInputCols(['document', 'token', 'ner'])
    .setOutputCol('ner_chunk')
)

In [11]:
# Stage order matters: embeddings feed the tagger, whose tags feed the converter.
pipeline_stages = [embeddings, nerTagger, ner_converter]
ner_pipeline = Pipeline(stages=pipeline_stages)

In [12]:
%%time
ner_model = ner_pipeline.fit(training_data.limit(5000).repartition(1))

CPU times: user 1.01 s, sys: 145 ms, total: 1.15 s
Wall time: 3min 42s


### Evaluation

In [13]:
# Run the fitted pipeline over the first 1000 rows of the held-out file.
eval_subset = testing_data.limit(1000)
predictions = ner_model.transform(eval_subset)

In [14]:
# Build a token-level table with the gold label next to the predicted tag:
# zip the annotation arrays position by position, then explode to one row
# per token.
zipped_pred_cols = F.explode(F.arrays_zip('token', 'label', 'ner')).alias("cols")
per_token_columns = [
    F.col("cols.token.result").alias("token"),
    F.col("cols.label.result").alias("ground_truth"),
    F.col("cols.ner.result").alias("prediction"),
]
preds_df = predictions.select(zipped_pred_cols).select(*per_token_columns)

preds_df.show(truncate=50)

+--------------+------------+----------+
|         token|ground_truth|prediction|
+--------------+------------+----------+
|       CRICKET|           O|         O|
|             -|           O|         O|
|LEICESTERSHIRE|       B-ORG|     B-ORG|
|          TAKE|           O|         O|
|          OVER|           O|         O|
|            AT|           O|         O|
|           TOP|           O|         O|
|         AFTER|           O|         O|
|       INNINGS|           O|     B-LOC|
|       VICTORY|           O|         O|
|             .|           O|         O|
|        LONDON|       B-LOC|     B-LOC|
|    1996-08-30|           O|         O|
|          West|      B-MISC|    B-MISC|
|        Indian|      I-MISC|    I-MISC|
|   all-rounder|           O|         O|
|          Phil|       B-PER|     B-PER|
|       Simmons|       I-PER|     I-PER|
|          took|           O|         O|
|          four|           O|         O|
+--------------+------------+----------+
only showing top 20 rows

In [15]:
from sklearn.metrics import classification_report

# Collect the token-level predictions to the driver and score them per tag
# with scikit-learn.
preds_df_pd = preds_df.toPandas()
gold_tags = preds_df_pd['ground_truth']
predicted_tags = preds_df_pd['prediction']
report_text = classification_report(gold_tags, predicted_tags)
print(report_text)

              precision    recall  f1-score   support

       B-LOC       0.92      0.89      0.91       559
      B-MISC       0.75      0.84      0.79       190
       B-ORG       0.81      0.85      0.83       355
       B-PER       0.99      0.97      0.98       654
       I-LOC       0.87      0.65      0.74        69
      I-MISC       0.82      0.75      0.79        93
       I-ORG       0.83      0.83      0.83       181
       I-PER       1.00      0.97      0.99       443
           O       1.00      1.00      1.00     11589

    accuracy                           0.98     14133
   macro avg       0.89      0.86      0.87     14133
weighted avg       0.98      0.98      0.98     14133

