

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SPELL_CHECKER_EN.ipynb)




# **Spell check your text documents**

## 1. Colab Setup

Install dependencies

In [1]:
# Install PySpark and Spark NLP
! pip install -q pyspark spark-nlp

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/718.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/718.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.8/718.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

Import dependencies

In [24]:
import sparknlp
# ContextSpellCheckerModel must be imported here: on a fresh kernel this cell
# runs before the notebook's main import cell, so relying on that later
# wildcard import raises NameError under "Restart & Run All".
from sparknlp.annotator import ContextSpellCheckerModel
from sparknlp.pretrained import PretrainedPipeline

spark = sparknlp.start()

# Ready-made English spell-checking pipeline; the loop below reads its
# "checked" output column.
pipeline1 = PretrainedPipeline("check_spelling", lang="en")

# Pretrained context-aware spell checker. Despite the variable name this is a
# single annotator stage, not a pipeline: it reads "token" and writes
# "corrected", and has to be placed inside a Pipeline (as the
# pipeline-construction cell later in this notebook does) before it can be
# applied to text.
pipeline2 = (
    ContextSpellCheckerModel.pretrained('spellcheck_dl', lang="en")
    .setInputCols("token")
    .setOutputCol("corrected")
)

text = "Lets uss fixx the spellling mistakess in this sentense."
result1 = pipeline1.fullAnnotate(text)

# fullAnnotate returns one result per input text; print every checked token
# together with its begin/end character offsets.
for token in result1[0]["checked"]:
    print(f"{token.result} ({token.begin}-{token.end})")

check_spelling download started this may take some time.
Approx size to download 884.9 KB
[OK!]
spellcheck_dl download started this may take some time.
Approximate size to download 95.1 MB
[OK!]
Lets (0-3)
uss (5-7)
fix (9-12)
the (14-16)
spelling (18-26)
mistake (28-36)
in (38-39)
this (41-44)
sent (46-53)
. (54-54)


In [13]:
import json
import pandas as pd
import numpy as np
import pyspark

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

Start Spark Session

In [18]:
# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the versions of the libraries in use, one per line.
# The last entry is the version of the installed pyspark Python package.
version_report = (
    ("Spark NLP version", sparknlp.version()),
    ("Apache Spark version:", spark.version),
    ("Spark version:", pyspark.__version__),
)
for label, version in version_report:
    print(label, version)



Spark NLP version 6.0.4
Apache Spark version: 3.5.1
Spark version: 3.5.1


## 2. Select the spell checker model and construct the pipeline

In [4]:
# Stage 1: read the "text" column and emit "document" annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: tokenize documents, configured with explicit prefix and suffix lists.
# NOTE(review): the last suffix is written with a left single quotation mark
# (‘s) rather than an apostrophe ('s) — confirm this is intentional.
tokenizer = (
    RecursiveTokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
    .setPrefixes(["\"", "(", "[", "\n"])
    .setSuffixes([".", ",", "?", ")", "!", "‘s"])
)

# Stage 3: pretrained context spell checker, reading "token" and writing "corrected".
spell_model = (
    ContextSpellCheckerModel.pretrained('spellcheck_dl')
    .setInputCols("token")
    .setOutputCol("corrected")
)

# Optional last stage, applied to the "corrected" column in the light pipeline only.
finisher = Finisher().setInputCols("corrected")

# Both pipelines share the same three annotation stages.
annotation_stages = [document_assembler, tokenizer, spell_model]

# Light variant: annotation stages followed by the finisher.
light_pipeline = Pipeline(stages=annotation_stages + [finisher])

# Full variant, kept for comparison: annotation stages only.
full_pipeline = Pipeline(stages=annotation_stages)

# fit() needs a DataFrame argument; a one-row placeholder with an empty
# "text" value is supplied for both pipelines.
empty_ds = spark.createDataFrame([[""]]).toDF("text")
pipeline_model = full_pipeline.fit(empty_ds)
l_pipeline_model = LightPipeline(light_pipeline.fit(empty_ds))

spellcheck_dl download started this may take some time.
Approximate size to download 95.1 MB
[OK!]


## 3. Create example inputs

In [5]:
# Example sentences to spell check — add more strings to this list as needed.
input_list = [
    "Plaese alliow me tao introdduce myhelf, I am a man of waelth und tiaste",
]

## 4. Use the pipeline to create outputs

Full Pipeline

In [6]:
# Put the example strings into a one-column ("text") pandas frame, convert it
# to a Spark DataFrame, and run it through the fitted full pipeline.
examples_pdf = pd.DataFrame({"text": input_list})
df = spark.createDataFrame(examples_pdf)
result = pipeline_model.transform(df)

Light Pipeline

In [7]:
# Annotate only the first example string with the light pipeline; the result
# is looked up by output-column name in the visualization section below.
first_example = input_list[0]
light_result = l_pipeline_model.annotate(first_example)

## 5. Visualize results

Visualize comparison as dataframe

In [8]:
# Zip the original tokens with their corrections so each pair becomes one struct.
token_pairs = F.arrays_zip(result.token.result, result.corrected.result)

# One row per (original, corrected) pair.
exploded_pairs = result.select(F.explode(token_pairs).alias("cols"))

# The zipped struct's fields are addressed positionally as '0' and '1'.
comparison = exploded_pairs.select(
    F.expr("cols['0']").alias("original"),
    F.expr("cols['1']").alias('corrected'),
)
comparison.show(truncate=False)

+----------+---------+
|original  |corrected|
+----------+---------+
|Plaese    |Please   |
|alliow    |allow    |
|me        |me       |
|tao       |to       |
|introdduce|introduce|
|myhelf    |myself   |
|,         |,        |
|I         |I        |
|am        |am       |
|a         |a        |
|man       |man      |
|of        |of       |
|waelth    |wealth   |
|und       |and      |
|tiaste    |taste    |
+----------+---------+



Visualize light pipeline and finished result

In [9]:
# The finished "corrected" entry is already a plain list of strings, so it
# needs no further parsing and can be used directly in any other task.
light_result['corrected']

['Please',
 'allow',
 'me',
 'to',
 'introduce',
 'myself',
 ',',
 'I',
 'am',
 'a',
 'man',
 'of',
 'wealth',
 'and',
 'taste']