<a href="https://colab.research.google.com/github/is5558/colab_samples/blob/main/tutorials/streamlit_notebooks/SPELL_CHECKER_EN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/SPELL_CHECKER_EN.ipynb)




# **Spell check your text documents**

## 1. Colab Setup

Install dependencies

In [1]:
# Install PySpark and Spark NLP
! pip install -q pyspark spark-nlp

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/718.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m716.8/718.9 kB[0m [31m30.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.9/718.9 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

def initialize_spark_nlp():
    """Start a Spark NLP session, print the library version, and return the session.

    Returns:
        The object returned by ``sparknlp.start()``.

    Raises:
        Exception: any failure while starting the session or reporting the
            version is printed and then re-raised unchanged.
    """
    try:
        session = sparknlp.start()
        version = sparknlp.version()
        print("Spark NLP version:", version)
    except Exception as err:
        # Report the problem for the notebook reader, then let the caller decide.
        print("Error initializing Spark NLP session:", str(err))
        raise
    else:
        return session

def load_pipeline(pipeline_name='check_spelling', lang='en'):
    """Build a ``PretrainedPipeline`` for the given pipeline name and language.

    Args:
        pipeline_name: name of the pretrained pipeline to load.
        lang: language code forwarded as the ``lang`` keyword.

    Returns:
        The constructed ``PretrainedPipeline``.

    Raises:
        Exception: any construction failure is printed (with the pipeline
            name) and then re-raised unchanged.
    """
    try:
        pipeline = PretrainedPipeline(pipeline_name, lang=lang)
    except Exception as err:
        print(f"Error loading pipeline '{pipeline_name}':", str(err))
        raise
    return pipeline

def get_corrected_text(annotations):
    """Join the spell-checked tokens back into readable text.

    Args:
        annotations: mapping with a 'checked' entry holding an iterable of
            objects that expose the corrected token as ``.result``.

    Returns:
        The corrected tokens joined by single spaces, with the space in front
        of a punctuation token (``, . ! ? ; :``) removed so the mark attaches
        to the preceding word. Returns "" after printing a message when the
        'checked' key is missing.
    """
    # Guard only the lookup, so a KeyError raised elsewhere is not misreported
    # as a missing 'checked' key.
    try:
        checked = annotations['checked']
    except KeyError:
        print("Error: 'checked' key not found in annotations.")
        return ""

    corrected = " ".join(token.result for token in checked)

    # Punctuation arrives as separate tokens, so the join leaves a space
    # before each mark; remove it for every common trailing mark.
    for mark in ",.!?;:":
        corrected = corrected.replace(f" {mark}", mark)
    return corrected

def main(text=None):
    """Spell-check a piece of text with the pretrained pipeline and print the result.

    Args:
        text: the text to correct. When omitted, a built-in sample paragraph
            containing misspelled words is used.

    Any exception raised along the way is reported on stdout instead of being
    propagated, so the call itself never raises an ``Exception``.
    """
    if text is None:
        text = (
            "Yesturday, I went to the libary to borow a book about anciant civilizations. "
            "The wether was pleasent, so I decidid to walk insted of taking the buss. On the way, "
            "I saw a restuarent that lookt intresting, and I plan to viset it soon."
        )

    try:
        # Initialize Spark NLP and load the pipeline. The session handle is
        # not used below, so it is not kept.
        initialize_spark_nlp()
        pipeline = load_pipeline()

        # Annotate text; keep the first entry of the returned sequence.
        annotations = pipeline.fullAnnotate(text)[0]

        # Get and print corrected text
        corrected_text = get_corrected_text(annotations)
        print("*"*77)
        print("Original Text:\n", text)
        print("Corrected Text:\n", corrected_text)
        print("*"*77)

    except Exception as e:
        print("An unexpected error occurred:", str(e))

# Run the spell-check demo when this cell is executed.
main()

Spark NLP version: 6.0.4
check_spelling download started this may take some time.
Approx size to download 884.9 KB
[OK!]
*****************************************************************************
Original Text:
 Yesturday, I went to the libary to borow a book about anciant civilizations. The wether was pleasent, so I decidid to walk insted of taking the buss. On the way, I saw a restuarent that lookt intresting, and I plan to viset it soon.
Corrected Text:
 Yesterday, I went to the library to borrow a book about ancient civilizations. The whether was pleasant, so I decided to walk instead of taking the bus. On the way, I saw a restuarent that looks interesting, and I plan to visit it soon.
*****************************************************************************


In [2]:
# Download the Spark NLP 5.1.3 assembly jar and save it in the current
# directory as spark-nlp-5.1.3.jar (about 676 MB per the recorded output).
# NOTE(review): the pip cell installs an unpinned spark-nlp and the recorded
# outputs report version 6.0.x, while this jar is 5.1.3 — confirm that the
# Python package and the jar are meant to be different versions.
! wget https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/jars/spark-nlp-assembly-5.1.3.jar -O spark-nlp-5.1.3.jar


--2025-07-10 13:26:35--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/jars/spark-nlp-assembly-5.1.3.jar
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.199.8, 54.231.160.248, 54.231.236.224, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.199.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 708534094 (676M) [application/java-archive]
Saving to: ‘spark-nlp-5.1.3.jar’


2025-07-10 13:26:48 (51.5 MB/s) - ‘spark-nlp-5.1.3.jar’ saved [708534094/708534094]



In [6]:
import sparknlp
from pyspark.sql import SparkSession
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import *
from pyspark.ml import Pipeline

def initialize_spark_nlp(jar_path="/content/spark-nlp-5.1.3.jar"):
    """Get or create a SparkSession configured to load the Spark NLP jar.

    Args:
        jar_path: path of the Spark NLP jar passed as the ``spark.jars``
            setting. Defaults to ``/content/spark-nlp-5.1.3.jar``.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    # NOTE(review): getOrCreate() presumably returns an already-running session
    # if one exists (e.g. one started by an earlier cell), in which case the
    # spark.jars setting may not take effect — confirm on a fresh kernel.
    builder = (
        SparkSession.builder
        .appName("check_spelling")
        .config("spark.jars", jar_path)
    )
    return builder.getOrCreate()

def main():
    """Run a Norvig-Sweeting spell-check pipeline on one sample sentence.

    Obtains a Spark session, prints the Spark NLP version and a separator,
    builds a one-row DataFrame with a ``text`` column, chains a document
    assembler, tokenizer and pretrained spell model into a pipeline, and
    shows the ``spell.result`` column untruncated.
    """
    session = initialize_spark_nlp()
    print("Spark NLP version:", sparknlp.version())
    print("*" * 77)

    sample_rows = [["Yesturday, I went to the libary and saw elefants."]]
    input_df = session.createDataFrame(sample_rows).toDF("text")

    # Each stage reads the column produced by the stage before it.
    stages = [
        DocumentAssembler().setInputCol("text").setOutputCol("document"),
        Tokenizer().setInputCols(["document"]).setOutputCol("token"),
        NorvigSweetingModel.pretrained().setInputCols(["token"]).setOutputCol("spell"),
    ]

    fitted = Pipeline(stages=stages).fit(input_df)
    corrected_df = fitted.transform(input_df)
    corrected_df.select("spell.result").show(truncate=False)

# Entry-point guard: call main() only when this code runs as the top-level
# program; the recorded output below shows it does run when the cell executes.
if __name__ == "__main__":
    main()

Spark NLP version: 6.0.5
*****************************************************************************
spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]
+----------------------------------------------------------------+
|result                                                          |
+----------------------------------------------------------------+
|[Yesterday, ,, I, went, to, the, library, and, saw, elefants, .]|
+----------------------------------------------------------------+

