# Spark NLP Basics and Pretrained Pipelines

# Load dependent libraries and jars

In [1]:
import $ivy.`org.apache.spark::spark-sql:3.0.1`
import $ivy.`org.apache.spark::spark-mllib:3.0.1`
import $ivy.`org.apache.spark:spark-avro_2.12:3.0.1`
import $ivy.`com.johnsnowlabs.nlp::spark-nlp:3.1.0`
// NOTE(review): this puts the filesystem root ("/") on the interpreter classpath —
// presumably a placeholder for a directory of extra jars; confirm it is intended.
interp.load.cp(os.Path("/"))

[32mimport [39m[36m$ivy.$                                  
[39m
[32mimport [39m[36m$ivy.$                                    
[39m
[32mimport [39m[36m$ivy.$                                       
[39m
[32mimport [39m[36m$ivy.$                                      
[39m

# Create spark session and context

In [2]:
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.NotebookSparkSession
import org.apache.spark.SparkContext

// Turn off log output for every logger under the "org" namespace before the
// session starts, so Spark's start-up logging does not flood the notebook.
Logger.getLogger("org").setLevel(Level.OFF)

// Local session for this notebook; the builder is kept in a block so that
// `sparkSession` is the only name this step adds to the notebook scope.
val sparkSession = {
    val localBuilder = NotebookSparkSession.builder().master("local[*]").appName("My App")
    localBuilder.getOrCreate()
}

// Expose the session's SparkContext implicitly for code that asks for one.
implicit def sparkContext: SparkContext = sparkSession.sparkContext

import sparkSession.implicits._

Loading spark-stubs
Creating SparkSession


[32mimport [39m[36morg.apache.log4j.{Level, Logger}
[39m
[32mimport [39m[36morg.apache.spark.sql.NotebookSparkSession
[39m
[32mimport [39m[36morg.apache.spark.SparkContext

[39m
[36msparkSession[39m: [32morg[39m.[32mapache[39m.[32mspark[39m.[32msql[39m.[32mSparkSession[39m = org.apache.spark.sql.SparkSession@18f79ec4
defined [32mfunction[39m [36msparkContext[39m
[32mimport [39m[36msparkSession.implicits._[39m

### Explain Document ML

**Stages**
- DocumentAssembler
- SentenceDetector
- Tokenizer
- Lemmatizer
- Stemmer
- Part of Speech
- SpellChecker (Norvig)

In [3]:
import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline

// Load the English "explain_document_ml" pretrained pipeline
// (this triggers a download, as the log output below shows).
val pipeline = PretrainedPipeline("explain_document_ml", lang="en")

explain_document_ml download started this may take some time.
Approximate size to download 9.1 MB
Download done! Loading the resource.


[32mimport [39m[36mcom.johnsnowlabs.nlp.pretrained.PretrainedPipeline

[39m
[36mpipeline[39m: [32mPretrainedPipeline[39m = [33mPretrainedPipeline[39m(
  [32m"explain_document_ml"[39m,
  [32m"en"[39m,
  [32m"public/models"[39m,
  false,
  [32mNone[39m
)

In [4]:
// Show the transformer stages that make up the loaded pipeline.
pipeline.model.stages

[36mres3[39m: [32mArray[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mTransformer[39m] = [33mArray[39m(
  document_2ec0b742eccd,
  SENTENCE_98fb8e28cb7b,
  REGEX_TOKENIZER_4428d18fe84e,
  SPELL_e4ea67180337,
  LEMMATIZER_c62ad8f355f9,
  STEMMER_75edcc4a9cdb,
  POS_3df318ebb81c
)

In [4]:
// Load pretrained pipeline from local disk:
// import java.lang.System
// val userHome = System.getProperty("user.home")

// val pipeline_local = PretrainedPipeline.fromDisk(s"$userHome/cache_pretrained/explain_document_ml_en_3.1.3_3.0_1632168876620")

In [5]:
// Sample text used throughout the notebook. The misspellings ("persn",
// "intersting", "brthers") are input for the spell-check stages, whose
// output below shows them corrected — do not fix them here.
val testDoc = """
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""

[36mtestDoc[39m: [32mString[39m = [32m"""
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""[39m

In [6]:
// Run the whole pipeline on the raw string. The result maps each annotation
// name to the list of string values produced for it.
val result = pipeline.annotate(testDoc)

[36mresult[39m: [32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]] = [33mMap[39m(
  [32m"document"[39m -> [33mList[39m(
    [32m"""
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""[39m
  ),
  [32m"spell"[39m -> [33mList[39m(
    [32m"Peter"[39m,
    [32m"is"[39m,
    [32m"a"[39m,
    [32m"very"[39m,
    [32m"good"[39m,
    [32m"person"[39m,
    [32m"."[39m,
    [32m"My"[39m,
    [32m"life"[39m,
    [32m"in"[39m,
    [32m"Russia"[39m,
    [32m"is"[39m,
    [32m"very"[39m,
    [32m"interesting"[39m,
    [32m"."[39m,
    [32m"John"[39m,
    [32m"and"[39m,
    [32m"Peter"[39m,
    [32m"are"[39m,
    [32m"brothers"[39m,
    [32m"."[39m,
    [32m"However"[39m,
    [32m"they"[39m,
    [32m"do

In [7]:
// List the annotation names available in the result map.
result.keySet

[36mres6[39m: [32mSet[39m[[32mString[39m] = [33mSet[39m(
  [32m"document"[39m,
  [32m"spell"[39m,
  [32m"pos"[39m,
  [32m"lemmas"[39m,
  [32m"token"[39m,
  [32m"stems"[39m,
  [32m"sentence"[39m
)

In [8]:
// Sentences found in the text; `get` returns an Option because the key may be absent.
result.get("sentence")

[36mres7[39m: [32mOption[39m[[32mSeq[39m[[32mString[39m]] = [33mSome[39m(
  [33mList[39m(
    [32m"Peter is a very good persn."[39m,
    [32m"My life in Russia is very intersting."[39m,
    [32m"John and Peter are brthers."[39m,
    [32m"However they don't support each other that much."[39m,
    [32m"Lucas Dunbercker is no longer happy."[39m,
    [32m"He has a good car though."[39m,
    [32m"Europe is very culture rich."[39m,
    [32m"There are huge churches!"[39m,
    [32m"and big houses!"[39m
  )
)

In [9]:
// Raw tokens, i.e. before spell correction (the misspelled words are still present).
result.get("token")

[36mres8[39m: [32mOption[39m[[32mSeq[39m[[32mString[39m]] = [33mSome[39m(
  [33mList[39m(
    [32m"Peter"[39m,
    [32m"is"[39m,
    [32m"a"[39m,
    [32m"very"[39m,
    [32m"good"[39m,
    [32m"persn"[39m,
    [32m"."[39m,
    [32m"My"[39m,
    [32m"life"[39m,
    [32m"in"[39m,
    [32m"Russia"[39m,
    [32m"is"[39m,
    [32m"very"[39m,
    [32m"intersting"[39m,
    [32m"."[39m,
    [32m"John"[39m,
    [32m"and"[39m,
    [32m"Peter"[39m,
    [32m"are"[39m,
    [32m"brthers"[39m,
    [32m"."[39m,
    [32m"However"[39m,
    [32m"they"[39m,
    [32m"don't"[39m,
    [32m"support"[39m,
    [32m"each"[39m,
    [32m"other"[39m,
    [32m"that"[39m,
    [32m"much"[39m,
    [32m"."[39m,
    [32m"Lucas"[39m,
    [32m"Dunbercker"[39m,
    [32m"is"[39m,
    [32m"no"[39m,
    [32m"longer"[39m,
    [32m"happy"[39m,
    [32m"."[39m,
...

In [10]:
// Pair every token with its part-of-speech tag.
// The result is None if either annotation is missing from `result`.
for {
    tokens <- result.get("token")
    tags   <- result.get("pos")
} yield tokens zip tags

[36mres9[39m: [32mOption[39m[[32mSeq[39m[([32mString[39m, [32mString[39m)]] = [33mSome[39m(
  [33mList[39m(
    ([32m"Peter"[39m, [32m"NNP"[39m),
    ([32m"is"[39m, [32m"VBZ"[39m),
    ([32m"a"[39m, [32m"DT"[39m),
    ([32m"very"[39m, [32m"RB"[39m),
    ([32m"good"[39m, [32m"JJ"[39m),
    ([32m"persn"[39m, [32m"NN"[39m),
    ([32m"."[39m, [32m"."[39m),
    ([32m"My"[39m, [32m"PRP$"[39m),
    ([32m"life"[39m, [32m"NN"[39m),
    ([32m"in"[39m, [32m"IN"[39m),
    ([32m"Russia"[39m, [32m"NNP"[39m),
    ([32m"is"[39m, [32m"VBZ"[39m),
    ([32m"very"[39m, [32m"RB"[39m),
    ([32m"intersting"[39m, [32m"VBG"[39m),
    ([32m"."[39m, [32m"."[39m),
    ([32m"John"[39m, [32m"NNP"[39m),
    ([32m"and"[39m, [32m"CC"[39m),
    ([32m"Peter"[39m, [32m"NNP"[39m),
    ([32m"are"[39m, [32m"VBP"[39m),
    ([32m"brthers"[39m, [32m"NNS"[39m),
    ([32m"."[39m, [32m"."[39m),
    ([32m"However"[39m, [32m"RB"[3

In [11]:
// Line up token, lemma, stem and spell-corrected form for each position and
// flatten the nested pairs produced by the chained zips into 4-tuples.
for {
    tokens <- result.get("token")
    lemmas <- result.get("lemmas")
    stems  <- result.get("stems")
    spells <- result.get("spell")
} yield tokens.zip(lemmas).zip(stems).zip(spells).map {
    case (((token, lemma), stem), spell) => (token, lemma, stem, spell)
}

[36mres10[39m: [32mOption[39m[[32mSeq[39m[([32mString[39m, [32mString[39m, [32mString[39m, [32mString[39m)]] = [33mSome[39m(
  [33mList[39m(
    ([32m"Peter"[39m, [32m"Peter"[39m, [32m"peter"[39m, [32m"Peter"[39m),
    ([32m"is"[39m, [32m"be"[39m, [32m"i"[39m, [32m"is"[39m),
    ([32m"a"[39m, [32m"a"[39m, [32m"a"[39m, [32m"a"[39m),
    ([32m"very"[39m, [32m"very"[39m, [32m"veri"[39m, [32m"very"[39m),
    ([32m"good"[39m, [32m"good"[39m, [32m"good"[39m, [32m"good"[39m),
    ([32m"persn"[39m, [32m"person"[39m, [32m"person"[39m, [32m"person"[39m),
    ([32m"."[39m, [32m"."[39m, [32m"."[39m, [32m"."[39m),
    ([32m"My"[39m, [32m"My"[39m, [32m"my"[39m, [32m"My"[39m),
    ([32m"life"[39m, [32m"life"[39m, [32m"life"[39m, [32m"life"[39m),
    ([32m"in"[39m, [32m"in"[39m, [32m"in"[39m, [32m"in"[39m),
    ([32m"Russia"[39m, [32m"Russia"[39m, [32m"russia"[39m, [32m"Russia"[39m),
    ([32m"is

In [12]:
// Build one row per token: (token, spell-corrected token, POS tag, lemma, stem).
// Each lookup yields an Option, so dfOpt is None as soon as one annotation key is absent.
val dfOpt = for {
    t <- result.get("token")
    c <- result.get("spell")
    p <- result.get("pos")
    l <- result.get("lemmas")
    s <- result.get("stems")
} yield t.zip(c).zip(p).zip(l).zip(s).map {
    case ((((token, corrected), pos), lemma), stem) => (token, corrected, pos, lemma, stem)
}

// Unwrap without Option#get: a missing annotation still raises
// NoSuchElementException, but the message now names the expected keys and
// the ones actually present instead of the opaque "None.get".
val df = dfOpt
    .map(rows => sparkSession.createDataFrame(rows).toDF("token", "corrected", "POS", "lemmas", "stems"))
    .getOrElse(throw new NoSuchElementException(
        "expected annotations token, spell, pos, lemmas, stems but found: " + result.keySet.mkString(", ")))
df.show()

+----------+-----------+----+--------+--------+
|     token|  corrected| POS|  lemmas|   stems|
+----------+-----------+----+--------+--------+
|     Peter|      Peter| NNP|   Peter|   peter|
|        is|         is| VBZ|      be|       i|
|         a|          a|  DT|       a|       a|
|      very|       very|  RB|    very|    veri|
|      good|       good|  JJ|    good|    good|
|     persn|     person|  NN|  person|  person|
|         .|          .|   .|       .|       .|
|        My|         My|PRP$|      My|      my|
|      life|       life|  NN|    life|    life|
|        in|         in|  IN|      in|      in|
|    Russia|     Russia| NNP|  Russia|  russia|
|        is|         is| VBZ|      be|       i|
|      very|       very|  RB|    very|    veri|
|intersting|interesting| VBG|interest|interest|
|         .|          .|   .|       .|       .|
|      John|       John| NNP|    John|    john|
|       and|        and|  CC|     and|     and|
|     Peter|      Peter| NNP|   Peter|  

[36mdfOpt[39m: [32mOption[39m[[32mSeq[39m[([32mString[39m, [32mString[39m, [32mString[39m, [32mString[39m, [32mString[39m)]] = [33mSome[39m(
  [33mList[39m(
    ([32m"Peter"[39m, [32m"Peter"[39m, [32m"NNP"[39m, [32m"Peter"[39m, [32m"peter"[39m),
    ([32m"is"[39m, [32m"is"[39m, [32m"VBZ"[39m, [32m"be"[39m, [32m"i"[39m),
    ([32m"a"[39m, [32m"a"[39m, [32m"DT"[39m, [32m"a"[39m, [32m"a"[39m),
    ([32m"very"[39m, [32m"very"[39m, [32m"RB"[39m, [32m"very"[39m, [32m"veri"[39m),
    ([32m"good"[39m, [32m"good"[39m, [32m"JJ"[39m, [32m"good"[39m, [32m"good"[39m),
    ([32m"persn"[39m, [32m"person"[39m, [32m"NN"[39m, [32m"person"[39m, [32m"person"[39m),
    ([32m"."[39m, [32m"."[39m, [32m"."[39m, [32m"."[39m, [32m"."[39m),
    ([32m"My"[39m, [32m"My"[39m, [32m"PRP$"[39m, [32m"My"[39m, [32m"my"[39m),
    ([32m"life"[39m, [32m"life"[39m, [32m"NN"[39m, [32m"life"[39m, [32m"life"[39m),
    

### Explain Document DL

**Stages**
- DocumentAssembler
- SentenceDetector
- Tokenizer
- NER (NER with GloVe 100D embeddings, CoNLL2003 dataset)
- Lemmatizer
- Stemmer
- Part of Speech
- SpellChecker (Norvig)


In [13]:
// Load the English "explain_document_dl" pretrained pipeline, which adds
// word-embedding and NER stages (see the stage listing below).
val pipeline_dl = PretrainedPipeline("explain_document_dl", lang="en")

explain_document_dl download started this may take some time.
Approximate size to download 169.3 MB
Download done! Loading the resource.


[36mpipeline_dl[39m: [32mPretrainedPipeline[39m = [33mPretrainedPipeline[39m(
  [32m"explain_document_dl"[39m,
  [32m"en"[39m,
  [32m"public/models"[39m,
  false,
  [32mNone[39m
)

In [14]:
// Show the transformer stages that make up the DL pipeline.
pipeline_dl.model.stages

[36mres13[39m: [32mArray[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mTransformer[39m] = [33mArray[39m(
  document_7939d5bf1083,
  SENTENCE_05265b07c745,
  REGEX_TOKENIZER_c5c312143f63,
  SPELL_e4ea67180337,
  LEMMATIZER_c62ad8f355f9,
  STEMMER_ba49f7631065,
  POS_d01c734956fe,
  WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
  NerDLModel_d4424c9af5f4,
  NER_CONVERTER_a81db9af2d23
)

In [15]:
// Number of stages in the explain_document_dl pipeline.
val stageSize = pipeline_dl.model.stages.length
// Pick the second-to-last stage (the NerDLModel entry in the stage listing)
// and list the public classes nested inside its runtime class.
pipeline_dl.model.stages(stageSize-2).getClass.getClasses

[36mstageSize[39m: [32mInt[39m = [32m10[39m
[36mres14_1[39m: [32mArray[39m[[32mClass[39m[[32m?0[39m] forSome { type [32m?0[39m }] = [33mArray[39m(
  class com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel$RowIdentifiedSentence,
  class com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel$RowIdentifiedSentence$
)

In [16]:
// Annotate the same text with the DL pipeline; this rebinds `result`.
// Note the key names differ from explain_document_ml: this pipeline uses
// "checked", "lemma" and "stem" instead of "spell", "lemmas" and "stems".
val result = pipeline_dl.annotate(testDoc)

[36mresult[39m: [32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]] = [33mMap[39m(
  [32m"entities"[39m -> [33mList[39m([32m"Peter"[39m, [32m"Russia"[39m, [32m"John"[39m, [32m"Peter"[39m, [32m"Lucas Dunbercker"[39m, [32m"Europe"[39m),
  [32m"stem"[39m -> [33mList[39m(
    [32m"peter"[39m,
    [32m"i"[39m,
    [32m"a"[39m,
    [32m"veri"[39m,
    [32m"good"[39m,
    [32m"person"[39m,
    [32m"."[39m,
    [32m"my"[39m,
    [32m"life"[39m,
    [32m"in"[39m,
    [32m"russia"[39m,
    [32m"i"[39m,
    [32m"veri"[39m,
    [32m"interest"[39m,
    [32m"."[39m,
    [32m"john"[39m,
    [32m"and"[39m,
    [32m"peter"[39m,
    [32m"ar"[39m,
    [32m"brother"[39m,
    [32m"."[39m,
    [32m"howev"[39m,
    [32m"thei"[39m,
    [32m"don't"[39m,
    [32m"support"[39m,
    [32m"each"[39m,
    [32m"other"[39m,
    [32m"that"[39m,
    [32m"much"[39m,
    [32m"."[39m,
    [32m"luca"[39m,
    [32m"dunberck"[

In [17]:
// List the annotation names produced by the DL pipeline.
result.keySet

[36mres16[39m: [32mSet[39m[[32mString[39m] = [33mSet[39m(
  [32m"entities"[39m,
  [32m"stem"[39m,
  [32m"checked"[39m,
  [32m"lemma"[39m,
  [32m"document"[39m,
  [32m"pos"[39m,
  [32m"token"[39m,
  [32m"ner"[39m,
  [32m"embeddings"[39m,
  [32m"sentence"[39m
)

In [18]:
// Recognised entity chunks (whole phrases such as "Lucas Dunbercker"),
// as opposed to the per-token labels stored under "ner".
result.get("entities")

[36mres17[39m: [32mOption[39m[[32mSeq[39m[[32mString[39m]] = [33mSome[39m(
  [33mList[39m([32m"Peter"[39m, [32m"Russia"[39m, [32m"John"[39m, [32m"Peter"[39m, [32m"Lucas Dunbercker"[39m, [32m"Europe"[39m)
)

In [19]:
// Build one row per token: (token, NER label, spell-corrected token, POS tag, lemma, stem).
// Each lookup yields an Option, so dfOpt is None as soon as one annotation key is absent.
val dfOpt = for {
    t <- result.get("token")
    n <- result.get("ner")
    c <- result.get("checked")
    p <- result.get("pos")
    l <- result.get("lemma")
    s <- result.get("stem")
} yield t.zip(n).zip(c).zip(p).zip(l).zip(s).map {
    case (((((token, label), corrected), pos), lemma), stem) => (token, label, corrected, pos, lemma, stem)
}

// Unwrap without Option#get: a missing annotation still raises
// NoSuchElementException, but the message now names the expected keys and
// the ones actually present instead of the opaque "None.get".
val df = dfOpt
    .map(rows => sparkSession.createDataFrame(rows).toDF("token", "ner_label", "spell_corrected", "POS", "lemmas", "stems"))
    .getOrElse(throw new NoSuchElementException(
        "expected annotations token, ner, checked, pos, lemma, stem but found: " + result.keySet.mkString(", ")))
df.show()

+----------+---------+---------------+----+--------+--------+
|     token|ner_label|spell_corrected| POS|  lemmas|   stems|
+----------+---------+---------------+----+--------+--------+
|     Peter|    B-PER|          Peter| NNP|   Peter|   peter|
|        is|        O|             is| VBZ|      be|       i|
|         a|        O|              a|  DT|       a|       a|
|      very|        O|           very|  RB|    very|    veri|
|      good|        O|           good|  JJ|    good|    good|
|     persn|        O|         person|  NN|  person|  person|
|         .|        O|              .|   .|       .|       .|
|        My|        O|             My|PRP$|      My|      my|
|      life|        O|           life|  NN|    life|    life|
|        in|        O|             in|  IN|      in|      in|
|    Russia|    B-LOC|         Russia| NNP|  Russia|  russia|
|        is|        O|             is| VBZ|      be|       i|
|      very|        O|           very|  RB|    very|    veri|
|interst

[36mdfOpt[39m: [32mOption[39m[[32mSeq[39m[([32mString[39m, [32mString[39m, [32mString[39m, [32mString[39m, [32mString[39m, [32mString[39m)]] = [33mSome[39m(
  [33mList[39m(
    ([32m"Peter"[39m, [32m"B-PER"[39m, [32m"Peter"[39m, [32m"NNP"[39m, [32m"Peter"[39m, [32m"peter"[39m),
    ([32m"is"[39m, [32m"O"[39m, [32m"is"[39m, [32m"VBZ"[39m, [32m"be"[39m, [32m"i"[39m),
    ([32m"a"[39m, [32m"O"[39m, [32m"a"[39m, [32m"DT"[39m, [32m"a"[39m, [32m"a"[39m),
    ([32m"very"[39m, [32m"O"[39m, [32m"very"[39m, [32m"RB"[39m, [32m"very"[39m, [32m"veri"[39m),
    ([32m"good"[39m, [32m"O"[39m, [32m"good"[39m, [32m"JJ"[39m, [32m"good"[39m, [32m"good"[39m),
    ([32m"persn"[39m, [32m"O"[39m, [32m"person"[39m, [32m"NN"[39m, [32m"person"[39m, [32m"person"[39m),
    ([32m"."[39m, [32m"O"[39m, [32m"."[39m, [32m"."[39m, [32m"."[39m, [32m"."[39m),
    ([32m"My"[39m, [32m"O"[39m, [32m"My"[39m, [32m"P

### Recognize Entities DL

In [20]:
// Load the English "recognize_entities_dl" pretrained pipeline (NER only,
// without the spelling/lemma/stem/POS stages — see the stage listing below).
val recognize_entities = PretrainedPipeline("recognize_entities_dl", lang="en")

recognize_entities_dl download started this may take some time.
Approximate size to download 160.1 MB
Download done! Loading the resource.


[36mrecognize_entities[39m: [32mPretrainedPipeline[39m = [33mPretrainedPipeline[39m(
  [32m"recognize_entities_dl"[39m,
  [32m"en"[39m,
  [32m"public/models"[39m,
  false,
  [32mNone[39m
)

In [21]:
// Show the transformer stages that make up the entity-recognition pipeline.
recognize_entities.model.stages

[36mres20[39m: [32mArray[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mTransformer[39m] = [33mArray[39m(
  document_1c58bc1aca5d,
  SENTENCE_328d8a47c1a8,
  REGEX_TOKENIZER_b6c4cbc5a4ea,
  WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
  NerDLModel_d4424c9af5f4,
  NER_CONVERTER_389b80afbf7d
)

In [22]:
// Number of stages in the recognize_entities_dl pipeline; rebinds `stageSize`.
val stageSize = recognize_entities.model.stages.length
// Pick the second-to-last stage (the NerDLModel entry in the stage listing)
// and list the public classes nested inside its runtime class.
recognize_entities.model.stages(stageSize-2).getClass.getClasses

[36mstageSize[39m: [32mInt[39m = [32m6[39m
[36mres21_1[39m: [32mArray[39m[[32mClass[39m[[32m?0[39m] forSome { type [32m?0[39m }] = [33mArray[39m(
  class com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel$RowIdentifiedSentence,
  class com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel$RowIdentifiedSentence$
)

In [23]:
// Same sample text as earlier in the notebook, redefined in this cell.
val testDoc = """
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""

// Annotate with the entity-recognition pipeline; rebinds `result`.
val result = recognize_entities.annotate(testDoc)

// Pair each token with its NER label; None if either annotation is missing.
for {
    tokens <- result.get("token")
    labels <- result.get("ner")
} yield tokens.zip(labels)


[36mtestDoc[39m: [32mString[39m = [32m"""
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""[39m
[36mresult[39m: [32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]] = [33mMap[39m(
  [32m"entities"[39m -> [33mList[39m([32m"Peter"[39m, [32m"Russia"[39m, [32m"John"[39m, [32m"Peter"[39m, [32m"Lucas Dunbercker"[39m, [32m"Europe"[39m),
  [32m"document"[39m -> [33mList[39m(
    [32m"""
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""[39m
  ),
  [32m"token"[39m -> [33mList[39m(
    [32m"Peter"

### Clean Stop Words

In [24]:
// Load the English "clean_stop" pretrained pipeline (stop-word removal).
val clean_stop = PretrainedPipeline("clean_stop", lang="en")

clean_stop download started this may take some time.
Approximate size to download 22.8 KB
Download done! Loading the resource.


[36mclean_stop[39m: [32mPretrainedPipeline[39m = [33mPretrainedPipeline[39m(
  [32m"clean_stop"[39m,
  [32m"en"[39m,
  [32m"public/models"[39m,
  false,
  [32mNone[39m
)

In [25]:
// Show the transformer stages that make up the stop-word pipeline.
clean_stop.model.stages

[36mres24[39m: [32mArray[39m[[32morg[39m.[32mapache[39m.[32mspark[39m.[32mml[39m.[32mTransformer[39m] = [33mArray[39m(
  document_90b4be8a6e0b,
  SENTENCE_8ba1e4f73af0,
  REGEX_TOKENIZER_fb4f98b445ce,
  STOPWORDS_CLEANER_b5d381c851f5
)

In [26]:
// Annotate the sample text with the stop-word pipeline; rebinds `result`.
val result = clean_stop.annotate(testDoc)
// List the annotation names available in the result map.
result.keySet

[36mresult[39m: [32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]] = [33mMap[39m(
  [32m"document"[39m -> [33mList[39m(
    [32m"""
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""[39m
  ),
  [32m"sentence"[39m -> [33mList[39m(
    [32m"Peter is a very good persn."[39m,
    [32m"My life in Russia is very intersting."[39m,
    [32m"John and Peter are brthers."[39m,
    [32m"However they don't support each other that much."[39m,
    [32m"Lucas Dunbercker is no longer happy."[39m,
    [32m"He has a good car though."[39m,
    [32m"Europe is very culture rich."[39m,
    [32m"There are huge churches!"[39m,
    [32m"and big houses!"[39m
  ),
  [32m"token"[39m -> [33mList[39m(
    [32m"Peter"[39m,
    [32m"is"[39m

In [27]:
// Join the tokens that survived stop-word removal back into a single string.
// Map#apply is used instead of `.get(..).get` so that a missing annotation
// fails with "key not found: cleanTokens" rather than an opaque "None.get".
result("cleanTokens").mkString(" ")

[36mres26[39m: [32mString[39m = [32m"Peter good persn . life Russia intersting . John Peter brthers . don't support . Lucas Dunbercker longer happy . good car . Europe culture rich . huge churches ! big houses !"[39m

### Spell Checker 

(Norvig Algo)

ref: https://norvig.com/spell-correct.html

In [28]:
// Load the English "check_spelling" pretrained pipeline.
val spell_checker = PretrainedPipeline("check_spelling", lang="en")

check_spelling download started this may take some time.
Approximate size to download 913.5 KB
Download done! Loading the resource.


[36mspell_checker[39m: [32mPretrainedPipeline[39m = [33mPretrainedPipeline[39m(
  [32m"check_spelling"[39m,
  [32m"en"[39m,
  [32m"public/models"[39m,
  false,
  [32mNone[39m
)

In [29]:
// Same sample text as earlier in the notebook, redefined in this cell.
// The misspellings are the input the spell checker is expected to correct.
val testDoc = """
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""

// Annotate with the spell-check pipeline; rebinds `result`.
val result = spell_checker.annotate(testDoc)

// List the annotation names available in the result map.
result.keySet

[36mtestDoc[39m: [32mString[39m = [32m"""
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""[39m
[36mresult[39m: [32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]] = [33mMap[39m(
  [32m"document"[39m -> [33mList[39m(
    [32m"""
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
"""[39m
  ),
  [32m"sentence"[39m -> [33mList[39m(
    [32m"Peter is a very good persn."[39m,
    [32m"My life in Russia is very intersting."[39m,
    [32m"John and Peter are brthers."[39m,
    [32m"However they don't support

In [30]:
// Show each original token next to its spell-checked form.
// The result is None if either annotation is missing from `result`.
for {
    tokens      <- result.get("token")
    corrections <- result.get("checked")
} yield tokens.zip(corrections)

[36mres29[39m: [32mOption[39m[[32mSeq[39m[([32mString[39m, [32mString[39m)]] = [33mSome[39m(
  [33mList[39m(
    ([32m"Peter"[39m, [32m"Peter"[39m),
    ([32m"is"[39m, [32m"is"[39m),
    ([32m"a"[39m, [32m"a"[39m),
    ([32m"very"[39m, [32m"very"[39m),
    ([32m"good"[39m, [32m"good"[39m),
    ([32m"persn"[39m, [32m"person"[39m),
    ([32m"."[39m, [32m"."[39m),
    ([32m"My"[39m, [32m"My"[39m),
    ([32m"life"[39m, [32m"life"[39m),
    ([32m"in"[39m, [32m"in"[39m),
    ([32m"Russia"[39m, [32m"Russia"[39m),
    ([32m"is"[39m, [32m"is"[39m),
    ([32m"very"[39m, [32m"very"[39m),
    ([32m"intersting"[39m, [32m"interesting"[39m),
    ([32m"."[39m, [32m"."[39m),
    ([32m"John"[39m, [32m"John"[39m),
    ([32m"and"[39m, [32m"and"[39m),
    ([32m"Peter"[39m, [32m"Peter"[39m),
    ([32m"are"[39m, [32m"are"[39m),
    ([32m"brthers"[39m, [32m"brothers"[39m),
    ([32m"."[39m, [32m"."[39m),
    ([3

### Parsing a list of texts

In [31]:
// Three independent input texts to annotate in one call. They contain
// misspellings ("pioner", "wrate", "aircrast") that serve as spell-check input.
val testDoc_list = Array(
    "French author who helped pioner the science-fiction genre.",
    "Verne wrate about space, air, and underwater travel before navigable aircrast",
    "Practical submarines were invented, and before any means of space travel had been devised."
)

// Echo the array so the notebook displays it.
testDoc_list

[36mtestDoc_list[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"French author who helped pioner the science-fiction genre."[39m,
  [32m"Verne wrate about space, air, and underwater travel before navigable aircrast"[39m,
  [32m"Practical submarines were invented, and before any means of space travel had been devised."[39m
)
[36mres30_1[39m: [32mArray[39m[[32mString[39m] = [33mArray[39m(
  [32m"French author who helped pioner the science-fiction genre."[39m,
  [32m"Verne wrate about space, air, and underwater travel before navigable aircrast"[39m,
  [32m"Practical submarines were invented, and before any means of space travel had been devised."[39m
)

In [32]:
// Rebind `pipeline` to "explain_document_ml", the same pipeline loaded at the
// start of the notebook — presumably so this section can be run on its own.
val pipeline = PretrainedPipeline("explain_document_ml", lang="en")

[36mpipeline[39m: [32mPretrainedPipeline[39m = [33mPretrainedPipeline[39m(
  [32m"explain_document_ml"[39m,
  [32m"en"[39m,
  [32m"public/models"[39m,
  false,
  [32mNone[39m
)

In [33]:
// Annotate every text in the array; the result is an array of annotation maps.
val result_list = pipeline.annotate(testDoc_list)
// Number of annotation maps returned.
result_list.length

[36mresult_list[39m: [32mArray[39m[[32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]]] = [33mArray[39m(
  [33mMap[39m(
    [32m"document"[39m -> [33mList[39m([32m"French author who helped pioner the science-fiction genre."[39m),
    [32m"spell"[39m -> [33mList[39m(
      [32m"French"[39m,
      [32m"author"[39m,
      [32m"who"[39m,
      [32m"helped"[39m,
      [32m"pioneer"[39m,
      [32m"the"[39m,
      [32m"sciencefiction"[39m,
      [32m"genre"[39m,
      [32m"."[39m
    ),
    [32m"pos"[39m -> [33mArrayBuffer[39m([32m"JJ"[39m, [32m"NN"[39m, [32m"WP"[39m, [32m"VBD"[39m, [32m"NN"[39m, [32m"DT"[39m, [32m"NN"[39m, [32m"NN"[39m, [32m"."[39m),
    [32m"lemmas"[39m -> [33mList[39m(
      [32m"French"[39m,
      [32m"author"[39m,
      [32m"who"[39m,
      [32m"help"[39m,
      [32m"pioneer"[39m,
      [32m"the"[39m,
      [32m"sciencefiction"[39m,
      [32m"genre"[39m,
      [32m"."[39m
    )

In [34]:
// Annotations for the first input text only.
result_list.head

[36mres33[39m: [32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]] = [33mMap[39m(
  [32m"document"[39m -> [33mList[39m([32m"French author who helped pioner the science-fiction genre."[39m),
  [32m"spell"[39m -> [33mList[39m(
    [32m"French"[39m,
    [32m"author"[39m,
    [32m"who"[39m,
    [32m"helped"[39m,
    [32m"pioneer"[39m,
    [32m"the"[39m,
    [32m"sciencefiction"[39m,
    [32m"genre"[39m,
    [32m"."[39m
  ),
  [32m"pos"[39m -> [33mArrayBuffer[39m([32m"JJ"[39m, [32m"NN"[39m, [32m"WP"[39m, [32m"VBD"[39m, [32m"NN"[39m, [32m"DT"[39m, [32m"NN"[39m, [32m"NN"[39m, [32m"."[39m),
  [32m"lemmas"[39m -> [33mList[39m(
    [32m"French"[39m,
    [32m"author"[39m,
    [32m"who"[39m,
    [32m"help"[39m,
    [32m"pioneer"[39m,
    [32m"the"[39m,
    [32m"sciencefiction"[39m,
    [32m"genre"[39m,
    [32m"."[39m
  ),
  [32m"token"[39m -> [33mList[39m(
    [32m"French"[39m,
    [32m"author"[39m,
 

### Sentiment Analysis


#### Vivek algo

paper: `Fast and accurate sentiment classification using an enhanced Naive Bayes model`

https://arxiv.org/abs/1305.6143

code `https://github.com/vivekn/sentiment`

In [35]:
// Load the English "analyze_sentiment" pretrained pipeline.
val sentiment = PretrainedPipeline("analyze_sentiment", lang="en")

analyze_sentiment download started this may take some time.
Approximate size to download 4.9 MB
Download done! Loading the resource.


[36msentiment[39m: [32mPretrainedPipeline[39m = [33mPretrainedPipeline[39m(
  [32m"analyze_sentiment"[39m,
  [32m"en"[39m,
  [32m"public/models"[39m,
  false,
  [32mNone[39m
)

In [36]:
// Annotate a single sentence with the sentiment pipeline; rebinds `result`.
val result = sentiment.annotate("The movie I watched today was not a good one")

// Predicted sentiment label(s), wrapped in an Option because the key may be absent.
result.get("sentiment")

[36mresult[39m: [32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]] = [33mMap[39m(
  [32m"checked"[39m -> [33mList[39m(
    [32m"The"[39m,
    [32m"movie"[39m,
    [32m"I"[39m,
    [32m"watched"[39m,
    [32m"today"[39m,
    [32m"was"[39m,
    [32m"not"[39m,
    [32m"a"[39m,
    [32m"good"[39m,
    [32m"one"[39m
  ),
  [32m"document"[39m -> [33mList[39m([32m"The movie I watched today was not a good one"[39m),
  [32m"sentiment"[39m -> [33mList[39m([32m"negative"[39m),
  [32m"token"[39m -> [33mList[39m(
    [32m"The"[39m,
    [32m"movie"[39m,
    [32m"I"[39m,
    [32m"watched"[39m,
    [32m"today"[39m,
    [32m"was"[39m,
    [32m"not"[39m,
    [32m"a"[39m,
    [32m"good"[39m,
    [32m"one"[39m
  ),
  [32m"sentence"[39m -> [33mList[39m([32m"The movie I watched today was not a good one"[39m)
)
[36mres35_1[39m: [32mOption[39m[[32mSeq[39m[[32mString[39m]] = [33mSome[39m([33mList[39m([32m"negative"

#### DL version (trained on imdb)

In [36]:
// val sentiment_imdb = PretrainedPipeline("analyze_sentimentdl_use_imdb", lang="en")

In [37]:
// Load the English "analyze_sentimentdl_glove_imdb" pretrained pipeline.
val sentiment_imdb_glove = PretrainedPipeline("analyze_sentimentdl_glove_imdb", lang="en")

analyze_sentimentdl_glove_imdb download started this may take some time.
Approximate size to download 155.3 MB
Download done! Loading the resource.


[36msentiment_imdb_glove[39m: [32mPretrainedPipeline[39m = [33mPretrainedPipeline[39m(
  [32m"analyze_sentimentdl_glove_imdb"[39m,
  [32m"en"[39m,
  [32m"public/models"[39m,
  false,
  [32mNone[39m
)

In [38]:
// A longer, review-style text used as input for the IMDB-trained sentiment pipeline.
val comment = """
It's a very scary film but what impressed me was how true the film sticks to the original's tricks; it isn't filled with loud in-your-face jump scares, in fact, a lot of what makes this film scary is the slick cinematography and intricate shadow play. The use of lighting and creation of atmosphere is what makes this film so tense, which is why it's perfectly suited for those who like Horror movies but without the obnoxious gore.
"""

// Annotate the review; rebinds `result`.
val result = sentiment_imdb_glove.annotate(comment)

// Predicted sentiment label(s), wrapped in an Option because the key may be absent.
result.get("sentiment")

[36mcomment[39m: [32mString[39m = [32m"""
It's a very scary film but what impressed me was how true the film sticks to the original's tricks; it isn't filled with loud in-your-face jump scares, in fact, a lot of what makes this film scary is the slick cinematography and intricate shadow play. The use of lighting and creation of atmosphere is what makes this film so tense, which is why it's perfectly suited for those who like Horror movies but without the obnoxious gore.
"""[39m
[36mresult[39m: [32mMap[39m[[32mString[39m, [32mSeq[39m[[32mString[39m]] = [33mMap[39m(
  [32m"document"[39m -> [33mList[39m(
    [32m"""
It's a very scary film but what impressed me was how true the film sticks to the original's tricks; it isn't filled with loud in-your-face jump scares, in fact, a lot of what makes this film scary is the slick cinematography and intricate shadow play. The use of lighting and creation of atmosphere is what makes this film so tense, which is why it's perfe