In [1]:
import sys
sys.executable
import os

In [2]:
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home"

In [3]:
import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("Spark NLP")\
.master("local[4]")\
.config("spark.driver.memory","16G")\
.config("spark.driver.maxResultSize", "2G")\
.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.4")\
.config("spark.kryoserializer.buffer.max","1000M")\
.getOrCreate()

import pandas as pd
import numpy as np
import scipy as sc
import sklearn as sk
import re

from pyspark.ml.feature import CountVectorizer, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml import Pipeline
from pyspark.ml.stat import Summarizer
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, MapType, FloatType

import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *

In [4]:
sparknlp.version()

'2.5.4'

In [5]:
# create a toy dataframe with text I'd like to tinker with

df = spark.createDataFrame([['1', 'A dish is also best thrown on a bat. No cylinder is made. The first step is to centre a flat disc of large diameter. The fingers are used as before to press down in the middle and are then drawn towards the edge preceded by a ridge of clay which gradually increases the diameter at each trip.'],                            
                            ['2', 'To me it seems absurd to present someone who is only a beginner with anything heavier to throw than one pound of clay. In fact I would tend to commence with even less. This is because the problem of controlling and centering increases rapidly as the size of the lump becomes heavier. Long, tedious attempts to master a big piece only lead to frustration, disappointment, and material which soon becomes too wet to manage anyway. In my opinion, a smaller wheel and less powerful motor is quite sufficient for most students and for many serious workers too.'],
                            ['3', 'Many kilns have elements running along the bottom as well as the sides and sometimes in the door and back. Even so, pots on the top shelf can easily be far cooler than those placed lower down in the kiln, If one superimposes on this additional discrepancy the ones we have just examined, it is easy to visualise that within the same firing chamber quite startling variations in temperature can occur.'],
                            ['4', 'Pieces which have been painted with slip can usually be picked up with reasonable safety, provided the hands are clean and free of dust, but any which have had pottery colours applied to the surface should be held at points away from the pigment or from the inside. If a colour is inadvertantly smudged the damage can aften be repaired; use a razor blade to scratch away the smear and then very carefully fill in again with the paint brush. Where a slipped surface is chipped or otherwise marked the piece at this stage will be too dry to correct with slip, and one can either try and make good straightaway with the nearest available pottery colour or wait until after the biscuit firing and use colour before dipping the piece in glaze.'],
                            ['5', 'When the pieces are being arranged, it is as well to remember that shelves should be as small as possible, consistent with their usefulness in supporting the ware. This is to avoid splitting the chamber into separate compartments. It is far better to allow the pots to overhang the edges a little and so permit the heat to circulate freely. A twelve inch square internal measurement will do best with a shelf no bigger than ten by ten.']],
                           ['rowkey', 'text'])

df.show()

+------+--------------------+
|rowkey|                text|
+------+--------------------+
|     1|A dish is also be...|
|     2|To me it seems ab...|
|     3|Many kilns have e...|
|     4|Pieces which have...|
|     5|When the pieces a...|
+------+--------------------+



In [6]:
# view summary of data frame
print(df.describe())
print((df.count(), len(df.columns)))

DataFrame[summary: string, rowkey: string, text: string]
(5, 2)


In [7]:
# document assember 
document_assembler = sparknlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")\
.setCleanupMode("shrink_full")

# tokenizer 
tokenizer = sparknlp.annotator.Tokenizer()\
.setInputCols("document")\
.setOutputCol("token")\
.setTargetPattern("\S+")\
.addInfixPattern("(/)(\\p{Alpha}+)")\
.addInfixPattern("(\\p{Alpha}+)(/)(\\p{Alpha}+)")\
.addInfixPattern("(\\p{Alpha}+)(\\.)(\\p{Upper}\\p{Alpha}+)")\
.addInfixPattern("(.+)(\\.)\\z")\
.addInfixPattern("(.+)([,:/])\\z")\
.addInfixPattern("(\\()(.+)(\\))")\
.addInfixPattern("(\\()(.+)")\
.addInfixPattern("(.+)(\\))")\
.addException("New York")

# context dependent spell checker
spell_checker = sparknlp.annotator.ContextSpellCheckerApproach()\
.setInputCols(["token"])\
.setOutputCol("spell")\
.setLanguageModelClasses(1000)\
.setWordMaxDistance(6)\
.setEpochs(2)

# pipeline
pipeline =  Pipeline().setStages([document_assembler,
                                          tokenizer, 
                                          spell_checker
                                         ])

model = pipeline.fit(df)

In [8]:
# punctuation is not tokenized separately from the bi-grams in the exception list
# Is this a bug?
lp = LightPipeline(model)
lp.annotate("My friend moved to New York. She likes it. Frank visited New York, and didn't like it.")

{'document': ["My friend moved to New York. She likes it. Frank visited New York, and didn't like it."],
 'token': ['My',
  'friend',
  'moved',
  'to',
  'New York.',
  'She',
  'likes',
  'it',
  '.',
  'Frank',
  'visited',
  'New York,',
  'and',
  "didn't",
  'like',
  'it',
  '.'],
 'spell': ['to',
  'and',
  'have',
  'to',
  'New York.',
  'the',
  'is',
  'it',
  '.',
  'and',
  'is',
  'New York,',
  'and',
  'it',
  'the',
  'it',
  '.']}

In [9]:
# sentence detector
sentence_detector = SentenceDetector()\
.setInputCols("document")\
.setOutputCol("sentence")

# tokenizer 
tokenizerS = sparknlp.annotator.Tokenizer()\
.setInputCols("sentence")\
.setOutputCol("token")\
.setTargetPattern("\S+")\
.addInfixPattern("(/)(\\p{Alpha}+)")\
.addInfixPattern("(\\p{Alpha}+)(/)(\\p{Alpha}+)")\
.addInfixPattern("(\\p{Alpha}+)(\\.)(\\p{Upper}\\p{Alpha}+)")\
.addInfixPattern("(.+)(\\.)\\z")\
.addInfixPattern("(.+)([,:/])\\z")\
.addInfixPattern("(\\()(.+)(\\))")\
.addInfixPattern("(\\()(.+)")\
.addInfixPattern("(.+)(\\))")\
.addException("New York")

# pipeline
pipeline1 =  Pipeline().setStages([document_assembler,
                                  sentence_detector,
                                  tokenizerS, 
                                  spell_checker
                                 ])

model1 = pipeline1.fit(df)

In [10]:
lp1 = LightPipeline(model1)
lp1.annotate("My friend moved to New York. She likes it. Frank visited New York, and didn't like it.")

{'document': ["My friend moved to New York. She likes it. Frank visited New York, and didn't like it."],
 'sentence': ['My friend moved to New York.',
  'She likes it.',
  "Frank visited New York, and didn't like it."],
 'token': ['My',
  'friend',
  'moved',
  'to',
  'New York.',
  'She',
  'likes',
  'it',
  '.',
  'Frank',
  'visited',
  'and',
  "didn't",
  'like',
  'it',
  '.'],
 'spell': ['and',
  'is',
  'and',
  'in',
  'the',
  'it',
  '.',
  'the',
  'is',
  'it',
  '.',
  '.',
  'and',
  'one',
  'to',
  'New York.']}