In [None]:
# Example for Tokenizer
# Reference: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Tokenizer

In [None]:
"""
What is a Tokenizer?
    A Tokenizer is one which performs the task of Tokenization. 

Then what is Tokenization?
    It is a process of splitting (or breaking) a string (a stream of text) into
        a) words
        b) phrases
        c) symbols
        d) any other meaningful portions(elements/splits)
    These split portions (elements) are called Tokens.

What is RegexTokenizer?
    It is a tokenizer based on regex (regular expression).
    It extracts tokens by either
        a) using the given regex pattern as a delimiter to split the text (default, gaps=True)
        b) repeatedly matching the regex to find the tokens themselves (gaps=False)
        
In PySpark, the Tokenizer first converts the text to lower case and then splits it on white space.
"""

In [1]:
from __future__ import print_function

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import *

In [2]:
# Obtain the SparkSession used by every cell below. getOrCreate() reuses a
# session that is already running, otherwise it starts one with this app name
# and config entry.
spark = (
    SparkSession.builder
    .appName("Tokenizer_Example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [3]:
# Sample data: one row per sentence, with a serial number ("S_No") and the
# sentence itself ("Text"). The last row contains commas but no spaces.
df = spark.createDataFrame(
    data=[
        (1, "Hello World"),
        (2, "Welcome PySpark with Examples"),
        (3, "It's easy, but need regular practice"),
        (4, "Surely, you will enjoy! Spark runs on"),
        (5, "Hadoop,Aws,Mesos"),
    ],
    schema=["S_No", "Text"],
)

In [4]:
# Plain Tokenizer: reads the "Text" column and writes the resulting list of
# tokens to a new "Splited_Words" column.
tokenizer = Tokenizer(
    inputCol="Text",
    outputCol="Splited_Words",
)

In [5]:
# Declare the RegexTokenizer. The pattern \W matches any single NON-word
# character (anything other than a letter, digit or underscore), not just white
# space, so the text is split on punctuation such as commas and apostrophes too.
# The output column is spelled "Splited_Words", exactly as the cells that select
# it spell it, so the lookup does not depend on case-insensitive column resolution.
r_tokenizer = RegexTokenizer(
    inputCol="Text",
    outputCol="Splited_Words",
    pattern=r"\W",  # raw string; same value as "\\W"
)

In [6]:
# Count the number of tokens in an array column.
# Spark's built-in `size` function replaces the former Python UDF around len():
# it yields the same integer count for an array column, but is evaluated by
# Spark itself instead of shipping every row to a Python worker.
# Usage is unchanged: count_token(col("Splited_Words")).
count_token = size

In [7]:
# Apply the plain Tokenizer to the sample data, then show the original text,
# its tokens and a derived "Tokens" column holding the number of tokens.
# truncate=False prints the full cell contents.
tk = tokenizer.transform(df)
(
    tk.select("Text", "Splited_Words")
    .withColumn("Tokens", count_token(col("Splited_Words")))
    .show(truncate=False)
)

+-------------------------------------+---------------------------------------------+------+
|Text                                 |Splited_Words                                |Tokens|
+-------------------------------------+---------------------------------------------+------+
|Hello World                          |[hello, world]                               |2     |
|Welcome PySpark with Examples        |[welcome, pyspark, with, examples]           |4     |
|It's easy, but need regular practice |[it's, easy,, but, need, regular, practice]  |6     |
|Surely, you will enjoy! Spark runs on|[surely,, you, will, enjoy!, spark, runs, on]|7     |
|Hadoop,Aws,Mesos                     |[hadoop,aws,mesos]                           |1     |
+-------------------------------------+---------------------------------------------+------+



In [8]:
# Apply the RegexTokenizer to the same data and add the "Tokens" count column.
# The previous cell passed truncate=False; here show() is called with its
# defaults to see what happens without it (long values are cut off with "...").
r_tk = r_tokenizer.transform(df)
(
    r_tk.select("Text", "Splited_Words")
    .withColumn("Tokens", count_token(col("Splited_Words")))
    .show()
)

+--------------------+--------------------+------+
|                Text|       Splited_Words|Tokens|
+--------------------+--------------------+------+
|         Hello World|      [hello, world]|     2|
|Welcome PySpark w...|[welcome, pyspark...|     4|
|It's easy, but ne...|[it, s, easy, but...|     7|
|Surely, you will ...|[surely, you, wil...|     7|
|    Hadoop,Aws,Mesos|[hadoop, aws, mesos]|     3|
+--------------------+--------------------+------+



In [9]:
# Apply the RegexTokenizer once more and add the "Tokens" count column,
# this time with truncate=False so the complete token lists are printed.
r_tk = r_tokenizer.transform(df)
(
    r_tk.select("Text", "Splited_Words")
    .withColumn("Tokens", count_token(col("Splited_Words")))
    .show(truncate=False)
)

+-------------------------------------+-------------------------------------------+------+
|Text                                 |Splited_Words                              |Tokens|
+-------------------------------------+-------------------------------------------+------+
|Hello World                          |[hello, world]                             |2     |
|Welcome PySpark with Examples        |[welcome, pyspark, with, examples]         |4     |
|It's easy, but need regular practice |[it, s, easy, but, need, regular, practice]|7     |
|Surely, you will enjoy! Spark runs on|[surely, you, will, enjoy, spark, runs, on]|7     |
|Hadoop,Aws,Mesos                     |[hadoop, aws, mesos]                       |3     |
+-------------------------------------+-------------------------------------------+------+



In [None]:
""" regex tokenizer considers even apostrophe. 
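It also splits a string (a stream of text) that has no spaces but is separated by other non-word characters such as commas, e.g. Hadoop,Aws,Mesos becomes [hadoop, aws, mesos].
The normal tokenizer does not do this; it only splits on white space, so that string stays a single token.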
"""