In [4]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.0'
os.environ['SPARK_VERSION']=spark_version
# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark
# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"
# Start a SparkSession
import findspark
findspark.init()

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:14 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:15 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic 

In [5]:
# Create (or reuse) the SparkSession for this notebook
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Tokens")
    .getOrCreate()
)

In [6]:
from pyspark.ml.feature import Tokenizer

In [7]:
# Create a sample DataFrame: one (id, sentence) row per example sentence
dataframe = spark.createDataFrame(
    data=[
        (0, "spark is great"),
        (1, "we are learning spark"),
        (2, "spark is better than hadoop no doubt"),
    ],
    schema=["id", "sentence"],
)
dataframe.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      spark is great|
|  1|we are learning s...|
|  2|spark is better t...|
+---+--------------------+



In [8]:
# Build a Tokenizer that reads the "sentence" column and writes its
# tokens to a new "words" column
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
tokenizer

Tokenizer_6549f8862bf5

In [9]:
# Apply the tokenizer; the result keeps the input columns and adds "words"
tokenized_df = tokenizer.transform(dataset=dataframe)
# Show full cell contents instead of truncating long values
tokenized_df.show(truncate=False)

+---+------------------------------------+--------------------------------------------+
|id |sentence                            |words                                       |
+---+------------------------------------+--------------------------------------------+
|0  |spark is great                      |[spark, is, great]                          |
|1  |we are learning spark               |[we, are, learning, spark]                  |
|2  |spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|
+---+------------------------------------+--------------------------------------------+



In [10]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    Args:
        word_list: A sized collection of tokens (e.g. a list of words),
            or None.

    Returns:
        int: ``len(word_list)``, or 0 when ``word_list`` is None.
    """
    # When used as a Spark UDF, a null column value arrives as None;
    # len(None) would raise TypeError and fail the whole job.
    if word_list is None:
        return 0
    return len(word_list)

In [11]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [13]:
# Wrap word_list_length as a Spark UDF that returns an integer column
count_tokens = udf(
    f=word_list_length,
    returnType=IntegerType(),
)

In [15]:
# Create the tokenizer (sentence -> words)
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Tokenize the sample DataFrame
tokenized_df = tokenizer.transform(dataframe)

# Append a "token" column holding the number of words per row and
# display the result without truncating cell contents
(
    tokenized_df
    .withColumn("token", count_tokens(col("words")))
    .show(truncate=False)
)

+---+------------------------------------+--------------------------------------------+-----+
|id |sentence                            |words                                       |token|
+---+------------------------------------+--------------------------------------------+-----+
|0  |spark is great                      |[spark, is, great]                          |3    |
|1  |we are learning spark               |[we, are, learning, spark]                  |4    |
|2  |spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|7    |
+---+------------------------------------+--------------------------------------------+-----+

