In [None]:
import os
from IPython.display import clear_output
try:
  # Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
  # For example:
  # spark_version = 'spark-3.<enter version>'
  spark_version = 'spark-3.1.1'
  os.environ['SPARK_VERSION']=spark_version

  # Install Spark and Java
  !apt-get update
  !apt-get install openjdk-11-jdk-headless -qq > /dev/null
  !wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
  !tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
  !pip install -q findspark

  # Set Environment Variables
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
  os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"
except:
  print(f"Error installing {spark_version}")
finally:
  clear_output()
  print(f'{spark_version} successfully installed')

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Create the SparkSession for this notebook (or reuse one that already exists)
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("tokenizing")
spark = session_builder.getOrCreate()

In [None]:
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [4]:
# Fetch the poem CSV from S3 and load it into a Spark DataFrame
from pyspark import SparkFiles

url ="https://s3.amazonaws.com/dataviz-curriculum/day_2/data.csv"

# Register the remote file with Spark, then look up where it was stored locally
spark.sparkContext.addFile(url)
local_csv_path = SparkFiles.get("data.csv")

# The first row of the file holds the column names
df = spark.read.csv(local_csv_path, sep=",", header=True)

# Preview the loaded rows
df.show()

+--------------------+
|                Poem|
+--------------------+
|This Autumn midnight|
|Orion’s at my window|
|shouting for his ...|
+--------------------+



In [None]:
# Build a tokenizer that reads the "Poem" column and writes its tokens to "words"
tokened = Tokenizer(
    inputCol="Poem",
    outputCol="words",
)

In [6]:
# Run the tokenizer over the poems; the result gains a "words" column
tokenized = tokened.transform(dataset=df)

# Preview the tokenized rows
tokenized.show()

+--------------------+--------------------+
|                Poem|               words|
+--------------------+--------------------+
|This Autumn midnight|[this, autumn, mi...|
|Orion’s at my window|[orion’s, at, my,...|
|shouting for his ...|[shouting, for, h...|
+--------------------+--------------------+



In [None]:
# Create a Function to count vowels
def vowel_counter(words):
    """Count the vowels (a, e, i, o, u) across all tokens in ``words``.

    Args:
        words: An iterable of string tokens, or ``None``. ``None`` tokens
            inside the iterable are skipped.

    Returns:
        int: Total number of vowel characters, counted case-insensitively.
        Returns 0 when ``words`` is ``None`` or empty, so a null column
        value does not raise inside a Spark UDF.
    """
    if not words:
        return 0

    vowels = frozenset("aeiou")

    return sum(
        1
        for word in words
        if word  # skip None / empty tokens
        for letter in word
        if letter.lower() in vowels
    )

In [8]:
# Wrap vowel_counter as a Spark UDF that returns an integer column
count_vowels = udf(f=vowel_counter, returnType=IntegerType())

# Display the resulting UDF object
count_vowels

<function __main__.vowel_counter>

In [9]:
# Add a "vowels" column by applying the UDF to each row's token list
poem_tokens = tokenized.select("Poem", "words")
vowel_counts = poem_tokens.withColumn("vowels", count_vowels(col("words")))

# Show full cell contents instead of truncating long values
vowel_counts.show(truncate=False)

+---------------------+--------------------------+------+
|Poem                 |words                     |vowels|
+---------------------+--------------------------+------+
|This Autumn midnight |[this, autumn, midnight]  |6     |
|Orion’s at my window |[orion’s, at, my, window] |6     |
|shouting for his dog.|[shouting, for, his, dog.]|6     |
+---------------------+--------------------------+------+

