<a href="https://colab.research.google.com/github/karenbennis/BigData/blob/master/Stop_Words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
 # Start Spark session
from pyspark.sql import SparkSession
# Reuse an already-running session if there is one, otherwise create it
spark = (
    SparkSession.builder
    .appName("StopWords")
    .getOrCreate()
)

In [None]:
# Sample rows: an id plus a sentence that is already split into words
raw_rows = [
    (0, ["Big", "data", "is", "super", "powerful"]),
    (1, ["This", "is", "going", "to", "be", "epic"]),
]
raw_columns = ["id", "raw"]
sentenceData = spark.createDataFrame(raw_rows, raw_columns)

# Print the rows without truncating the word lists
sentenceData.show(truncate=False)

+---+--------------------------------+
|id |raw                             |
+---+--------------------------------+
|0  |[Big, data, is, super, powerful]|
|1  |[This, is, going, to, be, epic] |
+---+--------------------------------+



In [None]:
# Import stop words library
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Configure the remover: read word lists from "raw" and write the filtered
# lists to "filtered". No stop-word list is passed, so the defaults are used.
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)

In [None]:
# Apply the remover to the sample data, then print the result untruncated
filtered_df = remover.transform(sentenceData)
filtered_df.show(truncate=False)

+---+--------------------------------+----------------------------+
|id |raw                             |filtered                    |
+---+--------------------------------+----------------------------+
|0  |[Big, data, is, super, powerful]|[Big, data, super, powerful]|
|1  |[This, is, going, to, be, epic] |[going, epic]               |
+---+--------------------------------+----------------------------+



In [None]:
# SKILL DRILL Combine both tokenizer and StopWordsRemover on a DataFrame that isn’t already broken out into a list of words.

from pyspark.ml.feature import Tokenizer

# Build a small DataFrame of raw sentences (plain strings, not word lists)
sample_rows = [
    (0, "Spark is great"),
    (1, "We are learning Spark"),
    (2, "Spark is better than hadoop no doubt"),
]
sample_columns = ["id", "sentence"]
sentenceSamples = spark.createDataFrame(sample_rows, sample_columns)
sentenceSamples.show()

# Tokenizer stage: reads the "sentence" column and writes a "words" column
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Apply the tokenizer and print the result without truncation
tokenized_df = tokenizer.transform(sentenceSamples)
tokenized_df.show(truncate=False)

# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    The function is wrapped as a Spark UDF, where a null column value is
    passed in as ``None``. ``len(None)`` would raise ``TypeError`` and fail
    the whole job, so ``None`` is passed through instead.

    Args:
        word_list: A sized collection of tokens, or ``None``.

    Returns:
        The item count as an ``int``, or ``None`` when ``word_list`` is ``None``.
    """
    if word_list is None:
        return None
    return len(word_list)

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Wrap the Python helper as a Spark UDF that produces an integer column
count_tokens = udf(word_list_length, IntegerType())

# Add the per-row token count once and reuse it below. `tokenized_df` was
# already produced by the Tokenizer step earlier in this cell, so the
# tokenizer is not constructed and applied a second time here.
counted_df = tokenized_df.withColumn("tokens", count_tokens(col("words")))
counted_df.show(truncate=False)

# Configure the remover for the tokenized sentences: read "words" and write
# "filteredSentence".
# NOTE: this rebinds `remover` from the earlier cell (which reads "raw"), so
# re-run that cell before re-running its transform on `sentenceData`.
remover = StopWordsRemover(inputCol="words", outputCol="filteredSentence")

# Transform and show data (the token count column is carried through)
remover.transform(counted_df).show(truncate=False)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      Spark is great|
|  1|We are learning S...|
|  2|Spark is better t...|
+---+--------------------+

+---+------------------------------------+--------------------------------------------+
|id |sentence                            |words                                       |
+---+------------------------------------+--------------------------------------------+
|0  |Spark is great                      |[spark, is, great]                          |
|1  |We are learning Spark               |[we, are, learning, spark]                  |
|2  |Spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|
+---+------------------------------------+--------------------------------------------+

+---+------------------------------------+--------------------------------------------+------+
|id |sentence                            |words                                       |tokens|
+--