For help, look here:
https://spark.apache.org/docs/latest/rdd-programming-guide.html

In [0]:
# Check out the pre-loaded dataset
display(dbutils.fs.ls('dbfs:/databricks-datasets/'))

path,name,size,modificationTime
dbfs:/databricks-datasets/COVID/,COVID/,0,0
dbfs:/databricks-datasets/README.md,README.md,976,1532468253000
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359,1455043490000
dbfs:/databricks-datasets/adult/,adult/,0,0
dbfs:/databricks-datasets/airlines/,airlines/,0,0
dbfs:/databricks-datasets/amazon/,amazon/,0,0
dbfs:/databricks-datasets/asa/,asa/,0,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0,0
dbfs:/databricks-datasets/bikeSharing/,bikeSharing/,0,0


## 1. Word Count

In [0]:
# Create a rdd (sc = SparkContext)
rdd = sc.textFile("dbfs:/databricks-datasets/SPARK_README.md")

In [0]:
# Read 20 lines 
rdd.take(20)

Out[3]: ['# Apache Spark',
 '',
 'Spark is a fast and general cluster computing system for Big Data. It provides',
 'high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
 'supports general computation graphs for data analysis. It also supports a',
 'rich set of higher-level tools including Spark SQL for SQL and DataFrames,',
 'MLlib for machine learning, GraphX for graph processing,',
 'and Spark Streaming for stream processing.',
 '',
 '<http://spark.apache.org/>',
 '',
 '',
 '## Online Documentation',
 '',
 'You can find the latest Spark documentation, including a programming',
 'guide, on the [project web page](http://spark.apache.org/documentation.html)',
 'and [project wiki](https://cwiki.apache.org/confluence/display/SPARK).',
 'This README file only contains basic setup instructions.',
 '',
 '## Building Spark']

In [0]:
# Example: lambda functions  
words = rdd.flatMap(lambda lines: lines.split(" "))

for w in words.collect():
  print(w)

#
Apache
Spark

Spark
is
a
fast
and
general
cluster
computing
system
for
Big
Data.
It
provides
high-level
APIs
in
Scala,
Java,
Python,
and
R,
and
an
optimized
engine
that
supports
general
computation
graphs
for
data
analysis.
It
also
supports
a
rich
set
of
higher-level
tools
including
Spark
SQL
for
SQL
and
DataFrames,
MLlib
for
machine
learning,
GraphX
for
graph
processing,
and
Spark
Streaming
for
stream
processing.

<http://spark.apache.org/>


##
Online
Documentation

You
can
find
the
latest
Spark
documentation,
including
a
programming
guide,
on
the
[project
web
page](http://spark.apache.org/documentation.html)
and
[project
wiki](https://cwiki.apache.org/confluence/display/SPARK).
This
README
file
only
contains
basic
setup
instructions.

##
Building
Spark

Spark
is
built
using
[Apache
Maven](http://maven.apache.org/).
To
build
Spark
and
its
example
programs,
run:





build/mvn
-DskipTests
clean
package

(You
do
not
need
to
do
this
if
you
downloaded
a
pre-built
package.)
More
detaile

In [0]:
# Take the previous function and
# 1. count the occurence of each word
# Map each word to a key-value pair (word, 1)
word_pairs = words.map(lambda word: (word, 1))

# Reduce by key to count the occurrences of each word
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

# Collect and print the word counts
for word, count in word_counts.collect():
    print(f"{word}: {count}")

#: 1
Apache: 1
Spark: 13
: 67
is: 6
It: 2
provides: 1
high-level: 1
APIs: 1
in: 5
Scala,: 1
Java,: 1
an: 3
optimized: 1
engine: 1
supports: 2
computation: 1
analysis.: 1
set: 2
of: 5
tools: 1
SQL: 2
MLlib: 1
machine: 1
learning,: 1
GraphX: 1
graph: 1
processing,: 1
Documentation: 1
latest: 1
programming: 1
guide,: 1
[project: 2
README: 1
only: 1
basic: 1
instructions.: 1
Building: 1
using: 2
[Apache: 1
run:: 1
do: 2
this: 1
downloaded: 1
documentation: 3
project: 1
site,: 1
at: 2
Spark"](http://spark.apache.org/docs/latest/building-spark.html).: 1
Interactive: 2
Shell: 2
The: 1
way: 1
start: 1
Try: 1
following: 2
1000:: 2
scala>: 1
1000).count(): 1
Python: 2
Alternatively,: 1
use: 3
And: 1
run: 7
Example: 1
several: 1
programs: 2
them,: 1
`./bin/run-example: 1
[params]`.: 1
example:: 1
./bin/run-example: 2
SparkPi: 2
variable: 1
when: 1
examples: 2
spark://: 1
URL,: 1
YARN,: 1
"local": 1
locally: 2
N: 1
abbreviated: 1
class: 2
name: 1
package.: 1
instance:: 1
print: 1
usage: 1
help: 1


In [0]:
# 2. change all capital letters to lower case
# Convert each word to lowercase
words = rdd.flatMap(lambda lines: lines.split(" ")).map(lambda word: word.lower())
words.collect()

Out[6]: ['#',
 'apache',
 'spark',
 '',
 'spark',
 'is',
 'a',
 'fast',
 'and',
 'general',
 'cluster',
 'computing',
 'system',
 'for',
 'big',
 'data.',
 'it',
 'provides',
 'high-level',
 'apis',
 'in',
 'scala,',
 'java,',
 'python,',
 'and',
 'r,',
 'and',
 'an',
 'optimized',
 'engine',
 'that',
 'supports',
 'general',
 'computation',
 'graphs',
 'for',
 'data',
 'analysis.',
 'it',
 'also',
 'supports',
 'a',
 'rich',
 'set',
 'of',
 'higher-level',
 'tools',
 'including',
 'spark',
 'sql',
 'for',
 'sql',
 'and',
 'dataframes,',
 'mllib',
 'for',
 'machine',
 'learning,',
 'graphx',
 'for',
 'graph',
 'processing,',
 'and',
 'spark',
 'streaming',
 'for',
 'stream',
 'processing.',
 '',
 '<http://spark.apache.org/>',
 '',
 '',
 '##',
 'online',
 'documentation',
 '',
 'you',
 'can',
 'find',
 'the',
 'latest',
 'spark',
 'documentation,',
 'including',
 'a',
 'programming',
 'guide,',
 'on',
 'the',
 '[project',
 'web',
 'page](http://spark.apache.org/documentation.html)',
 'a

In [0]:
# 3. eliminate stopwords 
stop_words = ['and', 'to', 'in', 'at', 'the', 'an']

words = rdd.flatMap(lambda lines: lines.split(" ")) \
           .map(lambda word: word.lower()) \
           .filter(lambda word: word not in stop_words)
words.collect()

Out[7]: ['#',
 'apache',
 'spark',
 '',
 'spark',
 'is',
 'a',
 'fast',
 'general',
 'cluster',
 'computing',
 'system',
 'for',
 'big',
 'data.',
 'it',
 'provides',
 'high-level',
 'apis',
 'scala,',
 'java,',
 'python,',
 'r,',
 'optimized',
 'engine',
 'that',
 'supports',
 'general',
 'computation',
 'graphs',
 'for',
 'data',
 'analysis.',
 'it',
 'also',
 'supports',
 'a',
 'rich',
 'set',
 'of',
 'higher-level',
 'tools',
 'including',
 'spark',
 'sql',
 'for',
 'sql',
 'dataframes,',
 'mllib',
 'for',
 'machine',
 'learning,',
 'graphx',
 'for',
 'graph',
 'processing,',
 'spark',
 'streaming',
 'for',
 'stream',
 'processing.',
 '',
 '<http://spark.apache.org/>',
 '',
 '',
 '##',
 'online',
 'documentation',
 '',
 'you',
 'can',
 'find',
 'latest',
 'spark',
 'documentation,',
 'including',
 'a',
 'programming',
 'guide,',
 'on',
 '[project',
 'web',
 'page](http://spark.apache.org/documentation.html)',
 '[project',
 'wiki](https://cwiki.apache.org/confluence/display/spark).'

In [0]:
# 4. sort in alphabetical order
words = words.sortBy(lambda word: word)
words.collect()

Out[8]: ['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '"local"',
 '"local[n]"',
 '"yarn"',
 '#',
 '##',
 '##',
 '##',
 '##',
 '##',
 '##',
 '##',
 '##',
 '(you',
 '-dskiptests',
 './bin/pyspark',
 './bin/run-example',
 './bin/run-example',
 './bin/spark-shell',
 './dev/run-tests',
 '1000).count()',
 '1000:',
 '1000:',
 '<class>',
 '<http://spark.apache.org/>',
 '>>>',
 '["building',
 '["specifying',
 '[apache',
 '[building',
 '[configuration',
 '[params]`.',
 '[project',
 '[project',
 '[run',
 '`./bin/run-example',
 '`examples`',
 '`examples`',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'abbreviated',
 'about',
 'against',
 'also',
 'also',
 'also',
 'also',
 'alternatively,',
 'analysis.'

In [0]:
# 5. sort from most to least frequent word
# Sort by frequency (most to least)
word_counts = word_pairs.reduceByKey(lambda a, b: a + b)

# Sort by count in descending order
sorted_word_counts = word_counts.sortBy(lambda x: x[1], ascending=False)

In [0]:
# 6.** remove punctuations 
import string

# Remove punctuation from each word
words = words.map(lambda word: word.translate(str.maketrans('', '', string.punctuation)))
words.collect()

Out[10]: ['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'local',
 'localn',
 'yarn',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'you',
 'dskiptests',
 'binpyspark',
 'binrunexample',
 'binrunexample',
 'binsparkshell',
 'devruntests',
 '1000count',
 '1000',
 '1000',
 'class',
 'httpsparkapacheorg',
 '',
 'building',
 'specifying',
 'apache',
 'building',
 'configuration',
 'params',
 'project',
 'project',
 'run',
 'binrunexample',
 'examples',
 'examples',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'abbreviated',
 'about',
 'against',
 'also',
 'also',
 'also',
 'also',
 'alternatively',
 'analysis',
 'apache',
 'apis',
 'are',
 'available',
 'basic',
 'be',
 'be',
 'because',
 'big',
 

## 2. What does it do?