In [1]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.0.1'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz

!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/spark-3.0.1-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:13 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:15 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic 

In [2]:
# Spark session plus row/schema helper imports
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import (
    DateType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("PeterPan")
    .getOrCreate()
)

# Load the book text; the resulting DataFrame has a single "value" column
df = spark.read.text("peterpan.txt")
# Preview the first rows of the DataFrame
df.show()

+--------------------+
|               value|
+--------------------+
|All children, exc...|
|Of course they li...|
|The way Mr. Darli...|
|Mr. Darling used ...|
|Mrs. Darling was ...|
|Wendy came first,...|
|For a week or two...|
|“Now don't interr...|
|“I have one pound...|
|“Of course we can...|
|“Remember mumps,”...|
|There was the sam...|
|Mrs. Darling love...|
|No nursery could ...|
|He had his positi...|
|Nana also trouble...|
|Mrs. Darling firs...|
|I don't know whet...|
|Of course the Nev...|
|Of all delectable...|
+--------------------+
only showing top 20 rows



In [3]:
# Using PySpark and lambda, write code that produces the count for each word in the `Peter Pan` book
# Pipeline: one row per line -> one element per word -> (word, 1) pairs -> summed per word.
# str.split() with no argument splits on any run of whitespace and never yields
# empty strings, so repeated spaces or blank lines are not counted as a "word"
# (split(' ') would emit '' tokens in those cases).
count_rdd = (
    df.select("value").rdd
    .flatMap(lambda row: row[0].split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
)

In [4]:
# Materialize every (word, count) pair and display the resulting list
word_counts = count_rdd.collect()
word_counts

[('All', 17),
 ('children,', 6),
 ('except', 19),
 ('one,', 15),
 ('grow', 8),
 ('up.', 14),
 ('They', 101),
 ('soon', 26),
 ('know', 64),
 ('that', 551),
 ('they', 463),
 ('will', 78),
 ('up,', 17),
 ('and', 1323),
 ('the', 2152),
 ('way', 58),
 ('Wendy', 199),
 ('knew', 63),
 ('was', 897),
 ('this.', 6),
 ('One', 9),
 ('day', 11),
 ('when', 151),
 ('she', 465),
 ('two', 36),
 ('years', 3),
 ('old', 22),
 ('playing', 10),
 ('in', 623),
 ('a', 902),
 ('garden,', 1),
 ('plucked', 2),
 ('another', 24),
 ('flower', 2),
 ('ran', 17),
 ('with', 312),
 ('it', 463),
 ('to', 1139),
 ('her', 361),
 ('mother.', 9),
 ('I', 253),
 ('suppose', 7),
 ('must', 59),
 ('have', 243),
 ('looked', 33),
 ('rather', 40),
 ('delightful,', 1),
 ('for', 355),
 ('Mrs.', 72),
 ('Darling', 93),
 ('put', 40),
 ('hand', 32),
 ('heart', 13),
 ('cried,', 42),
 ('“Oh,', 24),
 ('why', 13),
 ("can't", 24),
 ('you', 351),
 ('remain', 4),
 ('like', 86),
 ('this', 154),
 ('ever!”', 1),
 ('This', 25),
 ('all', 202),
 ('passe

In [5]:
# Convert the (word, count) pairs to a DataFrame and preview it
counts_df = count_rdd.toDF()
counts_df.show()

+---------+----+
|       _1|  _2|
+---------+----+
|      All|  17|
|children,|   6|
|   except|  19|
|     one,|  15|
|     grow|   8|
|      up.|  14|
|     They| 101|
|     soon|  26|
|     know|  64|
|     that| 551|
|     they| 463|
|     will|  78|
|      up,|  17|
|      and|1323|
|      the|2152|
|      way|  58|
|    Wendy| 199|
|     knew|  63|
|      was| 897|
|    this.|   6|
+---------+----+
only showing top 20 rows



In [7]:
# Save the word counts as CSV.
# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write fails if 'peter_pan_output.csv' already exists from a previous run.
count_rdd.toDF().write.mode("overwrite").csv('peter_pan_output.csv')