# In [None]:
import pyspark
# Reuse the SparkContext that is already running, if there is one, and only
# start a new local one ('local[*]' = run locally on all available cores)
# when there is none. Calling the SparkContext constructor directly raises a
# ValueError when this code is run a second time in the same session, because
# only one SparkContext may be active at a time.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

# We have access to Spark through the variable sc
# The integers 0 through 1,000,000 (inclusive); this list lives on the driver machine.
nums = list(range(10 ** 6 + 1))

# It is called the driver because it tells the other machines what to do.
# Spark distributes this data across the other machines in our cluster, which
# do the processing for us to speed it up. A cluster is the group of machines
# that are dedicated to performing this task.

# The first thing we need to do is tell Spark which data to distribute.
# Normally you would load it from a file, but for this tutorial we will just
# use the list we made.

# parallelize takes in a Python list and distributes it into an RDD
# (resilient distributed dataset). Every item in an RDD can be, for example,
# a number, a list, or a tuple.

# sc.parallelize() distributes the driver-side list and hands back an RDD,
# which we keep in nums_rdd.
nums_rdd = sc.parallelize(nums)

# collect() returns a Python list holding every element of the RDD. It is a
# scary operation because all of that data is distributed across our many
# different machines.
nums_rdd.collect()

# take() is the safer alternative to collect(): it is given the number of
# elements we want and returns only that many, as a Python list. Here we ask
# for the first five elements of nums_rdd.
nums_rdd.take(5)

# To apply a function to every element of an RDD, use .map(). It accepts
# either a lambda or a named function; here every number is squared.
squared_nums_rdd = nums_rdd.map(lambda value: value ** 2)

# Turn every element into a tuple: the first value is the number itself and
# the second value is how many digits that number has.
pairs = nums_rdd.map(lambda number: (number, len(str(number))))
# Look at the first 25 tuples to check that this worked.
pairs.take(25)

# map is awesome and we will use it all the time, but one thing it does not
# do is remove things from your RDD. To do that, we use filter.
# Just like map, filter is passed a function, but this time the function
# returns True or False.
# Suppose we only want the numbers that have an even number of digits.
# REMEMBER: each element is a tuple right now, so pair[0] is the number and
# pair[1] is the number of digits.
even_pairs = pairs.filter(lambda pair: pair[1] % 2 == 0)