In [27]:
# Based on the Spark 2.3.0 RDD Programming Guide: https://spark.apache.org/docs/2.3.0/rdd-programming-guide.html

from pyspark.sql import SparkSession
from pyspark.sql import Row

# Obtain a SparkSession (reusing an active one if present) for this example.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

# Read books.csv, treating the first line as column names and letting Spark
# infer the column types, then display the resulting schema and rows.
data = spark.read.csv("books.csv", header=True, inferSchema=True)
print(data.schema)
data.show()

# The cells below work on the DataFrame's underlying RDD of Row objects.
books = data.rdd

StructType(List(StructField(title,StringType,true),StructField(author,StringType,true),StructField(type,StringType,true),StructField(price,IntegerType,true)))
+------------------+---------+--------+-----+
|             title|   author|    type|price|
+------------------+---------+--------+-----+
|      Where's Spot|Eric Hill|Children|   10|
|The Cat In The Hat|Dr. Seuss|Children|   15|
+------------------+---------+--------+-----+



In [21]:
# map() - runs the function on every element and yields a new RDD of the results;
# collect() then returns that RDD's contents to the driver as a Python list.
results = (
    books
    .map(lambda book: book.title + " by " + book.author)
    .collect()
)
print(type(results))
print(results)

<class 'list'>
["Where's Spot by Eric Hill", 'The Cat In The Hat by Dr. Seuss']


In [19]:
# flatMap() - the function returns an iterable for each element, and the items
# of all those iterables are flattened into a single RDD (here: one word per item).
results = (
    books
    .flatMap(lambda book: book.title.split(' '))
    .collect()
)
print(type(results))
print(results)

<class 'list'>
["Where's", 'Spot', 'The', 'Cat', 'In', 'The', 'Hat']


In [28]:
# Filter - keep only the elements for which the predicate returns True.
# The inferred schema marks price as nullable, and in Python 3 `None > 12`
# raises TypeError, so rows without a price are skipped explicitly.
results = books.filter(lambda b: b.price is not None and b.price > 12).collect()
print(type(results))
print(results)

<class 'list'>
[Row(title='The Cat In The Hat', author='Dr. Seuss', type='Children', price=15)]


# Key Value Pair transformations

In [None]:
# groupByKey() - called on an RDD of (K, V) pairs, returns an RDD of (K, Iterable<V>) pairs.
# If the grouping is only a step towards an aggregation (e.g. a sum or a count), use reduceByKey() instead.

In [None]:
# reduceByKey() - called on an RDD of (K, V) pairs, merges the values of each key
# with the given (V, V) -> V function and returns an RDD of (K, V) pairs.

# Group-by-count: emit (type, 1) for every book, then add up the ones per type.
kv = books.map(lambda book: (book.type, 1))
# reduceByKey is a transformation, so it is evaluated lazily: no work happens
# until an action (collect() below) is invoked.
countsRDD = kv.reduceByKey(lambda left, right: left + right)
print(type(countsRDD))
print(countsRDD)  # prints the RDD object itself, not the computed counts

# collect() is the action that triggers the computation and returns the pairs.
counts = countsRDD.collect()
print(type(counts))
print(counts)