In [1]:
from pyspark import SparkContext

In [2]:
# Reuse the SparkContext already running in this kernel, or start one if none
# exists, so re-running this cell does not try to create a second context.
sc = SparkContext.getOrCreate()

Parallelized collections are created by calling SparkContext’s parallelize 
method on an existing iterable or collection in the driver program. The elements 
of the collection are copied to form a distributed dataset that can be operated 
on in parallel. For example, here is how to create a parallelized collection 
holding the numbers 1 to 5:

In [3]:
data = [1, 2, 3, 4, 5]

# Keep `distData` bound to the distributed dataset itself (not to the plain
# list that collect() returns), so the name refers to the same kind of object
# in every cell that uses it.
distData = sc.parallelize(data)

# collect() copies every element back to the driver as a Python list; that is
# fine for this five-element example and is only done here for display.
distData.collect()

[1, 2, 3, 4, 5]

In [4]:
# Distribute the driver-side list, then fold the elements pairwise with
# addition to obtain their sum.
distData = sc.parallelize(data)
distData.reduce(lambda total, value: total + value)

15

## DataFrame Operations

In [2]:
# The original cell also ran `import pyspark as spark`, but that alias was
# overwritten by the session assignment below before it could ever be used,
# so it has been removed to keep `spark` unambiguous.
from pyspark.sql import SparkSession

# Build the session used by the DataFrame and SQL cells below; getOrCreate()
# returns the existing session when one is already active.
spark = (
    SparkSession.builder
    .appName('Python Spark SQL example')
    # NOTE(review): this looks like a placeholder key/value carried over from
    # the Spark SQL guide - confirm, then replace it with a real option or drop it.
    .config("spark.some.config.option", 'some-value')
    .getOrCreate()
)

In [3]:
# Load the JSON file into a DataFrame (no schema is supplied, so Spark works
# it out from the data) and print the rows as a text table.
people_path = "people.json"
df = spark.read.json(people_path)
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [4]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [5]:
# Project the DataFrame down to the single "name" column and display it.
name_only = df.select(df['name'])
name_only.show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [6]:
# Show every name next to the age incremented by one; a null age stays null.
name_col = df['name']
age_plus_one = df['age'] + 1
df.select(name_col, age_plus_one).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [7]:
# Keep only the rows whose age is greater than 21; rows with a null age do not
# satisfy the comparison and are therefore left out.
is_over_21 = df['age'] > 21
df.filter(is_over_21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [8]:
# Count how many people share each age; null is reported as its own group.
people_per_age = df.groupBy('age').count()
people_per_age.show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



## SQL Queries

In [11]:
# Expose the DataFrame to Spark SQL under the view name "people"
# (createOrReplaceTempView overwrites any existing view of that name, so the
# cell can be re-run), then query it with plain SQL.
df.createOrReplaceTempView("people")

people_query = "SELECT * FROM people"
sqlDF = spark.sql(people_query)
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

