## Imports 
`helpers` is a local module (it provides the `display` function used below), and `SparkSession.builder` is the builder that starts Spark on your machine.

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from helpers import display

## Start Spark

In [2]:
# Reuse an existing session if one is already running; otherwise start a local
# Spark instance named 'intro' with two worker threads ('local[2]').
spark = SparkSession.builder.appName('intro').master('local[2]').getOrCreate()

## Create a DataFrame

In [3]:
# Six (animal, name, age) records. The schema is given as column names only,
# so Spark infers each column's type from the Python values.
df = spark.createDataFrame(
    [
        ('cat', 'Fiona', 11),
        ('dog', 'Sophie', 7),
        ('dog', 'Shultz', 7),
        ('cat', 'Paddington', 1),
        ('dog', 'Belle', 4),
        ('goat', 'Byron', 2),
    ],
    ['animal', 'name', 'age'],
)

### DataFrame:

In [4]:
# The bare repr lists only the column names and types; no rows are shown.
df

DataFrame[animal: string, name: string, age: bigint]

In [5]:
# Print the rows as a plain-text table.
df.show()

+------+----------+---+
|animal|      name|age|
+------+----------+---+
|   cat|     Fiona| 11|
|   dog|    Sophie|  7|
|   dog|    Shultz|  7|
|   cat|Paddington|  1|
|   dog|     Belle|  4|
|  goat|     Byron|  2|
+------+----------+---+



In [6]:
# Render the frame with the local `display` helper.
# NOTE(review): the second argument (10) is presumably a row limit — confirm in helpers.
display(df, 10)

Unnamed: 0,animal,name,age
0,cat,Fiona,11
1,dog,Sophie,7
2,dog,Shultz,7
3,cat,Paddington,1
4,dog,Belle,4
5,goat,Byron,2


### Resilient Distributed Dataset (RDD):

In [7]:
# A DataFrame exposes its rows as an RDD through the `.rdd` attribute.
df.rdd

MapPartitionsRDD[14] at javaToPython at NativeMethodAccessorImpl.java:0

In [8]:
# collect() returns every row as a Python list of Row objects.
df.collect()

[Row(animal='cat', name='Fiona', age=11),
 Row(animal='dog', name='Sophie', age=7),
 Row(animal='dog', name='Shultz', age=7),
 Row(animal='cat', name='Paddington', age=1),
 Row(animal='dog', name='Belle', age=4),
 Row(animal='goat', name='Byron', age=2)]

### Column object:

In [9]:
# Attribute access yields a Column object, not the column's values.
df.age

Column<b'age'>

In [10]:
# f.col builds a Column from a name alone, without referencing df.
f.col('age')

Column<b'age'>

In [11]:
# Arithmetic on a Column produces a new Column; select() evaluates it for each row.
display(df.select(f.col('age') + 2))

# (notice order difference)
# NOTE(review): the values shown follow the input order and only five of the
# six rows appear — confirm what "order difference" refers to.

Unnamed: 0,(age + 2)
0,13
1,9
2,9
3,3
4,6


### Partitions:

In [12]:
# Number of partitions the DataFrame's underlying RDD is currently split into.
df.rdd.getNumPartitions()

2

In [13]:
# Redistribute the rows across four partitions; `df` is rebound to the result.
df = df.repartition(4)

In [14]:
# Check the partition count again after repartitioning.
df.rdd.getNumPartitions()

4

### Lazy evaluation:

In [15]:
%%time

# This only defines the aggregation: the result is another DataFrame and the
# cell returns almost immediately because no rows are computed yet.
df.groupBy('animal').count()

CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 119 ms


DataFrame[animal: string, count: bigint]

In [16]:
%%time

# Displaying the result needs actual rows, so this is where Spark runs the
# aggregation — note the much longer wall time than the previous cell.
display(df.groupBy('animal').count())

CPU times: user 30 ms, sys: 40 ms, total: 70 ms
Wall time: 5.38 s


Unnamed: 0,animal,count
0,dog,3
1,cat,2
2,goat,1


In [17]:
# Shut down the Spark session now that the walkthrough is finished.
spark.stop()