## Imports 
`helpers` is a local module, and `SparkSession.builder` is the builder used to start (or reuse) a Spark session on your machine

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from helpers import display

## Start spark

In [None]:
# Build (or reuse, if one already exists) a Spark session named 'intro'
# that runs locally with two worker threads.
spark = SparkSession.builder.appName('intro').master('local[2]').getOrCreate()

## Create a DataFrame

In [None]:
# Six (animal, name, age) records. Only column names are supplied as the
# schema, so Spark infers each column's type from the data.
df = spark.createDataFrame(
    [
        ('cat', 'Fiona', 11),
        ('dog', 'Sophie', 7),
        ('dog', 'Shultz', 7),
        ('cat', 'Paddington', 1),
        ('dog', 'Belle', 4),
        ('goat', 'Byron', 2),
    ],
    ['animal', 'name', 'age'],
)

### DataFrame:

In [None]:
# Bare expression: the notebook shows the DataFrame's repr (column names and
# types), not its rows.
df

In [None]:
# Print the rows as a plain-text table (show() prints up to 20 rows by default).
df.show()

In [None]:
# `display` comes from the local helpers module; presumably the second
# argument caps the number of rows rendered -- confirm in helpers.py.
display(df, 10)

### Resilient Distributed Dataset (RDD):

In [None]:
# Access the RDD of Row objects that backs this DataFrame.
df.rdd

In [None]:
# Pull every row back to the driver as a Python list of Row objects.
# Fine for this tiny dataset; avoid on large DataFrames.
df.collect()

### Column object:

In [None]:
# Attribute access returns a Column object (an expression), not the values.
df.age

In [None]:
# Same idea via pyspark.sql.functions: a Column referenced by name only,
# not tied to a particular DataFrame until it is used in one.
f.col('age')

In [None]:
# Column objects support arithmetic: select a derived column of age plus two.
display(df.select(f.col('age') + 2))

# (notice order difference)

### Partitions:

In [None]:
# How many partitions the DataFrame's underlying RDD is currently split into.
df.rdd.getNumPartitions()

In [None]:
# repartition returns a new DataFrame spread over 4 partitions; rebind df to it.
df = df.repartition(4)

In [None]:
# Check the partition count again now that df has been repartitioned.
df.rdd.getNumPartitions()

### Lazy eval:

In [None]:
%%time

# Only describes the aggregation: no action is called on the result, so
# Spark does not execute a job here and the cell returns almost immediately.
df.groupBy('animal').count()

In [None]:
%%time

# Same aggregation, now handed to display -- presumably this materializes
# the result, so the job actually runs; compare the timing with the
# previous cell. (Confirm display's behavior in helpers.py.)
display(df.groupBy('animal').count())

In [None]:
# Shut down the session and its underlying SparkContext to release resources.
spark.stop()