In [9]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [10]:
## Create Spark Session - reuses an already-running session if one exists, otherwise starts one
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [19]:
## Setup Schema - declare the column names/types up front so they are enforced on read
## Each pair is (column name, Spark type); the third StructField argument marks the column nullable
schema = StructType(fields=[
    StructField(col_name, col_type, True)
    for col_name, col_type in (('age', IntegerType()), ('name', StringType()))
])

In [20]:
## Read in Data - the explicit schema is applied, so column types are NOT inferred from the file
df = (
    spark.read
    .schema(schema)
    .json('gs://spark-training-data/datasets/people.json')
)

In [21]:
## Show Data
## show() prints the rows as a text table; printSchema() prints each column's type and nullability
df.show()
df.printSchema()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [22]:
## Show column names - `columns` is an attribute (no parentheses) holding a list of name strings
df.columns

['age', 'name']

In [23]:
## Statistical Summary of df - describe() only builds the summary; add .show() to print it
## Note: null values are not counted (age count is 2 while name count is 3)
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [26]:
## Accessing Data - df['age'] on its own evaluates to a Column object, not the column's values
## Use df.select('some column').show() to actually see the data
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [28]:
## df.head(n) Example - returns a plain Python list of Row objects, which can be indexed
## Collect the rows once and index the list, instead of running the same Spark action twice
## (only the last expression of a cell is displayed, so a bare df.head(2) line shows nothing)
head_rows = df.head(2)
head_rows[0]

Row(age=None, name='Michael')

In [29]:
## Multiple column select - pass each column name as its own argument
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [32]:
## Adding new columns - withColumn() hands back a NEW df containing the extra column
## df itself is untouched; assign the result to a name if the new column should be kept
df.withColumn(
    'double_age',
    df.age * 2,
).show()

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [33]:
## Renaming a column - both the existing and the new name are plain strings
## The renamed df is a new object; df keeps its original column name
df.withColumnRenamed(existing='age', new='ny_new_age').show()

+----------+-------+
|ny_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



In [38]:
## Pure SQL example - useful if analysts already know SQL
## Register the df as a temporary view so it can be queried by name ('people')
df.createOrReplaceTempView('people')
## The query result comes back as a DataFrame, so .show() is still needed to print it
results = spark.sql('SELECT * FROM people WHERE age = 30')
results.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

