# Create Spark session

In [7]:
import os

from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession for this notebook.
#
# Choosing the master URL:
#   * "local[*]" runs in pseudo (local) mode using all available CPU cores.
#   * "spark://<IP-address>:7077" -- the URL shown in the Spark web UI --
#     enables cluster mode, e.g. spark://JIAYU1AB6.localdomain:7077
#
# getOrCreate() hands back a session that already exists in this kernel, so
# shut down and restart the notebook whenever you switch between modes.
spark = (
    SparkSession.builder
    .master("spark://127.0.0.1:7077")
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Create DataFrames

In [8]:
# spark is an existing SparkSession
# Locate the sample data shipped with the Spark distribution. Honor SPARK_HOME
# when it is set (and non-empty) so the notebook is not tied to one machine's
# layout; otherwise fall back to the original install location.
spark_home = os.environ.get("SPARK_HOME") or "/home/sparky/spark-3.0.3-bin-hadoop3.2"
people_json_path = os.path.join(spark_home, "examples/src/main/resources/people.json")

df = spark.read.json(people_json_path)
# Displays the content of the DataFrame to stdout
df.show()
# Expected output:
# +----+-------+
# | age|   name|
# +----+-------+
# |null|Michael|
# |  30|   Andy|
# |  19| Justin|
# +----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



# Untyped Dataset Operations (aka DataFrame Operations)

In [None]:
# Reuses `spark` and `df` created in the earlier cells.

# Show the DataFrame schema as a tree.
df.printSchema()
# Expected output:
# root
# |-- age: long (nullable = true)
# |-- name: string (nullable = true)

# Projection: keep just the "name" column.
names_only = df.select("name")
names_only.show()
# Expected output:
# +-------+
# |   name|
# +-------+
# |Michael|
# |   Andy|
# | Justin|
# +-------+

# Projection with a computed column: every person, with age bumped by 1.
name_col = df["name"]
age_plus_one = df["age"] + 1
names_with_next_age = df.select(name_col, age_plus_one)
names_with_next_age.show()
# Expected output:
# +-------+---------+
# |   name|(age + 1)|
# +-------+---------+
# |Michael|     null|
# |   Andy|       31|
# | Justin|       20|
# +-------+---------+

# Row filter: only people whose age is greater than 21.
is_over_21 = df["age"] > 21
people_over_21 = df.filter(is_over_21)
people_over_21.show()
# Expected output:
# +---+----+
# |age|name|
# +---+----+
# | 30|Andy|
# +---+----+

# Aggregation: number of people for each distinct age value.
people_per_age = df.groupBy("age").count()
people_per_age.show()
# Expected output:
# +----+-----+
# | age|count|
# +----+-----+
# |  19|    1|
# |null|    1|
# |  30|    1|
# +----+-----+

# Running SQL Queries Programmatically

In [None]:
# Expose the DataFrame to Spark SQL as a temporary view named "people".
df.createOrReplaceTempView("people")

# Run a plain SQL statement against that view and display the result.
people_query = "SELECT * FROM people"
sqlDF = spark.sql(people_query)
sqlDF.show()
# Expected output:
# +----+-------+
# | age|   name|
# +----+-------+
# |null|Michael|
# |  30|   Andy|
# |  19| Justin|
# +----+-------+

# Convert Spark DataFrame to Pandas DataFrame

In [None]:
# toPandas() collects the entire result onto the driver as a pandas DataFrame,
# so only call it on results small enough to fit in driver memory.
pandasDf = sqlDF.toPandas()

# End the cell with the frame itself so Jupyter renders it as a rich table
# rather than the plain-text dump that print() produces.
pandasDf

In [None]:
from scripts import raw_data