# Dataframe Operations

In [19]:
# Based on Spark 2.3.0 Doc

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()

# Create a DataFrame
books = spark.read.csv("books.csv", header=True)
books.show()


+------------------+---------+--------+-----+
|             title|   author|    type|price|
+------------------+---------+--------+-----+
|      Where's Spot|Eric Hill|Children|   10|
|The Cat In The Hat|Dr. Seuss|Children|   15|
+------------------+---------+--------+-----+



In [9]:
# DataFrame operations

# Print Schema in tree format
books.printSchema()

root
 |-- title: string (nullable = true)
 |--  author: string (nullable = true)
 |--  type: string (nullable = true)
 |--  price: string (nullable = true)



In [10]:
# Show a single column
books.select("title").show()

+------------------+
|             title|
+------------------+
|      Where's Spot|
|The Cat In The Hat|
+------------------+



In [16]:
# Filter by a condition
books.filter(books["price"] > 12).show()

+------------------+---------+--------+-----+
|             title|   author|    type|price|
+------------------+---------+--------+-----+
|The Cat In The Hat|Dr. Seuss|Children|   15|
+------------------+---------+--------+-----+



In [20]:
# Count for each type group
books.groupBy("type").count().show()

+--------+-----+
|    type|count|
+--------+-----+
|Children|    2|
+--------+-----+



In [21]:
# SQL example

# Register as temporary view within this spark session
books.createOrReplaceTempView("books")
sqlDf = spark.sql("select * from books where price > 12")
sqlDf.show()

+------------------+---------+--------+-----+
|             title|   author|    type|price|
+------------------+---------+--------+-----+
|The Cat In The Hat|Dr. Seuss|Children|   15|
+------------------+---------+--------+-----+



In [24]:
# Saving a dataset
sqlDf = sqlDf.select("title", "author", "price").write.save("_results.csv")

In [29]:
# Convert to pandas dataframe
booksPd = books.toPandas()
booksPd.head()

Unnamed: 0,title,author,type,price
0,Where's Spot,Eric Hill,Children,10
1,The Cat In The Hat,Dr. Seuss,Children,15
