In [1]:
import pyspark as ps

In [2]:
# Establish Spark session

spark = ps.sql.SparkSession.builder \
            .master("local[2]") \
            .appName("df lecture") \
            .getOrCreate()

sc = spark.sparkContext 
print("woot")

woot


## **SQL Window Functions** ##
* Start spark session
* Execute SQL queries
* Next: Add notes and continue

In [None]:
# read CSV
df_sales = spark.read.csv('data/sales.csv',
                         header=True,       # use headers or not
                         quote='"',         # char for quotes
                         sep=",",           # char for separation
                         inferSchema=True)  # do we infer schema or not ?

# Now create an SQL table and issue SQL queries against it without
# using the sqlContext but through the SparkSession object.
# Creates a temporary view of the DataFrame
df_sales.createOrReplaceTempView("sales")

In [None]:
result = spark.sql('''
    SELECT state, AVG(amount) as avg_amount
    FROM sales
    GROUP BY state
    ''')
result.show()

In [16]:
df_bike = spark.read.csv('data/2012Q1-capitalbikeshare-tripdata.csv', header=True, quote='"', sep=',', inferSchema=False)
df_bike.createOrReplaceTempView('bike_data')

In [17]:
result = spark.sql('''
    SELECT * FROM bike_data LIMIT 5
    ''')
result.show()

+----------------+-----------+-----------+--------------------+--------------------+------------------+--------------------+-----------+-----------+
|duration_seconds| start_time|   end_time|start_station_number|       start_station|end_station_number|         end_station|bike_number|member_type|
+----------------+-----------+-----------+--------------------+--------------------+------------------+--------------------+-----------+-----------+
|             475|1/1/12 0:04|1/1/12 0:11|               31245|7th & R St NW / S...|             31109|       7th & T St NW|     W01412|     Member|
|            1162|1/1/12 0:10|1/1/12 0:29|               31400|Georgia & New Ham...|             31103|16th & Harvard St NW|     W00524|     Casual|
|            1145|1/1/12 0:10|1/1/12 0:29|               31400|Georgia & New Ham...|             31103|16th & Harvard St NW|     W00235|     Member|
|             485|1/1/12 0:15|1/1/12 0:23|               31101|      14th & V St NW|             31602|Par

In [18]:
result = spark.sql('''
    SELECT duration_seconds,
       SUM(duration_seconds) OVER (ORDER BY start_time) AS running_total
    FROM bike_data
    ''')
result.show()

+----------------+-------------+
|duration_seconds|running_total|
+----------------+-------------+
|             475|        475.0|
|            1162|       2782.0|
|            1145|       2782.0|
|             485|       3738.0|
|             471|       3738.0|
|             358|       4096.0|
|            1754|       5850.0|
|             259|       6109.0|
|             516|       6625.0|
|             913|       7538.0|
|            1097|       8635.0|
|             490|       9125.0|
|            1045|      11205.0|
|            1035|      11205.0|
|            1060|      14063.0|
|            1039|      14063.0|
|             443|      14063.0|
|             316|      14063.0|
|             506|      14569.0|
|             956|      15525.0|
+----------------+-------------+
only showing top 20 rows



In [21]:
result = spark.sql('''
   SELECT start_station_number,
       duration_seconds,
       SUM(duration_seconds) OVER
         (PARTITION BY start_station_number ORDER BY start_time)
         AS running_total
   FROM bike_data
   WHERE start_time < '2012-01-08'
   ''')
result.show()

+--------------------+----------------+-------------+
|start_station_number|duration_seconds|running_total|
+--------------------+----------------+-------------+
|               31217|             841|       1613.0|
|               31217|             772|       1613.0|
|               31217|            1623|       3236.0|
|               31217|            1260|       5751.0|
|               31217|            1255|       5751.0|
|               31217|            5154|      12076.0|
|               31217|            1171|      12076.0|
|               31217|            4880|      16956.0|
|               31217|             531|      17487.0|
|               31217|            8831|      26318.0|
|               31217|            8684|      35002.0|
|               31217|            8681|      43683.0|
|               31217|            8528|      52211.0|
|               31217|             881|      53092.0|
|               31217|             858|      53950.0|
|               31217|      