In [1]:
import os

# Arguments handed to spark-submit when PySpark launches the JVM: the
# `--packages` coordinate names the DataStax Spark-Cassandra connector build
# for Scala 2.11. This has to be in the environment before a session is created.
submit_args = '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.1 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the session for this notebook. It targets the standalone
# master at sparkmaster:7077 and uses "node1" as the Cassandra contact host.
spark = (
    SparkSession.builder
    .appName("workshop-analytics")
    .config("spark.master", "spark://sparkmaster:7077")
    .config("spark.cassandra.connection.host", "node1")
    .getOrCreate()
)

In [3]:
# DataFrame over the Cassandra table energydata.generation, read through the
# connector's data source.
ct = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="generation", keyspace="energydata")
    .load()
)

In [4]:
# Print the physical plan Spark will use to scan the Cassandra-backed table.
ct.explain()

== Physical Plan ==
*Scan org.apache.spark.sql.cassandra.CassandraSourceRelation@10b23498 [region#0,type#1,ts#2,value#3] ReadSchema: struct<region:string,type:string,ts:timestamp,value:double>


In [5]:
# Disabled experiment: filter on the timestamp column and inspect the resulting plan.
# ctpd = ct.filter("ts < cast('2012-01-11' as timestamp)")
# ctpd.explain()

Grouped Aggregates

In [7]:
# Import the functions module under a namespace instead of `import *`:
# the star import would replace the Python builtins `max` and `sum`
# (among others) with their Spark column equivalents for the whole kernel.
from pyspark.sql import functions as F

# Yearly generation statistics for German solar power, largest total first.
# `month` is derived alongside `year` but is not part of the grouping below.
ct_agg = (
    ct
    .withColumn('year', F.year(ct.ts))
    .withColumn('month', F.month(ct.ts))
    .filter("type == 'solar' AND region == 'DE'")
    .groupBy('type', 'region', 'year')
    .agg(
        F.mean("value").alias("mean_generation"),
        F.max("value").alias("max_generation"),
        F.sum("value").alias("sum_generation"),
    )
    .sort(F.desc('sum_generation'))
)

In [8]:
# Uncomment to inspect the physical plan of the aggregation.
#ct_agg.explain()
# Time the action that runs the aggregation and prints the result table.
%time ct_agg.show()

+-----+------+----+------------------+--------------+--------------+
| type|region|year|   mean_generation|max_generation|sum_generation|
+-----+------+----+------------------+--------------+--------------+
|solar|    DE|2015|3984.8301369863016|       25928.0|  1.39628448E8|
|solar|    DE|2016| 3935.198906979392|       26252.0|  1.38251408E8|
|solar|    DE|2014| 3738.812263284747|       24244.0|  1.30305085E8|
|solar|    DE|2013|3389.9191370035483|       23998.0|  1.18470894E8|
|solar|    DE|2012|3174.7929758652094|       22402.0|  1.11549526E8|
|solar|    DE|2011|               0.0|           0.0|           0.0|
+-----+------+----+------------------+--------------+--------------+

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 4.57 s


In [11]:
def _read_energydata_table(table):
    """Return a DataFrame over `table` in the Cassandra keyspace "energydata".

    Uses the notebook-level `spark` session and the Spark-Cassandra
    connector data source.
    """
    return (
        spark.read
        .format("org.apache.spark.sql.cassandra")
        .options(table=table, keyspace="energydata")
        .load()
    )


# Weather station table and the sensor readings table.
ct_station = _read_energydata_table("weather_station")
ct_sensor = _read_energydata_table("weather_sensor")

In [30]:
# Stations with lat below 50 and lon above 10.
station_subset = (
    ct_station
    .filter("lat < 50 and lon > 10")
)

# Readings whose `sensor` column equals 'h2'.
sensor_subset = (
    ct_sensor
    .filter("sensor=='h2'")
)

In [31]:
# Number of rows in the 'h2' sensor subset.
sensor_subset.count()

2248704

In [36]:
# Number of stations that pass the lat/lon filter.
station_subset.count()

40

Joining with the Dataset/DataFrame API is limited: joins are not pushed down to Cassandra.

To have the join pushed down, you would need to use the RDD API (Scala only) via `rdd.joinWithCassandraTable()`.

In [35]:
from pyspark.sql.functions import *

%time station_subset.join(sensor_subset, station_subset.id == sensor_subset.id).count()

CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 44.7 s


351360