In [1]:
import os
# Ask PySpark to pull the DataStax Spark-Cassandra connector (Scala 2.11 build, v2.0.1)
# when it launches; the trailing `pyspark-shell` token is part of the expected value.
# NOTE(review): this presumably has to run before the SparkSession is created — confirm.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.1 pyspark-shell',
})

In [2]:
from pyspark.sql import SparkSession
# Create (or reuse) the notebook-wide SparkSession: it targets the standalone
# master at sparkmaster:7077 and tells the Cassandra connector to contact `node1`.
spark = (
    SparkSession.builder
    .appName("workshop-analytics")
    .master("spark://sparkmaster:7077")  # same as .config("spark.master", ...)
    .config("spark.cassandra.connection.host", "node1")
    .getOrCreate()
)

In [4]:
# DataFrame over the Cassandra table `energydata.generation`, read through the
# connector's data source.
ct = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .option("table", "generation")
    .option("keyspace", "energydata")
    .load()
)

In [5]:
# Print the physical plan Spark will use to scan the generation table.
ct.explain()


== Physical Plan ==
*Scan org.apache.spark.sql.cassandra.CassandraSourceRelation@785f32c2 [region#9,type#10,ts#11,value#12] ReadSchema: struct<region:string,type:string,ts:timestamp,value:double>


In [17]:
# Preview the first rows of the generation table (region, type, ts, value).
ct.show()

+------+----+-------------------+-------+
|region|type|                 ts|  value|
+------+----+-------------------+-------+
|    DE|wind|2016-12-31 22:45:00|15236.0|
|    DE|wind|2016-12-31 22:30:00|15074.0|
|    DE|wind|2016-12-31 22:15:00|14997.0|
|    DE|wind|2016-12-31 22:00:00|14915.0|
|    DE|wind|2016-12-31 21:45:00|14880.0|
|    DE|wind|2016-12-31 21:30:00|14876.0|
|    DE|wind|2016-12-31 21:15:00|14818.0|
|    DE|wind|2016-12-31 21:00:00|14816.0|
|    DE|wind|2016-12-31 20:45:00|15294.0|
|    DE|wind|2016-12-31 20:30:00|15449.0|
|    DE|wind|2016-12-31 20:15:00|15391.0|
|    DE|wind|2016-12-31 20:00:00|15284.0|
|    DE|wind|2016-12-31 19:45:00|15292.0|
|    DE|wind|2016-12-31 19:30:00|15268.0|
|    DE|wind|2016-12-31 19:15:00|15290.0|
|    DE|wind|2016-12-31 19:00:00|15361.0|
|    DE|wind|2016-12-31 18:45:00|15371.0|
|    DE|wind|2016-12-31 18:30:00|15251.0|
|    DE|wind|2016-12-31 18:15:00|15126.0|
|    DE|wind|2016-12-31 18:00:00|15034.0|
+------+----+-------------------+-

In [None]:
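# Optional experiment (left disabled): filter on the timestamp column and inspect the resulting plan.
# ctpd = ct.filter("ts < cast('2012-01-11' as timestamp)")
# ctpd.explain()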

Grouped Aggregates

In [6]:
# Namespaced import instead of `from pyspark.sql.functions import *`: the star
# import silently replaces Python's built-in max/sum/round (among others) for
# every later cell in the kernel.
from pyspark.sql import functions as F

# Monthly solar generation for region DE:
#   max_generation_MW  - highest 15-minute reading within the month
#   sum_generation_GWh - energy produced in the month, rounded to whole GWh
# Result is ordered from the most to the least productive month.
ct_agg = (
    ct
    .withColumn('year', F.year(ct.ts))
    .withColumn('month', F.month(ct.ts))
    .filter("type == 'solar' AND region == 'DE'")
    .groupBy('type', 'region', 'year', 'month')
    .agg(
        F.max("value").alias("max_generation_MW"),
        # divide by 4*10^3 because we have 15 min MW values:
        # /4 turns a 15-minute MW reading into MWh, /10^3 turns MWh into GWh
        F.sum(F.col("value") / (4 * 10**3)).alias("sum_generation_GWh"),
    )
    .withColumn('sum_generation_GWh', F.round('sum_generation_GWh', 0))
    .sort(F.desc('sum_generation_GWh'))
)

In [16]:
#ct_agg.explain()

In [9]:
# Run the aggregation and display its first rows; %time reports how long the call took.
%time ct_agg.show()

+-----+------+----+-----+-----------------+------------------+
| type|region|year|month|max_generation_MW|sum_generation_GWh|
+-----+------+----+-----+-----------------+------------------+
|solar|    DE|2013|    7|          23998.0|            5129.0|
|solar|    DE|2016|    7|          25688.0|            4943.0|
|solar|    DE|2015|    7|          24731.0|            4918.0|
|solar|    DE|2014|    6|          24244.0|            4834.0|
|solar|    DE|2016|    6|          26201.0|            4767.0|
|solar|    DE|2016|    8|          25371.0|            4720.0|
|solar|    DE|2016|    5|          26252.0|            4717.0|
|solar|    DE|2015|    8|          24429.0|            4613.0|
|solar|    DE|2015|    6|          24847.0|            4553.0|
|solar|    DE|2015|    4|          25928.0|            4435.0|
|solar|    DE|2014|    7|          23624.0|            4417.0|
|solar|    DE|2015|    5|          22453.0|            4412.0|
|solar|    DE|2013|    6|          23203.0|            

In [10]:
def _read_energydata_table(table_name):
    """Return a DataFrame over `table_name` in the `energydata` Cassandra keyspace."""
    return (
        spark.read
        .format("org.apache.spark.sql.cassandra")
        .option("table", table_name)
        .option("keyspace", "energydata")
        .load()
    )


# Weather station table and the sensor readings table, used for the join below.
ct_station = _read_energydata_table("weather_station")
ct_sensor = _read_energydata_table("weather_sensor")

In [11]:
# Stations with lat < 50 and lon > 10.
station_subset = ct_station.where("lat < 50 and lon > 10")

# Readings of the `h2` sensor only.
sensor_subset = ct_sensor.where("sensor=='h2'")

In [12]:
# Number of `h2` sensor readings that will take part in the join.
sensor_subset.count()

2248704

In [13]:
# Number of stations selected by the lat/lon filter.
station_subset.count()

40

Joining with the Datasets API is limited: joins are not pushed down to Cassandra.

To get a pushed-down join you would need to use the RDD API (Scala only) via `rdd.joinWithCassandraTable()`.

In [14]:
from pyspark.sql.functions import *

# Join the filtered stations with the filtered sensor readings on `id` and count
# the matches; %time shows how long the join takes.
%time station_subset.join(sensor_subset, station_subset.id == sensor_subset.id).count()

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 29.9 s


351360

In [None]:
# Print the physical plan of the same join to inspect how Spark executes it.
station_subset.join(sensor_subset, station_subset.id == sensor_subset.id).explain()