In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

In [0]:
# Explicit column layout for the tips CSV files. Each field is declared with
# its Spark type and with the nullable flag (third argument) set to False.
schema = StructType(
    [
        StructField("total_bill", FloatType(), False),
        StructField("tip", FloatType(), False),
        StructField("sex", StringType(), False),
        StructField("smoker", StringType(), False),
        StructField("day", StringType(), False),
        StructField("time", StringType(), False),
        StructField("size", IntegerType(), False),
    ]
)

In [0]:
# Create the DBFS folder that will serve as the landing directory for the
# streaming CSV files.
dbutils.fs.mkdirs(
    "dbfs:/FileStore/datasets/tips_data_source_stream"
)

In [0]:
# Define a streaming reader over the source folder: CSV files with a header
# row, parsed with the explicit schema instead of schema inference.
tips_data_source_stream = (
    spark.readStream
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("dbfs:/FileStore/datasets/tips_data_source_stream")
)

In [0]:
# Report the DataFrame's isStreaming flag.
print(
    "Is this streaming data? ",
    tips_data_source_stream.isStreaming,
)

In [0]:
# Start displaying the stream. The source folder is empty at this point, so
# upload new CSV files into it to see rows appear in the output.
tips_data_source_stream.display()

total_bill,tip,sex,smoker,day,time,size
16.0,2.0,Male,Yes,Thur,Lunch,2
21.01,3.0,Male,Yes,Fri,Dinner,2
30.14,3.09,Female,Yes,Sat,Dinner,4
17.81,2.34,Male,No,Sat,Dinner,4
14.07,2.5,Male,No,Sun,Dinner,2
7.74,1.44,Male,Yes,Sat,Dinner,2
13.94,3.06,Male,No,Sun,Dinner,2
32.83,1.17,Male,Yes,Sat,Dinner,2
25.89,5.16,Male,Yes,Sat,Dinner,4
48.33,9.0,Male,No,Sat,Dinner,4


In [0]:
# Try to run this query and the one in the next cell almost simultaneously.
# Average tip per day of the week.
(
    tips_data_source_stream
    .groupby("day")
    .agg({"tip": "avg"})
    .display()
)

day,avg(tip)
Thur,2.7714516078272173
Sun,3.258441553487406
Sat,2.9783333367771574
Fri,2.734736856661345


In [0]:
# Average tip per meal time (Lunch / Dinner).
(
    tips_data_source_stream
    .groupby("time")
    .agg({"tip": "avg"})
    .display()
)

time,avg(tip)
Lunch,2.728088231647716
Dinner,3.09572222299046


Now we'll upload an XML file containing the scheduler configuration for our cluster. It defines 2 pools and has the following content:

<?xml version="1.0"?>
<allocations>
  <pool name="devPool">
    <schedulingMode>FAIR</schedulingMode>
    <weight>1</weight>
    <minShare>2</minShare>
  </pool>
  <pool name="prodPool">
    <schedulingMode>FAIR</schedulingMode>
    <weight>1</weight>
    <minShare>2</minShare>
  </pool>
</allocations>

In [0]:
# Point Spark at the pool definition file uploaded to DBFS.
# setLocalProperty only affects jobs submitted from the same thread.
# NOTE(review): per the Spark job-scheduling documentation,
# spark.scheduler.allocation.file is a SparkConf setting consulted when the
# scheduler is set up, not a per-thread local property, so this call probably
# does not load fairscheduler.xml. Confirm, and consider setting it (together
# with spark.scheduler.mode=FAIR) in the cluster's Spark config instead.
spark.sparkContext.setLocalProperty("spark.scheduler.allocation.file", "dbfs:/FileStore/fairscheduler.xml")

In [0]:
# Assign the jobs submitted from this cell to the devPool scheduler pool,
# then start the per-day average-tip query.
spark.sparkContext.setLocalProperty(
    "spark.scheduler.pool",
    "devPool",
)

(
    tips_data_source_stream
    .groupby("day")
    .agg({"tip": "avg"})
    .display()
)

day,avg(tip)
Thur,2.7714516078272173
Sun,3.258441553487406
Sat,2.9783333367771574
Fri,2.734736856661345


In [0]:
# Note: setLocalProperty has to be specified in each cell separately.
# Assign the jobs submitted from this cell to the prodPool scheduler pool,
# then start the per-meal-time average-tip query.
spark.sparkContext.setLocalProperty(
    "spark.scheduler.pool",
    "prodPool",
)

(
    tips_data_source_stream
    .groupby("time")
    .agg({"tip": "avg"})
    .display()
)

time,avg(tip)
Lunch,2.728088231647716
Dinner,3.09572222299046
