In [0]:
import json

from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.utils import AnalysisException

#### Join Static Data with Streaming Data

In [0]:
# Schema of the static movie data (CSV): every column is declared nullable
static_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("year", IntegerType(), True)
    .add("director", StringType(), True)
    .add("writer", StringType(), True)
    .add("star", StringType(), True)
)

In [0]:
# Load the movie CSV files from DBFS into a batch DataFrame.
# This DataFrame is the static side of the stream-static joins.
static_data = (
    spark.read
    .format("csv")
    .options(header="true")
    .schema(static_schema)
    .load("dbfs:/FileStore/datasets/movies/")
)

static_data.display()

name,year,director,writer,star
The Shining,1980,Stanley Kubrick,Stephen King,Jack Nicholson
The Blue Lagoon,1980,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields
Star Wars: Episode V - The Empire Strikes Back,1980,Irvin Kershner,Leigh Brackett,Mark Hamill
Airplane!,1980,Jim Abrahams,Jim Abrahams,Robert Hays
Caddyshack,1980,Harold Ramis,Brian Doyle-Murray,Chevy Chase
Friday the 13th,1980,Sean S. Cunningham,Victor Miller,Betsy Palmer
The Blues Brothers,1980,John Landis,Dan Aykroyd,John Belushi
Raging Bull,1980,Martin Scorsese,Jake LaMotta,Robert De Niro
Superman II,1980,Richard Lester,Jerry Siegel,Gene Hackman


In [0]:
# connect to Event Hub as a source for streaming data

# The namespace connection string embeds a shared access key, so it is read from a
# Databricks secret scope instead of being hard-coded in the notebook.
# NOTE(review): the scope/key names are placeholders - create the secret (or adjust the
# names) before running. The secret value must have the form
# "Endpoint=sb://<namespace>.servicebus.windows.net/;SharedAccessKeyName=<name>;SharedAccessKey=<key>"
# (no trailing ";"). The key that used to be committed in this cell must be rotated.
primaryKey = dbutils.secrets.get(scope="eventhub", key="ratings-namespace-connection-string")
entityPath = "EntityPath=eventhub-ratings"

# Full connection string = namespace connection string + the target Event Hub entity
connectionString = primaryKey + ";" + entityPath

ehConf = {}

# this configuration reads data from the beginning of the stream
startOffset = "-1"

# Starting position passed to the connector as JSON; only "offset" carries a real value here
startingEventPosition = {
  "offset": startOffset,
  "seqNo": -1,
  "enqueuedTime": None,
  "isInclusive": True
}

# The connection string is passed through the connector's encrypt helper before being stored
ehConf['eventhubs.connectionString'] = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString)
ehConf["eventhubs.startingPosition"] = json.dumps(startingEventPosition)

In [0]:
# Schema used to parse the streamed rating records: every field is declared nullable
streaming_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("rating", StringType(), True)
    .add("score", FloatType(), True)
)

In [0]:
# Prerequisite: run the source notebook ("RatingsSource") first so that it pushes data to the Event Hub.
# Open a streaming read on the Event Hub (events arrive in JSON format)
streaming_data = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf)
    .load()
)

In [0]:
# Keep only the event payload: cast the `body` column to a string and drop every other column.
# NOTE(review): presumably the connector delivers `body` as binary, hence the cast - confirm
streaming_data = streaming_data.selectExpr("CAST(body AS STRING)")

# Render the streaming DataFrame in the notebook
streaming_data.display()

body
"{ ""name"": ""Star Wars: Episode V - The Empire Strikes Back"", ""rating"": ""PG"", ""score"": 8.2}"
"{ ""name"": ""Fame"", ""rating"": ""R"", ""score"": 6.6}"
"{ ""name"": ""Friday the 13th"", ""rating"": ""R"", ""score"": 6.1}"
"{ ""name"": ""Lagaan"", ""rating"": ""PG"", ""score"": 9.2 }"
"{ ""name"": ""Andhadhun"", ""rating"": ""PG"", ""score"": 8.4 }"
"{ ""name"": ""Airplane!"",""rating"": ""PG"", ""score"": 7.7 }"
"{ ""name"": ""Caddyshack"", ""rating"": ""R"", ""score"": 7.3 }"
"{ ""name"": ""Friday the 13th"", ""rating"": ""R"", ""score"": 6.4}"
"{ ""name"": ""The Blues Brothers"", ""rating"": ""R"", ""score"": 7.9}"
"{ ""name"": ""Raging Bull"", ""rating"": ""R"", ""score"": 8.2 }"


In [0]:
# Parse the JSON payload with the declared schema and flatten the parsed struct into columns.
# The struct is aliased explicitly instead of renaming Spark's auto-generated column name
# ("from_json(CAST(body AS STRING))"): that generated name differs between Spark versions, and
# withColumnRenamed silently does nothing on a mismatch, which would break the `data.*` select.
streaming_data = streaming_data.select(
    from_json(col("body").cast("string"), streaming_schema).alias("data")
).select(col("data.*"))

streaming_data.display()

name,rating,score
Star Wars: Episode V - The Empire Strikes Back,PG,8.2
Fame,R,6.6
Friday the 13th,R,6.1
Lagaan,PG,9.2
Andhadhun,PG,8.4
Airplane!,PG,7.7
Caddyshack,R,7.3
Friday the 13th,R,6.4
The Blues Brothers,R,7.9
Raging Bull,R,8.2


In [0]:
# Cross join (Cartesian product): no join column is given, so every row of the static data
# is paired with every record of the stream. Despite the variable name this is NOT an outer
# join; crossJoin makes the intent explicit.
outer_join = static_data.crossJoin(streaming_data)

outer_join.display()

name,year,director,writer,star,name.1,rating,score
The Shining,1980,Stanley Kubrick,Stephen King,Jack Nicholson,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
The Blue Lagoon,1980,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
Star Wars: Episode V - The Empire Strikes Back,1980,Irvin Kershner,Leigh Brackett,Mark Hamill,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
Airplane!,1980,Jim Abrahams,Jim Abrahams,Robert Hays,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
Caddyshack,1980,Harold Ramis,Brian Doyle-Murray,Chevy Chase,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
Friday the 13th,1980,Sean S. Cunningham,Victor Miller,Betsy Palmer,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
The Blues Brothers,1980,John Landis,Dan Aykroyd,John Belushi,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
Raging Bull,1980,Martin Scorsese,Jake LaMotta,Robert De Niro,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
Superman II,1980,Richard Lester,Jerry Siegel,Gene Hackman,Star Wars: Episode V - The Empire Strikes Back,PG,8.2
The Shining,1980,Stanley Kubrick,Stephen King,Jack Nicholson,Fame,R,6.6


In [0]:
# Right outer join with the static data on the left and the stream on the right:
# every streamed record is kept, and the static columns are null when no movie matches
right_outer_join = static_data.join(streaming_data, on="name", how="right")

right_outer_join.display()

name,year,director,writer,star,rating,score
Star Wars: Episode V - The Empire Strikes Back,1980.0,Irvin Kershner,Leigh Brackett,Mark Hamill,PG,8.2
Fame,,,,,R,6.6
Friday the 13th,1980.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,R,6.1
Lagaan,,,,,PG,9.2
Andhadhun,,,,,PG,8.4
Airplane!,1980.0,Jim Abrahams,Jim Abrahams,Robert Hays,PG,7.7
Caddyshack,1980.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,R,7.3
Friday the 13th,1980.0,Sean S. Cunningham,Victor Miller,Betsy Palmer,R,6.4
The Blues Brothers,1980.0,John Landis,Dan Aykroyd,John Belushi,R,7.9
Raging Bull,1980.0,Martin Scorsese,Jake LaMotta,Robert De Niro,R,8.2


In [0]:
# A left outer join with the stream on the right is NOT supported for stream-static joins:
# Spark would need the complete (unbounded) stream to decide which static rows never match.
# A watermark does not change this (watermarks matter for stream-stream outer joins).
# The expected error is caught and printed so that "Run All" can continue past this cell.
try:
    left_outer_join_rightstream = static_data.join(streaming_data, on=["name"], how="left_outer")
    left_outer_join_rightstream.display()
except AnalysisException as error:
    print("Unsupported stream-static join:", error)

In [0]:
# basicly, the same as above, just mirrorred. Returns error also
right_outer_join_leftstream = streaming_data.join(static_data, on=["name"], how="right_outer")

right_outer_join_leftstream.display()

In [0]:
# Left outer join with the stream on the left:
# every streamed record is kept, and the static columns are null when no movie matches
left_outer_join = streaming_data.join(static_data, on="name", how="left")

left_outer_join.display()

name,rating,score,year,director,writer,star
Star Wars: Episode V - The Empire Strikes Back,PG,8.2,1980.0,Irvin Kershner,Leigh Brackett,Mark Hamill
Fame,R,6.6,,,,
Friday the 13th,R,6.1,1980.0,Sean S. Cunningham,Victor Miller,Betsy Palmer
Lagaan,PG,9.2,,,,
Andhadhun,PG,8.4,,,,
Airplane!,PG,7.7,1980.0,Jim Abrahams,Jim Abrahams,Robert Hays
Caddyshack,R,7.3,1980.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase
Friday the 13th,R,6.4,1980.0,Sean S. Cunningham,Victor Miller,Betsy Palmer
The Blues Brothers,R,7.9,1980.0,John Landis,Dan Aykroyd,John Belushi
Raging Bull,R,8.2,1980.0,Martin Scorsese,Jake LaMotta,Robert De Niro


In [0]:
# Inner join on the movie name: only rows present on both sides are returned
inner_join = static_data.join(streaming_data, on="name", how="inner")

inner_join.display()

name,year,director,writer,star,rating,score
Star Wars: Episode V - The Empire Strikes Back,1980,Irvin Kershner,Leigh Brackett,Mark Hamill,PG,8.2
Friday the 13th,1980,Sean S. Cunningham,Victor Miller,Betsy Palmer,R,6.1
Airplane!,1980,Jim Abrahams,Jim Abrahams,Robert Hays,PG,7.7
Caddyshack,1980,Harold Ramis,Brian Doyle-Murray,Chevy Chase,R,7.3
Friday the 13th,1980,Sean S. Cunningham,Victor Miller,Betsy Palmer,R,6.4
The Blues Brothers,1980,John Landis,Dan Aykroyd,John Belushi,R,7.9
Raging Bull,1980,Martin Scorsese,Jake LaMotta,Robert De Niro,R,8.2
Star Wars: Episode V - The Empire Strikes Back,1980,Irvin Kershner,Leigh Brackett,Mark Hamill,PG,8.2
Friday the 13th,1980,Sean S. Cunningham,Victor Miller,Betsy Palmer,R,6.1
Star Wars: Episode V - The Empire Strikes Back,1980,Irvin Kershner,Leigh Brackett,Mark Hamill,PG,8.2


In [0]:
# Joins compose with other DataFrame operations: keep a few columns of the inner join
selected_join = (
    static_data
    .join(streaming_data, on="name", how="inner")
    .select("name", "director", "star", "score")
)

selected_join.display()

name,director,star,score
Star Wars: Episode V - The Empire Strikes Back,Irvin Kershner,Mark Hamill,8.2
Friday the 13th,Sean S. Cunningham,Betsy Palmer,6.1
Airplane!,Jim Abrahams,Robert Hays,7.7
Caddyshack,Harold Ramis,Chevy Chase,7.3
Friday the 13th,Sean S. Cunningham,Betsy Palmer,6.4
The Blues Brothers,John Landis,John Belushi,7.9
Raging Bull,Martin Scorsese,Robert De Niro,8.2
Star Wars: Episode V - The Empire Strikes Back,Irvin Kershner,Mark Hamill,8.2
Friday the 13th,Sean S. Cunningham,Betsy Palmer,6.1
Star Wars: Episode V - The Empire Strikes Back,Irvin Kershner,Mark Hamill,8.2


In [0]:
# Keep only the joined movies whose score is above 8
top_scorers = selected_join.select("name", "score").filter(col("score") > 8)

top_scorers.display()

name,score
Star Wars: Episode V - The Empire Strikes Back,8.2
Raging Bull,8.2
Star Wars: Episode V - The Empire Strikes Back,8.2
Star Wars: Episode V - The Empire Strikes Back,8.2


In [0]:
# Try an inner join on "star", a column that exists only in the static source (the streaming
# schema has name/rating/score). Spark cannot resolve the join column on the streaming side,
# so the join fails. The expected error is caught and printed so that "Run All" can continue.
try:
    inner_join_error = static_data.join(streaming_data, on=["star"], how="inner")
    inner_join_error.display()
except AnalysisException as error:
    print("Invalid join column:", error)