In [0]:
import json

from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, from_json

In [0]:
# Connect to the Event Hub that carries the movies stream (source 1).
#
# The namespace connection string embeds a shared access key, so it is read from
# a Databricks secret scope instead of being pasted into the notebook.
# NOTE(review): the scope/key names below are placeholders - point them at the
# secret that stores the namespace connection string, and rotate the access key
# that was previously embedded in this cell.
primaryKey_movies = dbutils.secrets.get(scope="eventhub", key="namespace-connection-string")
entityPath_movies = "EntityPath=eventhub-movies"

# Full connection string = namespace connection string + the target event hub.
connectionString_movies = primaryKey_movies + ";" + entityPath_movies

# Position in the stream from which reading starts.
# NOTE(review): "-1" is understood to be the connector's start-of-stream marker - confirm.
startingEventPosition_movies = {
  "offset": "-1",
  "seqNo": -1,
  "enqueuedTime": None,
  "isInclusive": True
}

# Options handed to the "eventhubs" stream reader. The connection string is passed
# through the connector's encrypt() helper and the starting position is JSON-encoded.
ehConf_movies = {
  "eventhubs.connectionString": sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString_movies),
  "eventhubs.startingPosition": json.dumps(startingEventPosition_movies)
}

In [0]:
# Connect to the Event Hub that carries the ratings stream (source 2).
#
# The namespace connection string embeds a shared access key, so it is read from
# a Databricks secret scope instead of being pasted into the notebook.
# NOTE(review): the scope/key names below are placeholders - point them at the
# secret that stores the namespace connection string, and rotate the access key
# that was previously embedded in this cell.
primaryKey_ratings = dbutils.secrets.get(scope="eventhub", key="namespace-connection-string")
entityPath_ratings = "EntityPath=eventhub-ratings"

# Full connection string = namespace connection string + the target event hub.
connectionString_ratings = primaryKey_ratings + ";" + entityPath_ratings

# Position in the stream from which reading starts.
# NOTE(review): "-1" is understood to be the connector's start-of-stream marker - confirm.
startingEventPosition_ratings = {
  "offset": "-1",
  "seqNo": -1,
  "enqueuedTime": None,
  "isInclusive": True
}

# Options handed to the "eventhubs" stream reader. The connection string is passed
# through the connector's encrypt() helper and the starting position is JSON-encoded.
ehConf_ratings = {
  "eventhubs.connectionString": sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString_ratings),
  "eventhubs.startingPosition": json.dumps(startingEventPosition_ratings)
}

In [0]:
# Schema used to decode the JSON body of each movies event.
# Every field is declared nullable.
streaming_schema_movies = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("name", StringType()),
        ("year", IntegerType()),
        ("director", StringType()),
        ("writer", StringType()),
        ("star", StringType()),
    )
])

In [0]:
# Open a streaming DataFrame on the movies Event Hub, configured by ehConf_movies.
streaming_data_movies = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf_movies)
    .load()
)

In [0]:
# Decode the JSON payload of each movies event into one column per schema field.
#
# The parsed struct is aliased to "data" directly instead of renaming Spark's
# auto-generated column name: that name is an implementation detail, and
# withColumnRenamed() silently does nothing when the name does not match,
# which would make the following "data.*" selection fail.
#
# NOTE: this rebinds streaming_data_movies to a frame that no longer has a
# "body" column, so re-running this cell on its own fails - re-run the cell
# that reads the stream first.
streaming_data_movies = (
    streaming_data_movies
    .select(from_json(col("body").cast("string"), streaming_schema_movies).alias("data"))
    .select("data.*")
)

streaming_data_movies.display()

name,year,director,writer,star
The Long Riders,1980.0,Walter Hill,Bill Bryden,David Carradine
Any Which Way You Can,1980.0,Buddy Van Horn,Stanford Sherman,Clint Eastwood
The Gods Must Be Crazy,1980.0,Jamie Uys,Jamie Uys,N!xau
Popeye,1980.0,Robert Altman,Jules Feiffer,Robin Williams
Ordinary People,1980.0,Robert Redford,Judith Guest,Donald Sutherland
Amar Akbar Anthony,,,,
The Long Riders,,,,
The Gods Must Be Crazy,,,,
Ordinary People,,,,


In [0]:
# Schema used to decode the JSON body of each ratings event.
# Every field is declared nullable.
streaming_schema_ratings = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("name", StringType()),
        ("rating", StringType()),
        ("score", FloatType()),
    )
])

In [0]:
# Open a streaming DataFrame on the ratings Event Hub, configured by ehConf_ratings.
streaming_data_ratings = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf_ratings)
    .load()
)

In [0]:
# Decode the JSON payload of each ratings event into one column per schema field.
#
# The parsed struct is aliased to "data" directly instead of renaming Spark's
# auto-generated column name: that name is an implementation detail, and
# withColumnRenamed() silently does nothing when the name does not match,
# which would make the following "data.*" selection fail.
#
# NOTE: this rebinds streaming_data_ratings to a frame that no longer has a
# "body" column, so re-running this cell on its own fails - re-run the cell
# that reads the stream first.
streaming_data_ratings = (
    streaming_data_ratings
    .select(from_json(col("body").cast("string"), streaming_schema_ratings).alias("data"))
    .select("data.*")
)

streaming_data_ratings.display()

name,rating,score
Lagaan,PG,9.2
Andhadhun,PG,8.4
Airplane!,PG,7.7
Caddyshack,R,7.3
Friday the 13th,R,6.4
The Blues Brothers,R,7.9
Raging Bull,R,8.2
Star Wars: Episode V - The Empire Strikes Back,PG,8.2
Fame,R,6.6
Friday the 13th,R,6.1


In [0]:
# Stream-stream inner join: pair each movie event with the rating events
# that share the same "name".
inner_join = streaming_data_movies.join(
    streaming_data_ratings,
    on=["name"],
    how="inner",
)

inner_join.display()

name,year,director,writer,star,rating,score
Ordinary People,1980.0,Robert Redford,Judith Guest,Donald Sutherland,PG,8.1
Ordinary People,,,,,PG,8.1
The Long Riders,1980.0,Walter Hill,Bill Bryden,David Carradine,PG,7.1
The Long Riders,,,,,PG,7.1
The Gods Must Be Crazy,1980.0,Jamie Uys,Jamie Uys,N!xau,PG,8.6
The Gods Must Be Crazy,,,,,PG,8.6
Amar Akbar Anthony,,,,,PG,9.3


In [0]:
# Joins compose with other operations: join ratings to movies on "name",
# then keep only the rows whose rating is "PG".
join_and_filter = (
    streaming_data_ratings
    .join(streaming_data_movies, on=["name"])
    .filter(streaming_data_ratings["rating"] == "PG")
)

join_and_filter.display()

name,rating,score,year,director,writer,star
Ordinary People,PG,8.1,,,,
Ordinary People,PG,8.1,1980.0,Robert Redford,Judith Guest,Donald Sutherland
The Long Riders,PG,7.1,,,,
The Long Riders,PG,7.1,1980.0,Walter Hill,Bill Bryden,David Carradine
The Gods Must Be Crazy,PG,8.6,,,,
The Gods Must Be Crazy,PG,8.6,1980.0,Jamie Uys,Jamie Uys,N!xau
Amar Akbar Anthony,PG,9.3,,,,


In [0]:
# Deliberate failure demo: attempt a left outer join between the two streams.
# Neither stream defines a watermark here, so this is expected to be rejected
# when the query is displayed.
not_supported_join = streaming_data_movies.join(
    streaming_data_ratings,
    on=["name"],
    how="leftOuter",
)

not_supported_join.display()

NB: stream-stream outer joins (neither left nor right) do not work unless a watermark is specified on the streams being joined.