# Operations on Streaming DataFrames/Datasets Demo

### Demo

In [1]:
import os

import findspark

# Locate the Spark installation. A SPARK_HOME environment variable takes
# precedence so the notebook runs unchanged on other machines; the literal
# path below is only the fallback used when SPARK_HOME is unset or empty.
# TODO: if you do not set SPARK_HOME, change the fallback to reflect your path.
findspark.init(
    os.environ.get("SPARK_HOME") or '/home/matthew/spark-2.3.0-bin-hadoop2.7'
)

In [2]:
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, current_timestamp
from pyspark.sql.functions import split
from pyspark.sql.functions import window
from pyspark.sql.types import StructType, TimestampType

In [3]:
# Directory (relative to the working directory) that holds the incoming CSV
# files; the parquet output of the streaming query is written beneath it too.
staging_dir = "monitoring_data"

In [4]:
# Reuse the active SparkSession if one exists, otherwise create one named
# "UserInteractionAnalyzer".
spark = (
    SparkSession.builder
    .appName("UserInteractionAnalyzer")
    .getOrCreate()
)

In [5]:
# Explicit schema for the CSV records, one column per field in file order:
#   userA (integer), userB (integer), timestamp (timestamp), interaction (string)
userSchema = (
    StructType()
    .add("userA", "integer")
    .add("userB", "integer")
    .add("timestamp", TimestampType())
    .add("interaction", "string")
)

In [6]:
# Streaming DataFrame over the comma-separated CSV files matching
# <staging_dir>/*.csv, parsed with the explicit userSchema.
csv_glob = staging_dir+"/*.csv"
activity = (
    spark.readStream
    .option("sep", ",")
    .schema(userSchema)
    .csv(csv_glob)
)

In [7]:
# userB of every record whose interaction is "MT".
# The filter runs before the projection so that the `interaction` column is
# still part of the DataFrame when it is referenced; the resulting rows are the
# same as projecting first, without depending on the analyzer to re-add a
# column that was already dropped.
# NOTE: the variable name is historical (nothing is counted here); it is kept
# because the streaming-query cell refers to it.
wordCounts = (
    activity
    .where("interaction = \"MT\"")
    .select("userB")
)

In [None]:
# Forget queries that terminated earlier in this session; otherwise
# awaitAnyTermination() below would return immediately when this cell is re-run.
spark.streams.resetTerminated()

# Sink 1: every 10 seconds, write the filtered rows as parquet files under
# <staging_dir>/out, using "applicationHistory" as the checkpoint location.
query = (
    wordCounts.writeStream
    .trigger(processingTime='10 seconds')
    .format("parquet")
    .option("checkpointLocation", "applicationHistory")
    .option("path", staging_dir+"/out")
    .start()
)

# Sink 2: every 10 seconds, print the same rows to the console.
query2 = (
    wordCounts.writeStream
    .trigger(processingTime='10 seconds')
    .format("console")
    .start()
)

try:
    # Block until EITHER query stops or fails, so a failure of the console
    # query is surfaced instead of being ignored while waiting on the first.
    spark.streams.awaitAnyTermination()
finally:
    # Runs on normal termination, on a query failure and on a kernel interrupt:
    # stop whatever is still running so no stream keeps executing in the
    # background after this cell has finished.
    for running in (query, query2):
        if running.isActive:
            running.stop()