In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
import datetime

In [2]:
# Build (or reuse) the SparkSession against the standalone master.
# Resource settings: 500mb and 1 core per executor, at most 2 cores for the app.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Step1_7-Windows-Proccessing")
    .config("spark.executor.memory", "500mb")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "2")
    .getOrCreate()
)

21/10/06 09:19:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/06 09:19:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Raise the log threshold to ERROR so WARN-level messages no longer clutter the cell output.
spark.sparkContext.setLogLevel("ERROR")


In [4]:
# List (with human-readable sizes) the files currently present in the drop location directory.
! ls -lh /opt/spark-data/datasets/droplocation


total 1.4M
-rwxrwxrwx 1 root root 1.5K Sep 20 18:13 sample.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xad.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xae.csv
-rwxrwxrwx 1 root root  69K May 15  2018 xag.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xah.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xai.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xaj.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xak.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xam.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xao.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xap.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xaq.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xar.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xas.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xat.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xau.csv
-rwxrwxrwx 1 root root  67K May 15  2018 xav.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xax.csv
-rwxr-xr-x 1 root root  69K Oct  6 05:50 xay.csv
-rwxrwxrwx 1 root root  68K May 15 

In [5]:
# Explicit schema for the incoming CSV files: every column is read as a
# nullable string, in the order listed below.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "lsoa_code",
        "borough",
        "major_category",
        "minor_category",
        "value",
        "year",
        "month",
    )
])


In [6]:
# Streaming CSV source over the drop location directory.
# Files have a header row, are parsed with the explicit schema, and at most
# two files are picked up per trigger.
fileStreamDF = (
    spark.readStream
    .schema(schema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 2)
    .csv("/opt/spark-data/datasets/droplocation")
)

In [7]:
# Plain Python function that is later wrapped as a User Defined Function (UDF)
def add_timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    now = datetime.datetime.fromtimestamp(time.time())
    return now.strftime('%Y-%m-%d %H:%M:%S')

In [8]:
# Wrap add_timestamp as a Spark UDF that returns a String.
# The function reads the wall clock, so its result differs between calls.
# Spark treats UDFs as deterministic by default and may then eliminate or
# duplicate invocations during optimization; asNondeterministic() tells the
# optimizer not to make that assumption.
add_timestamp_udf = udf(add_timestamp, StringType()).asNondeterministic()

In [9]:
# Derive a new streaming DataFrame that carries an extra "timestamp" column.
# The column value is produced by invoking the timestamp UDF (it takes no input columns).
fileStreamWithTS = fileStreamDF.withColumn(
    colName="timestamp",
    col=add_timestamp_udf(),
)

In [10]:

# window(timeColumn, windowDuration, slideDuration=None, startTime=None)
#   timeColumn     - the time field used to assign rows to windows
#   windowDuration - the length of each window
#   slideDuration  - the gap between consecutive window starts (windows can
#                    overlap); must be <= windowDuration
# Here: 10-minute windows that start every 5 minutes, so each row falls into
# two overlapping windows.
# The number of convictions reported for a window will likely grow from batch
# to batch, because each new batch can contribute more rows whose timestamp
# falls inside that window.
windowedCounts = (
    fileStreamWithTS
    .groupBy(
        window(
            timeColumn=fileStreamWithTS["timestamp"],
            windowDuration="10 minutes",
            slideDuration="5 minutes",
        )
    )
    .agg({"value": "sum"})
    .withColumnRenamed("sum(value)", "convictions")
    .orderBy("convictions", ascending=False)
)


# Write the full (complete-mode) result table to the console on every trigger,
# without truncating the window column.
# Keep the StreamingQuery handle returned by start(): awaitTermination()
# returns None, so chaining it onto start() would leave `query` unusable
# (and unassigned if the cell is interrupted).
query = (
    windowedCounts.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

try:
    # Block this cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Always stop the stream so it does not keep running in the background
    # after an interrupt; any exception (e.g. KeyboardInterrupt) still propagates.
    query.stop()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|899.0      |
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|899.0      |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|1925.0     |
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|1925.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|2984.0     |
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|2984.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|3942.0     |
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|3942.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|4985.0     |
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|4985.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|5890.0     |
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|5890.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|6969.0     |
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|6969.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|7931.0     |
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|7931.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|8790.0     |
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|8790.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|9306.0     |
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|9306.0     |
+------------------------------------------+-----------+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------------+-----------+
|window                                    |convictions|
+------------------------------------------+-----------+
|{2021-10-06 09:20:00, 2021-10-06 09:30:00}|9837.0     |
|{2021-10-06 09:15:00, 2021-10-06 09:25:00}|9837.0     |
+------------------------------------------+-----------+



KeyboardInterrupt: 

In [None]:
# Submit App :
# Submit codes/demo2.py 


In [None]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()