In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("Step1_4-Aggregate-Sum") \
    .config("spark.executor.memory", "512mb") \
    .config("spark.executor.cores","1") \
    .config("spark.cores.max", "1") \
    .getOrCreate()

21/10/06 09:01:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
spark.sparkContext.setLogLevel("ERROR")


In [4]:
! ls -lh /opt/spark-data/datasets/droplocation


total 4.0K
-rwxrwxrwx 1 root root 1.5K Sep 20 18:13 sample.csv


In [5]:
schema = StructType([StructField("lsoa_code", StringType(), True),\
                         StructField("borough", StringType(), True),\
                         StructField("major_category", StringType(), True),\
                         StructField("minor_category", StringType(), True),\
                         StructField("value", StringType(), True),\
                         StructField("year", StringType(), True),\
                         StructField("month", StringType(), True)])


In [6]:
fileStreamDF = spark.readStream\
                               .option("header", "true")\
                               .schema(schema)\
                               .option("maxFilesPerTrigger", 1)\
                               .csv("/opt/spark-data/datasets/droplocation")

In [7]:
# Use groupBy and agg functions to get total convictions per borough
# The new column created will be called sum(value) - rename to something meaningful
# Order by number of convictions in descending order
convictionsPerBorough = fileStreamDF.groupBy("major_category")\
                                      .agg(*[min(col("value")),max(col("value")),sum(col("value"))])\
                                      .withColumnRenamed("sum(value)", "convictions")\
                                      .withColumnRenamed("min(value)", "min-convictions")\
                                      .withColumnRenamed("max(value)", "max-convictions")\
                                      .orderBy("convictions", ascending=False)

In [None]:
query = convictionsPerBorough.writeStream\
                      .outputMode("complete")\
                      .format("console")\
                      .option("truncate", "false")\
                      .option("numRows", 30)\
                      .start()\
                      .awaitTermination()


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------------------------+---------------+---------------+-----------+
|major_category             |min-convictions|max-convictions|convictions|
+---------------------------+---------------+---------------+-----------+
|Theft and Handling         |0              |14             |16.0       |
|Violence Against the Person|0              |1              |3.0        |
|Drugs                      |1              |1              |1.0        |
|Other Notifiable Offences  |1              |1              |1.0        |
|Criminal Damage            |0              |1              |1.0        |
|Robbery                    |0              |0              |0.0        |
|Burglary                   |0              |0              |0.0        |
+---------------------------+---------------+---------------+-----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---------------------------+---------------+---------------+-----------+
|major_category             |min-convictions|max-convictions|convictions|
+---------------------------+---------------+---------------+-----------+
|Theft and Handling         |0              |67             |295.0      |
|Violence Against the Person|0              |5              |101.0      |
|Burglary                   |0              |6              |55.0       |
|Criminal Damage            |0              |3              |41.0       |
|Robbery                    |0              |5              |29.0       |
|Drugs                      |0              |8              |26.0       |
|Other Notifiable Offences  |0              |2              |6.0        |
|Fraud or Forgery           |0              |0              |0.0        |
|Sexual Offences            |0              |0              |0.0        |
+--------------

In [None]:
# Submit App :
# Submit codes/demo2.py 


In [None]:
spark.stop()