In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Build (or reuse) a SparkSession against the standalone master.
# Resources are deliberately tiny: 500mb and one core per executor, and at
# most one core in total for this application (spark.cores.max).
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Step1_5-SQL-Query")
    .config("spark.executor.memory", "500mb")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .getOrCreate()
)

21/10/06 09:09:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Raise the log threshold to ERROR so that WARN-level messages like the ones
# printed during session start-up no longer clutter the cell outputs.
spark.sparkContext.setLogLevel("ERROR")


In [4]:
# List the files in the drop location with human-readable sizes; this is the
# same directory the streaming reader below uses as its source.
! ls -lh /opt/spark-data/datasets/droplocation


total 1.4M
-rwxrwxrwx 1 root root 1.5K Sep 20 18:13 sample.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xad.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xae.csv
-rwxrwxrwx 1 root root  69K May 15  2018 xag.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xah.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xai.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xaj.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xak.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xam.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xao.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xap.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xaq.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xar.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xas.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xat.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xau.csv
-rwxrwxrwx 1 root root  67K May 15  2018 xav.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xax.csv
-rwxr-xr-x 1 root root  69K Oct  6 05:50 xay.csv
-rwxrwxrwx 1 root root  68K May 15 

In [5]:
# Explicit schema for the incoming CSV files: every column is read as a
# nullable string, in the order the columns are listed here.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "lsoa_code",
        "borough",
        "major_category",
        "minor_category",
        "value",
        "year",
        "month",
    )
])


In [6]:
# Streaming CSV source over the drop location. The first line of each file is
# treated as a header, and at most one new file is consumed per trigger.
fileStreamDF = (
    spark.readStream
    .options(header="true", maxFilesPerTrigger=1)
    .schema(schema)
    .csv("/opt/spark-data/datasets/droplocation")
)

In [7]:
# Register the streaming DataFrame as a temporary view named "LondonCrimeData"
# so it can be referenced by name from spark.sql() queries. If a view with
# that name already exists in this session it is replaced.
fileStreamDF.createOrReplaceTempView("LondonCrimeData")

In [8]:
# Keep only the columns needed for the aggregation, restricted to 2016.
# `year` is declared as a string in the schema, hence the quoted literal.
# A triple-quoted string is used instead of backslash continuations inside the
# literal: those embed the source indentation in the SQL text and break if
# anything (even a space) follows a backslash.
categoryDF = spark.sql(
    """
    SELECT major_category, value
    FROM LondonCrimeData
    WHERE year = '2016'
    """
)

In [9]:
# Total the `value` column for each major_category.
# agg() names its result column "sum(value)", so rename it to "convictions",
# then sort so the categories with the most convictions come first.
convictionsPerCategory = (
    categoryDF
    .groupBy("major_category")
    .agg({"value": "sum"})
    .withColumnRenamed("sum(value)", "convictions")
    .sort("convictions", ascending=False)
)


In [None]:
# Stream the aggregated result to the console.
# "complete" output mode re-emits the whole result table on every trigger.
# Keep the StreamingQuery handle returned by start(): chaining
# .awaitTermination() onto it would bind `query` to None, leaving no way to
# inspect or stop the running query.
query = (
    convictionsPerCategory.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .option("numRows", 30)
    .start()
)

# Block this cell while the stream runs. When the wait ends for any reason
# (including a kernel interrupt), stop the query so it does not keep running
# in the background.
try:
    query.awaitTermination()
finally:
    query.stop()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------------------------+-----------+
|major_category             |convictions|
+---------------------------+-----------+
|Violence Against the Person|1.0        |
|Theft and Handling         |0.0        |
+---------------------------+-----------+



In [None]:
# Submitting as a standalone application:
# submit codes/demo2.py (e.g. with spark-submit)


In [None]:
# Shut down the SparkSession (and its underlying SparkContext) now that the
# notebook is finished with it.
spark.stop()