In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Connect to the standalone Spark master and obtain a session for this demo.
# Resources are deliberately capped: one core in total, one core and
# 512 MB of memory per executor.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Step1_3-CSV-Complete-Mode")
    .config("spark.executor.memory", "512mb")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .getOrCreate()
)

21/10/06 08:13:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Raise the log threshold from the default WARN to ERROR so that warnings
# do not clutter the streaming output printed by later cells.
spark.sparkContext.setLogLevel("ERROR")


In [4]:
# List the CSV files currently present in the drop location directory
! ls -lh /opt/spark-data/datasets/droplocation


total 2.8M
-rwxrwxrwx 1 root root 1.5K Sep 20 18:13 sample.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xad.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xae.csv
-rwxrwxrwx 1 root root  69K May 15  2018 xag.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xah.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xai.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xaj.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xak.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xam.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xao.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xap.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xaq.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xar.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xas.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xat.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xau.csv
-rwxrwxrwx 1 root root  67K May 15  2018 xav.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xax.csv
-rwxr-xr-x 1 root root  69K Oct  6 05:50 xay.csv
-rwxrwxrwx 1 root root  68K May 15 

In [5]:
# Explicit schema for the incoming CSV files: every column is read as a
# nullable string, in the order the columns appear in the files.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "lsoa_code",
        "borough",
        "major_category",
        "minor_category",
        "value",
        "year",
        "month",
    )
])


In [6]:
# Streaming reader over the drop location: CSV files found in the directory
# are parsed with the explicit schema, treating the first line of each file
# as a header row.
fileStreamDF = (
    spark.readStream
    .schema(schema)
    .option("header", "true")
    .csv("/opt/spark-data/datasets/droplocation")
)

In [7]:
# Report whether the DataFrame is backed by a streaming source
print(" ", "Is the stream ready?", fileStreamDF.isStreaming, sep="\n")


 
Is the stream ready?
True


In [8]:
# Show the column names and types of the streaming DataFrame
print(" ", "Schema of the input stream: ", sep="\n")
fileStreamDF.printSchema()


 
Schema of the input stream: 
root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [9]:
# Aggregate the input stream into one row per borough.
# A streaming DataFrame can only be sorted after an aggregation (and only in complete output mode).
# We group by the borough and count the number of records (NOT number of convictions).
# groupBy().count() is that aggregation, so the result can then be sorted with orderBy.
recordsPerBorough = (
    fileStreamDF
    .groupBy("borough")              # one group per borough
    .count()                         # number of input records in each group
    .sort("count", ascending=False)  # largest groups first
)


In [10]:
# We run in complete mode: after every trigger the ENTIRE updated result
# table (every borough with its running count) is written to the sink,
# not only the rows that are new or changed.
# The output is written to the console.
# We set truncate to false. If true, long cell values are truncated to 20 chars.
# Explicitly state the number of rows to display. Default is 20.
query = (
    recordsPerBorough.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .option("numRows", 30)
    .start()
)

# awaitTermination() returns None, so it must not be chained into the
# assignment above; otherwise `query` would not hold the StreamingQuery.
# Block here until the query ends; if the cell is interrupted (or the query
# fails), make sure the streaming query is stopped instead of left running.
try:
    query.awaitTermination()
finally:
    query.stop()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----------------------+-----+
|borough               |count|
+----------------------+-----+
|Croydon               |1829 |
|Barnet                |1728 |
|Ealing                |1678 |
|Enfield               |1596 |
|Lambeth               |1584 |
|Brent                 |1539 |
|Bromley               |1536 |
|Southwark             |1514 |
|Wandsworth            |1507 |
|Lewisham              |1445 |
|Newham                |1389 |
|Redbridge             |1317 |
|Waltham Forest        |1301 |
|Greenwich             |1294 |
|Hackney               |1292 |
|Hillingdon            |1265 |
|Havering              |1230 |
|Haringey              |1222 |
|Tower Hamlets         |1216 |
|Camden                |1200 |
|Bexley                |1152 |
|Westminster           |1143 |
|Harrow                |1138 |
|Hounslow              |1133 |
|Merton                |1107 |
|Islington             |1027 |
|Bar

                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----------------------+-----+
|borough               |count|
+----------------------+-----+
|Croydon               |1878 |
|Barnet                |1771 |
|Ealing                |1713 |
|Enfield               |1629 |
|Lambeth               |1625 |
|Bromley               |1576 |
|Brent                 |1570 |
|Southwark             |1551 |
|Wandsworth            |1550 |
|Lewisham              |1484 |
|Newham                |1429 |
|Redbridge             |1349 |
|Waltham Forest        |1334 |
|Greenwich             |1325 |
|Hackney               |1314 |
|Hillingdon            |1292 |
|Havering              |1270 |
|Haringey              |1249 |
|Tower Hamlets         |1245 |
|Camden                |1227 |
|Westminster           |1179 |
|Bexley                |1174 |
|Harrow                |1166 |
|Hounslow              |1155 |
|Merton                |1136 |
|Islington             |1054 |
|Ham

KeyboardInterrupt: 

In [None]:
# Submit App :
# Submit codes/demo2.py 


In [11]:
# Stop the Spark session created at the top of the notebook
spark.stop()