In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *



In [2]:
# Attach to the standalone master and request 512 MB / one core per executor,
# with at most one core in total for this application.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Step1_2-CSV-Append-Mode")
    .config("spark.executor.memory", "512mb")
    .config("spark.executor.cores", "1")
    .config("spark.cores.max", "1")
    .getOrCreate()
)

21/10/06 07:56:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Set the Spark log level to ERROR so lower-severity messages stay out of the cell output.
spark.sparkContext.setLogLevel("ERROR")


In [4]:
# IPython shell escape: list the files currently in the drop-location directory.
! ls -lh /opt/spark-data/datasets/droplocation


total 72K
-rwxrwxrwx 1 root root 1.5K Sep 20 18:13 sample.csv
-rwxrwxrwx 1 root root  68K May 15  2018 xad.csv


In [5]:
# Explicit schema for the incoming CSV files: seven columns, all declared as
# nullable strings, in file order.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "lsoa_code",
        "borough",
        "major_category",
        "minor_category",
        "value",
        "year",
        "month",
    )
])


In [6]:
# Streaming CSV source over the drop-location directory. Files are parsed with
# the explicit schema, treated as having no header row, and picked up one file
# per trigger.
fileStreamDF = (
    spark.readStream
    .option("header", "false")
    .schema(schema)
    .option("maxFilesPerTrigger", 1)
    .csv("/opt/spark-data/datasets/droplocation/")
)

In [7]:
# Confirm that the DataFrame is backed by a streaming source.
print(" ", "Is the stream ready?", fileStreamDF.isStreaming, sep="\n")


 
Is the stream ready?
True


In [8]:
# Show the schema attached to the input stream.
print(" ", "Schema of the input stream: ", sep="\n")
fileStreamDF.printSchema()


 
Schema of the input stream: 
root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [9]:
# Create a trimmed version of the input DataFrame with only the columns we need.
# Sorting a streaming DataFrame is only supported after an aggregation, so no sorting here.
# Project the four reporting columns; `value` is exposed as `convictions`.
trimmedDF = fileStreamDF.select(
    fileStreamDF["borough"],
    fileStreamDF["year"],
    fileStreamDF["month"],
    fileStreamDF["value"].alias("convictions"),
)


In [10]:
# Start a console sink in append mode: every micro-batch prints only the rows
# that arrived since the previous one (up to 30 rows, untruncated).
#
# The StreamingQuery handle is kept in `query`. Chaining .awaitTermination()
# onto the assignment would bind `query` to that call's return value (None),
# leaving no handle to stop or inspect the stream.
query = (
    trimmedDF.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")
    .option("numRows", 30)
    .start()
)

try:
    # Blocks this cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Interrupting the cell (KeyboardInterrupt) only unblocks the wait; stop
    # the query explicitly so it does not keep running in the background.
    query.stop()


[Stage 0:>                                                          (0 + 1) / 1]

-------------------------------------------
Batch: 0
-------------------------------------------


                                                                                

+----------------------+----+-----+-----------+
|borough               |year|month|convictions|
+----------------------+----+-----+-----------+
|Hounslow              |2014|4    |2          |
|Lambeth               |2009|8    |6          |
|Lewisham              |2011|7    |1          |
|Waltham Forest        |2014|12   |0          |
|Brent                 |2008|4    |0          |
|Hammersmith and Fulham|2012|1    |0          |
|Lewisham              |2009|9    |0          |
|Tower Hamlets         |2012|4    |0          |
|Southwark             |2009|11   |3          |
|Wandsworth            |2009|5    |0          |
|Lewisham              |2015|8    |1          |
|Barnet                |2011|2    |2          |
|Wandsworth            |2015|8    |3          |
|Croydon               |2014|7    |0          |
|Sutton                |2009|11   |0          |
|Hillingdon            |2011|2    |0          |
|Bexley                |2011|1    |0          |
|Lambeth               |2013|3    |0    

KeyboardInterrupt: 

In [None]:
# Submit App :
# go to pyspark container
# cd /opt/spark-apps/
# unset PYSPARK_DRIVER_PYTHON
# spark-submit --master spark://spark-master:7077 csv-append-mode.py
# export PYSPARK_DRIVER_PYTHON=python

In [11]:
# Stop the SparkSession now that the notebook is finished with it.
spark.stop()

### Submit the sample Spark app from a Bash shell in the PySpark container

- Open a Bash shell in the `pyspark` container:
```bash
docker exec -it pyspark bash
```
- Change to the apps directory: `cd /opt/spark-apps/`
- Run these commands:
```bash
unset PYSPARK_DRIVER_PYTHON
spark-submit --master spark://spark-master:7077 csv-append-mode.py
export PYSPARK_DRIVER_PYTHON=python
```