# Download Datasets

In [0]:
%sh
# Download the sample datasets into the current working directory.
# `set -e` aborts on the first failing command, and `--fail` makes curl exit
# non-zero on an HTTP error instead of saving the error page as the dataset.
set -e
base_url='https://raw.githubusercontent.com/masfworld/datahack_docker/master/zeppelin/data'
for name in el_quijote.txt frankenstein.txt characters.csv species.csv; do
  curl --fail -O "${base_url}/${name}"
done

In [0]:
# Recreate the streaming landing folder so the notebook starts from a clean state.
dbutils.fs.rm("/dataset/streaming", True)
dbutils.fs.mkdirs("/dataset/streaming")

# Copy the downloaded files from the driver's local disk to DBFS.
# Only el_quijote.txt goes into the streaming folder; the others are staged
# under /dataset.
driver_dir = "file:/databricks/driver"
dbutils.fs.cp(f"{driver_dir}/el_quijote.txt", "dbfs:/dataset/streaming/el_quijote.txt")
dbutils.fs.cp(f"{driver_dir}/frankenstein.txt", "dbfs:/dataset/frankenstein.txt")
dbutils.fs.cp(f"{driver_dir}/characters.csv", "dbfs:/dataset/characters.csv")
dbutils.fs.cp(f"{driver_dir}/species.csv", "dbfs:/dataset/species.csv")

# Structured Streaming

---



## Example 1

Read a streaming folder

In [0]:
from pyspark.sql.functions import *

# Stream the text files found under the streaming folder.
lines = (
    spark.readStream
    .format("text")
    .load("/dataset/streaming/")
)

# Split the `value` column on single spaces and emit one row per token.
words = lines.select(explode(split(col("value"), " ")).alias("word"))

# Running count per word, most frequent first.
groupedWords = (
    words
    .groupBy("word")
    .count()
    .sort(col("count").desc())
)

# Write the full result to the in-memory table `testquijote`.
# start() returns the StreamingQuery handle; keep it (instead of the writer)
# so the stream can be monitored or stopped later with query.stop().
query = (
    groupedWords.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("testquijote")
    .start()
)



In [0]:
# Render the streaming word counts in the notebook.
display(groupedWords)

In [0]:
# Peek at the in-memory table populated by the streaming query.
testquijote_preview = spark.sql("select * from testquijote limit 10")
testquijote_preview.show()

**Now we copy a new file into the streaming directory to check that the in-memory table captures the changes**

In [0]:
# List the current contents of the streaming folder.
dbutils.fs.ls("/dataset/streaming")

In [0]:
# Copy a second book into the streaming folder, then list the folder to
# confirm the file arrived.
dbutils.fs.cp("/dataset/frankenstein.txt", "/dataset/streaming/")
dbutils.fs.ls("/dataset/streaming")

In [0]:
%sql
-- Query the in-memory table again after the new file was added.
SELECT *
FROM testquijote
LIMIT 100

## Exercise 1

Using the Example 1 code, filter out all words shorter than 4 characters.

---




## Example 2

Reading a CSV file, applying a schema

In [0]:
from pyspark.sql.types import *

# Schema for the characters CSV: every column is a nullable string.
character_columns = [
    "name",
    "height",
    "hair_color",
    "skin_color",
    "eye_color",
    "birth_year",
    "gender",
    "homeworld",
    "species",
]
schema = StructType(
    [StructField(column, StringType(), True) for column in character_columns]
)

In [0]:
# Stream the CSV files matching the glob, applying the explicit schema.
characters_stream = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .load("/dataset/charac*.csv")
)

# Add a `current_timestamp` column to every row.
lines = characters_stream.withColumn("current_timestamp", current_timestamp())

In [0]:
# Start the query and write its rows to the in-memory table `charac`.
# start() returns the StreamingQuery handle; keep it (instead of the writer)
# so the stream can be monitored or stopped later with query.stop().
query = (
    lines.writeStream
    .outputMode("update")
    .format("memory")
    .queryName("charac")
    .start()
)


In [0]:
# Peek at the in-memory table populated by the `charac` query.
charac_preview = spark.sql("select * from charac limit 10")
charac_preview.display()

In [0]:
# Render the streaming characters DataFrame in the notebook.
display(lines)

# Windowing

## Example 3

Read the files `el_quijote.txt` and `frankenstein.txt` as a stream, applying a 5-second fixed window.

In [0]:
# Start from an empty /dataset/books folder: remove it, then recreate it.
dbutils.fs.rm("/dataset/books", True)
dbutils.fs.mkdirs("/dataset/books")

In [0]:
# Seed the books folder with the first book, then list the folder.
dbutils.fs.cp("/dataset/streaming/el_quijote.txt", "/dataset/books/")
dbutils.fs.ls("/dataset/books/")

In [0]:
from pyspark.sql.functions import *
# Stream the books folder as text and emit one row per space-separated token
# of the `value` column. The timestamp is not a column of the input, so it is
# added explicitly with withColumn() rather than referenced inside select().
words = (
    spark.readStream
    .format("text")
    .load("/dataset/books/")
    .select(explode(split(col("value"), " ")).alias("word"))
    .withColumn("current_timestamp", current_timestamp())
)

In [0]:
from pyspark.sql.functions import *
# Count words per 5-second window of `current_timestamp`, highest counts first.
windowedCounts = (
    words
    .groupBy(
        window(col("current_timestamp"), "5 seconds"),
        col("word"),
    )
    .count()
    .sort(col("count").desc())
)

# Write the full result to the in-memory table `test_windowing_books`.
# start() returns the StreamingQuery handle; keep it (instead of the writer)
# so the stream can be monitored or stopped later with query.stop().
query = (
    windowedCounts.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("test_windowing_books")
    .start()
)

In [0]:
# Peek at the windowed counts stored in the in-memory table.
windowing_preview = spark.sql("select * from test_windowing_books limit 10")
windowing_preview.display()

In [0]:
# Number of result rows per window; show up to 20 rows without truncating
# the window column.
rows_per_window = spark.sql("select window, count(*) from test_windowing_books group by window")
rows_per_window.show(n=20, truncate=False)

In [0]:
# Add the second book to the books folder, then list the folder.
dbutils.fs.cp("/dataset/streaming/frankenstein.txt", "/dataset/books/")
dbutils.fs.ls("/dataset/books/")

In [0]:
# Re-run the per-window row count after the second book was added.
rows_per_window_after = spark.sql("select window, count(*) from test_windowing_books group by window")
rows_per_window_after.display()

## Exercise 2

Get the number of different species classifications from `species.csv`
- Split the file in multiple parts, create a new folder, inserting each part in the folder one by one
- Group the result in a 2-second fixed window

---



In [0]:
# Remove any previous /dataset/species_splitted folder.
dbutils.fs.rm("/dataset/species_splitted", True)

In [0]:
from pyspark.sql.types import *

# Schema for the species CSV: every column is a nullable string.
species_columns = [
    "name",
    "classification",
    "designation",
    "skin_colors",
    "hair_colors",
    "eye_colors",
    "average_lifespan",
    "language",
    "homeworld",
]
schema_species = StructType(
    [StructField(column, StringType(), True) for column in species_columns]
)

# Stream - Stream Join

## Exercise 3

Update the following code to build an inner join between `df_left_modified` and `df_right_modified` Dataframes.
- Set a 2-hour watermark on both streams

In [0]:
# Left side of the join: a `rate` source configured with rowsPerSecond=3.
left_reader = spark.readStream.format("rate").option("rowsPerSecond", 3)
df_left = left_reader.load()

In [0]:
# Print the schema of the left rate stream.
df_left.printSchema()

In [0]:
# Right side of the join: a second `rate` source with the same configuration.
right_reader = spark.readStream.format("rate").option("rowsPerSecond", 3)
df_right = right_reader.load()

In [0]:
from pyspark.sql.functions import *

# Add a random key and a random value column to the left stream; each is
# computed independently as ceil(rand() * 10).
left_keyed = df_left.withColumn("left_key", ceil(rand() * 10))
df_left_modified = left_keyed.withColumn("left_value", ceil(rand() * 10))

In [0]:
# Add a random key and a random value column to the right stream; each is
# computed independently as ceil(rand() * 10).
right_keyed = df_right.withColumn("right_key", ceil(rand() * 10))
df_right_modified = right_keyed.withColumn("right_value", ceil(rand() * 10))