Cuarto ejercicio de Spark Streaming usando el API DStream.
WordCount con ventana deslizante usando CountByValueAndWindow

In [1]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [2]:
# Function to create and setup a new StreamingContext
def functionToCreateContext():
    # Create a local StreamingContext with two working thread and batch interval of 5 seconds
    sc = SparkContext("local[2]", "WindowedNetworkWordCount2")
    ssc = StreamingContext(sc, 5)
    
    # Mandatory set a checkpoint dir
    # http://spark.apache.org/docs/latest/streaming-programming-guide.html#checkpointing
    # Crear carpeta /checkpointDirectory2 dentro del directorio notebooks-spark
    ssc.checkpoint("./checkpointDirectory2")  # set checkpoint directory
    return ssc

In [3]:
# Get StreamingContext from checkpoint data or create a new one
ssc = StreamingContext.getOrCreate(checkpointPath = "./checkpointDirectory2", setupFunc = functionToCreateContext)

In [4]:
# Create a DStream that will connect to hostname:port, like localhost:9999
lines =ssc.socketTextStream("localhost", 9999)

In [5]:
# Split each line into words
words =lines.flatMap(lambda line:line.split(" "))

In [6]:
# Count each word in each batch
windowedWordCounts = words.countByValueAndWindow(30, 10)

In [7]:
# Print the first ten elements of each RDD generated in this DStream to the console
windowedWordCounts.pprint()

ssc.start() # Start the computation
ssc.awaitTermination() # Wait for the computation to terminate

-------------------------------------------
Time: 2018-12-12 01:05:50
-------------------------------------------

-------------------------------------------
Time: 2018-12-12 01:06:00
-------------------------------------------

-------------------------------------------
Time: 2018-12-12 01:06:10
-------------------------------------------

-------------------------------------------
Time: 2018-12-12 01:06:20
-------------------------------------------
('Hola1', 1)
('Hola3', 1)
('Hola4', 1)
('Hola2', 1)

-------------------------------------------
Time: 2018-12-12 01:06:30
-------------------------------------------
('Hola1', 1)
('Hola3', 1)
('Hola4', 1)
('Hola6', 1)
('Hola7', 1)
('Hola9', 1)
('Hola10', 1)
('Hola2', 1)
('Hola5', 1)
('Hola8', 1)

-------------------------------------------
Time: 2018-12-12 01:06:40
-------------------------------------------
('Hola1', 1)
('Hola3', 1)
('Hola4', 1)
('Hola6', 1)
('Hola7', 1)
('Hola9', 1)
('Hola10', 4)
('Hola2', 1)
('Hola5', 1)
('Hola8', 

KeyboardInterrupt: 