In [1]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# *SPARK STREAMING*

# 1 aplikacja

Spark Streaming potrzebuje minium 2 rdzenie.

StreamingContext (tworzony na podstawie SparkContextu) reprezentuje połączenie z klastrem i służy do tworzenia DStreamów

In [2]:
sc = SparkContext("local[2]", "NetworkWC")
ssc = StreamingContext(sc,1)

In [3]:
lines = ssc.socketTextStream("localhost", 9999)
words = lines.flatMap(lambda line: line.split(" "))

In [4]:
wordcounts = words.map(lambda word: (word,1)).reduceByKey(lambda x,y: x+y)

In [5]:
wordcounts.pprint()

Obliczenia zaczynają się dopiero od wywołania `start`

In [6]:
ssc.start()
ssc.awaitTermination(60)
ssc.stop(True,True)

-------------------------------------------
Time: 2019-01-25 17:18:02
-------------------------------------------
('MOBY', 1)
('DICK;', 1)

-------------------------------------------
Time: 2019-01-25 17:18:03
-------------------------------------------
('', 1)

-------------------------------------------
Time: 2019-01-25 17:18:04
-------------------------------------------
('', 1)

-------------------------------------------
Time: 2019-01-25 17:18:05
-------------------------------------------
('WHALE.', 1)
('or,', 1)
('THE', 1)

-------------------------------------------
Time: 2019-01-25 17:18:06
-------------------------------------------
('', 1)

-------------------------------------------
Time: 2019-01-25 17:18:07
-------------------------------------------
('', 1)

-------------------------------------------
Time: 2019-01-25 17:18:08
-------------------------------------------
('1.', 1)
('CHAPTER', 1)
('Loomings.', 1)

-------------------------------------------
Time: 2019-01-25

-------------------------------------------
Time: 2019-01-25 17:18:41
-------------------------------------------
('are', 1)
('of', 1)
('week', 1)
('days', 1)
('pent', 1)
('in', 1)
('lath', 1)
('plaster—tied', 1)
('all', 1)
('landsmen;', 1)
...

-------------------------------------------
Time: 2019-01-25 17:18:42
-------------------------------------------
('counters,', 1)
('nailed', 1)
('benches,', 1)
('clinched', 1)
('is', 1)
('to', 2)
('desks.', 1)
('How', 1)
('then', 1)
('this?', 1)
...

-------------------------------------------
Time: 2019-01-25 17:18:43
-------------------------------------------
('green', 1)
('fields', 1)
('What', 1)
('do', 1)
('the', 1)
('gone?', 1)
('they', 1)
('here?', 1)

-------------------------------------------
Time: 2019-01-25 17:18:44
-------------------------------------------
('', 1)

-------------------------------------------
Time: 2019-01-25 17:18:45
-------------------------------------------
('But', 1)
('more', 1)
('crowds,', 1)
('pacing', 1)


> **TODO**: Popraw powyższą aplikację tak aby printowane były oczyszczone słowa

In [7]:
sc = SparkContext("local[2]", "NetworkWC")
ssc = StreamingContext(sc,1)

In [8]:
lines = ssc.socketTextStream("localhost", 9999)
words = lines.flatMap(lambda line: line.split(" ")).flatMap(lambda line: line.split("—"))\
.filter(lambda x: x != "" and not x.isdigit())

In [9]:
wordcounts = words.map(lambda word: (word.lower().strip("?!,.-;:"),1)).reduceByKey(lambda x,y: x+y)

In [10]:
wordcounts.pprint()

In [11]:
ssc.start()
ssc.awaitTermination(60)
ssc.stop(True,True)

-------------------------------------------
Time: 2019-01-25 17:19:22
-------------------------------------------
('moby', 1)
('dick', 1)

-------------------------------------------
Time: 2019-01-25 17:19:23
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:19:24
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:19:25
-------------------------------------------
('or', 1)
('the', 1)
('whale', 1)

-------------------------------------------
Time: 2019-01-25 17:19:26
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:19:27
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:19:28
-------------------------------------------
('chapter', 1)
('1', 1)
('loomings', 1)

-------------------------------------------
Time: 2019-01-25 17:19:29
---------------------------

-------------------------------------------
Time: 2019-01-25 17:20:01
-------------------------------------------
('are', 1)
('landsmen', 1)
('of', 1)
('week', 1)
('days', 1)
('pent', 1)
('in', 1)
('lath', 1)
('plaster', 1)
('tied', 1)
...

-------------------------------------------
Time: 2019-01-25 17:20:02
-------------------------------------------
('counters', 1)
('nailed', 1)
('benches', 1)
('clinched', 1)
('desks', 1)
('is', 1)
('this', 1)
('are', 1)
('to', 2)
('how', 1)
...

-------------------------------------------
Time: 2019-01-25 17:20:03
-------------------------------------------
('green', 1)
('fields', 1)
('do', 1)
('the', 1)
('gone', 1)
('what', 1)
('they', 1)
('here', 1)

-------------------------------------------
Time: 2019-01-25 17:20:04
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:20:05
-------------------------------------------
('but', 1)
('look', 1)
('more', 1)
('pacing', 1)
('water', 1)
('here', 1

# 2 aplikacja

Aby Spark Streaming mógł łączyć dane z wielu batchy (stateful transformations) konieczne jest wskazanie lokalizacji gdzie zapisywane będą checkpointy.

In [12]:
sc = SparkContext("local[2]", "NetworkWC")
ssc = StreamingContext(sc, 1)
ssc.checkpoint("tmp")

In [13]:
lines = ssc.socketTextStream("localhost", 9999)

In [14]:
def updateFunc(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)

In [15]:
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))

In [16]:
runningCounts = pairs.updateStateByKey(updateFunc)
runningCounts.pprint()

In [17]:
ssc.start()
ssc.awaitTermination(60)
ssc.stop(True,True)

-------------------------------------------
Time: 2019-01-25 17:21:18
-------------------------------------------
('MOBY', 1)
('DICK;', 1)

-------------------------------------------
Time: 2019-01-25 17:21:19
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 1)

-------------------------------------------
Time: 2019-01-25 17:21:20
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 2)

-------------------------------------------
Time: 2019-01-25 17:21:21
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 2)
('WHALE.', 1)
('or,', 1)
('THE', 1)

-------------------------------------------
Time: 2019-01-25 17:21:22
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 3)
('WHALE.', 1)
('or,', 1)
('THE', 1)

-------------------------------------------
Time: 2019-01-25 17:21:23
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 4)
('WHALE.', 1)
('or,', 1)
('THE', 1)

------------

-------------------------------------------
Time: 2019-01-25 17:21:55
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 7)
('WHALE.', 1)
('1.', 1)
('Call', 1)
('years', 1)
('mind', 1)
('long', 1)
('no', 1)
...

-------------------------------------------
Time: 2019-01-25 17:21:56
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 7)
('WHALE.', 1)
('1.', 1)
('Call', 1)
('years', 1)
('mind', 1)
('long', 1)
('no', 1)
...

-------------------------------------------
Time: 2019-01-25 17:21:57
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 7)
('WHALE.', 1)
('1.', 1)
('Call', 1)
('years', 1)
('mind', 1)
('long', 1)
('no', 1)
...

-------------------------------------------
Time: 2019-01-25 17:21:58
-------------------------------------------
('MOBY', 1)
('DICK;', 1)
('', 7)
('WHALE.', 1)
('1.', 1)
('Call', 1)
('years', 1)
('mind', 1)
('long', 1)
('no', 1)
...

-------------------------------------------
Time: 2019-01-25

> **TODO**: Popraw powyższą aplikację tak aby printowane były oczyszczone słowa, spraw aby `początkowy stan` zawierał wpisy: ('moby', 10) oraz ('dick', 10) (opcja w updateStateByKey)

In [18]:
sc = SparkContext("local[2]", "NetworkWC")
ssc = StreamingContext(sc, 1)
ssc.checkpoint("tmp")

In [19]:
lines = ssc.socketTextStream("localhost", 9999)
start = sc.parallelize([('moby',10),('dick',10)])

In [20]:
def updateFunc(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)

In [21]:
words = lines.flatMap(lambda line: line.split(" ")).flatMap(lambda line: line.split("—"))\
.filter(lambda x: x != "" and not x.isdigit())
pairs = words.map(lambda word: (word.lower().strip("?!,.-;:"), 1))

In [22]:
runningCounts = pairs.updateStateByKey(updateFunc, initialRDD = start)
runningCounts.pprint()

In [23]:
ssc.start()
ssc.awaitTermination(60)
ssc.stop(True,True)

-------------------------------------------
Time: 2019-01-25 17:30:31
-------------------------------------------
('moby', 11)
('dick', 11)

-------------------------------------------
Time: 2019-01-25 17:30:32
-------------------------------------------
('moby', 11)
('dick', 11)

-------------------------------------------
Time: 2019-01-25 17:30:33
-------------------------------------------
('moby', 11)
('dick', 11)

-------------------------------------------
Time: 2019-01-25 17:30:34
-------------------------------------------
('moby', 11)
('dick', 11)
('or', 1)
('the', 1)
('whale', 1)

-------------------------------------------
Time: 2019-01-25 17:30:35
-------------------------------------------
('moby', 11)
('dick', 11)
('or', 1)
('the', 1)
('whale', 1)

-------------------------------------------
Time: 2019-01-25 17:30:36
-------------------------------------------
('moby', 11)
('dick', 11)
('or', 1)
('the', 1)
('whale', 1)

-------------------------------------------
Time: 20

-------------------------------------------
Time: 2019-01-25 17:31:07
-------------------------------------------
('moby', 11)
('chapter', 1)
('1', 1)
('call', 1)
('years', 1)
('ago', 1)
('never', 1)
('mind', 1)
('long', 1)
('precisely', 1)
...

-------------------------------------------
Time: 2019-01-25 17:31:08
-------------------------------------------
('moby', 11)
('chapter', 1)
('1', 1)
('call', 1)
('years', 1)
('ago', 1)
('never', 1)
('mind', 1)
('long', 1)
('precisely', 1)
...

-------------------------------------------
Time: 2019-01-25 17:31:09
-------------------------------------------
('moby', 11)
('chapter', 1)
('1', 1)
('call', 1)
('years', 1)
('ago', 1)
('never', 1)
('mind', 1)
('long', 1)
('precisely', 1)
...

-------------------------------------------
Time: 2019-01-25 17:31:10
-------------------------------------------
('moby', 11)
('chapter', 1)
('1', 1)
('call', 1)
('years', 1)
('ago', 1)
('never', 1)
('mind', 1)
('long', 1)
('precisely', 1)
...

----------------

# 3 aplikacja

In [24]:
sc = SparkContext("local[2]", "NetworkWC")
ssc = StreamingContext(sc, 1)
ssc.checkpoint("tmp")

In [25]:
lines = ssc.socketTextStream("localhost", 9999)

In [26]:
words = lines.flatMap(lambda line: line.split(" ")).flatMap(lambda line: line.split("—"))\
.filter(lambda x: x != "" and not x.isdigit())

In [27]:
pairs = words.map(lambda word: (word.lower().strip("?!,.-;:"),1))
windowedWordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 30, 10)
windowedWordCounts.pprint()

In [28]:
ssc.start()
ssc.awaitTermination(60)
ssc.stop(True,True)

-------------------------------------------
Time: 2019-01-25 17:32:29
-------------------------------------------
('moby', 1)
('chapter', 1)
('1', 1)
('call', 1)
('years', 1)
('ago', 1)
('never', 1)
('mind', 1)
('long', 1)
('precisely', 1)
...

-------------------------------------------
Time: 2019-01-25 17:32:39
-------------------------------------------
('moby', 1)
('chapter', 1)
('1', 1)
('call', 1)
('years', 1)
('ago', 1)
('never', 1)
('mind', 1)
('long', 1)
('precisely', 1)
...

-------------------------------------------
Time: 2019-01-25 17:32:49
-------------------------------------------
('moby', 1)
('chapter', 1)
('1', 1)
('call', 1)
('years', 1)
('ago', 1)
('never', 1)
('mind', 1)
('long', 1)
('precisely', 1)
...

-------------------------------------------
Time: 2019-01-25 17:32:59
-------------------------------------------
('i', 9)
('thought', 1)
('would', 1)
('watery', 1)
('of', 12)
('world', 1)
('is', 7)
('way', 1)
('have', 1)
('regulating', 1)
...

--------------------

> **TODO**: Zmodyfikuj powyższą aplikację tak aby printowane były county **słów** dłuższych oraz krótszych od 4

In [29]:
sc = SparkContext("local[2]", "NetworkWC")
ssc = StreamingContext(sc, 1)
ssc.checkpoint("tmp")

In [30]:
lines = ssc.socketTextStream("localhost", 9999)

In [31]:
words = lines.flatMap(lambda line: line.split(" ")).flatMap(lambda line: line.split("—"))\
.filter(lambda x: x != "" and not x.isdigit())

In [32]:
pairs = words.map(lambda word: word.lower().strip("?!,.-;:"))\
.map(lambda word: (">4" if len(word) > 4 else "<=4", 1))
windowedWordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 30, 10)
windowedWordCounts.pprint()

In [33]:
ssc.start()
ssc.awaitTermination(60)
ssc.stop(True,True)

-------------------------------------------
Time: 2019-01-25 17:34:10
-------------------------------------------
('<=4', 20)
('>4', 14)

-------------------------------------------
Time: 2019-01-25 17:34:20
-------------------------------------------
('<=4', 101)
('>4', 61)

-------------------------------------------
Time: 2019-01-25 17:34:30
-------------------------------------------
('<=4', 164)
('>4', 107)

-------------------------------------------
Time: 2019-01-25 17:34:40
-------------------------------------------
('<=4', 205)
('>4', 134)

-------------------------------------------
Time: 2019-01-25 17:34:50
-------------------------------------------
('<=4', 189)
('>4', 127)

-------------------------------------------
Time: 2019-01-25 17:35:00
-------------------------------------------
('<=4', 207)
('>4', 115)



# 4 aplikacja

In [34]:
sc = SparkContext("local[2]", "NetworkWC")
ssc = StreamingContext(sc, 1)

`words.txt` to przetowrzony [Opinion Lexicon](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon)

In [35]:
sentyment = sc.textFile("words.txt").map(lambda x: x.split(",")).map(lambda x: (x[0],x[1]))

In [36]:
lines = ssc.socketTextStream("localhost", 9999)
words = lines.flatMap(lambda line: line.split(" ")).flatMap(lambda line: line.split("—"))\
.filter(lambda x: x != "" and not x.isdigit())

In [37]:
wordcounts = words.map(lambda word: (word.lower().strip("?!,.-;:"),1)).reduceByKey(lambda x,y: x+y)

In [38]:
WCS = wordcounts.transform(lambda rdd: sentyment.rightOuterJoin(rdd))
WCS.pprint()

In [39]:
ssc.start()
ssc.awaitTermination(120)
ssc.stop(True,True)

-------------------------------------------
Time: 2019-01-25 17:35:44
-------------------------------------------
('dick', ('negative', 1))
('moby', (None, 1))

-------------------------------------------
Time: 2019-01-25 17:35:45
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:35:46
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:35:47
-------------------------------------------
('the', (None, 1))
('whale', (None, 1))
('or', (None, 1))

-------------------------------------------
Time: 2019-01-25 17:35:48
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:35:49
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:35:50
-------------------------------------------
('chapter', (None, 1))
('1', (None, 1))
('loomings', (None, 1))

---------------------------

-------------------------------------------
Time: 2019-01-25 17:36:15
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:36:16
-------------------------------------------
('of', (None, 1))
('afternoon', (None, 1))
('go', (None, 1))
('the', (None, 1))
('city', (None, 1))
('a', (None, 1))
('sabbath', (None, 1))
('from', (None, 1))
('corlears', (None, 1))
('dreamy', (None, 1))
...

-------------------------------------------
Time: 2019-01-25 17:36:17
-------------------------------------------
('coenties', (None, 1))
('to', (None, 1))
('slip', (None, 1))
('and', (None, 1))
('from', (None, 1))
('by', (None, 1))
('hook', (None, 1))
('thence', (None, 1))
('whitehall', (None, 1))
('northward', (None, 1))
...

-------------------------------------------
Time: 2019-01-25 17:36:18
-------------------------------------------
('like', ('positive', 1))
('do', (None, 1))
('you', (None, 1))
('posted', (None, 1))
('sentinels', (None, 1))
('the'

-------------------------------------------
Time: 2019-01-25 17:36:43
-------------------------------------------
('ever', (None, 1))
('in', (None, 1))
('great', ('positive', 1))
('should', (None, 1))
('you', (None, 1))
('be', (None, 1))
('the', (None, 1))
('try', (None, 1))
('this', (None, 1))
('desert', ('negative', 1))
...

-------------------------------------------
Time: 2019-01-25 17:36:44
-------------------------------------------
('happen', (None, 1))
('to', (None, 1))
('be', (None, 1))
('supplied', (None, 1))
('a', (None, 1))
('experiment', (None, 1))
('metaphysical', (None, 1))
('if', (None, 1))
('your', (None, 1))
('caravan', (None, 1))
...

-------------------------------------------
Time: 2019-01-25 17:36:45
-------------------------------------------
('yes', (None, 1))
('water', (None, 1))
('are', (None, 1))
('every', (None, 1))
('knows', (None, 1))
('and', (None, 1))
('as', (None, 1))
('meditation', (None, 1))
('wedded', (None, 1))
('professor', (None, 1))
...

--------

-------------------------------------------
Time: 2019-01-25 17:37:10
-------------------------------------------
('of', (None, 1))
('surely', (None, 1))
('meaning', (None, 1))
('and', (None, 1))
('without', (None, 1))
('own', (None, 1))
('brother', (None, 1))
('jove', (None, 1))
('this', (None, 1))
('is', (None, 1))
...

-------------------------------------------
Time: 2019-01-25 17:37:11
-------------------------------------------
('deeper', (None, 1))
('meaning', (None, 1))
('of', (None, 2))
('and', (None, 1))
('still', (None, 1))
('the', (None, 1))
('that', (None, 1))
('who', (None, 1))
('story', (None, 1))
('narcissus', (None, 1))
...

-------------------------------------------
Time: 2019-01-25 17:37:12
-------------------------------------------
('he', (None, 2))
('in', (None, 1))
('the', (None, 2))
('mild', (None, 1))
('saw', (None, 1))
('grasp', (None, 1))
('image', (None, 1))
('fountain', (None, 1))
('could', (None, 1))
('not', (None, 1))
...

-------------------------------

-------------------------------------------
Time: 2019-01-25 17:37:37
-------------------------------------------
('of', (None, 1))
('in', (None, 1))
('their', (None, 1))
('the', (None, 1))
('mummies', (None, 1))
('creatures', (None, 1))
('huge', (None, 1))
('pyramids', (None, 1))
('those', (None, 1))
('bake-houses', (None, 1))

-------------------------------------------
Time: 2019-01-25 17:37:38
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:37:39
-------------------------------------------
('right', ('positive', 1))
('no', (None, 1))
('when', (None, 1))
('i', (None, 2))
('go', (None, 2))
('sea', (None, 1))
('to', (None, 1))
('a', (None, 1))
('the', (None, 1))
('mast', (None, 1))
...

-------------------------------------------
Time: 2019-01-25 17:37:40
-------------------------------------------
('aloft', (None, 1))
('there', (None, 1))
('the', (None, 2))
('to', (None, 1))
('plumb', (None, 1))
('down', (None, 1))
('into',

> **TODO**: Zmodyfikuj powyższą aplikację tak aby zwracała liczbę wystąpień słów pozytywnych, negatywnych i neutralnych

In [40]:
sc = SparkContext("local[2]", "NetworkWC")
ssc = StreamingContext(sc, 1)

In [41]:
sentyment = sc.textFile("words.txt").map(lambda x: x.split(",")).map(lambda x: (x[0],x[1]))

In [42]:
lines = ssc.socketTextStream("localhost", 9999)
words = lines.flatMap(lambda line: line.split(" ")).flatMap(lambda line: line.split("—"))\
.filter(lambda x: x != "" and not x.isdigit())

In [43]:
pairs = words.map(lambda word: (word.lower().strip("?!,.-;:"),1))\
.transform(lambda rdd: sentyment.rightOuterJoin(rdd))
PS = pairs.map(lambda x: (x[1][0] if x[1][0] != None else 'neutral',x[1][1])).reduceByKey(lambda x,y: x+y)

In [44]:
PS.pprint()

In [45]:
ssc.start()
ssc.awaitTermination(120)
ssc.stop(True,True)

-------------------------------------------
Time: 2019-01-25 17:38:03
-------------------------------------------
('negative', 1)
('neutral', 1)

-------------------------------------------
Time: 2019-01-25 17:38:04
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:38:05
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:38:06
-------------------------------------------
('neutral', 3)

-------------------------------------------
Time: 2019-01-25 17:38:07
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:38:08
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:38:09
-------------------------------------------
('neutral', 3)

-------------------------------------------
Time: 2019-01-25 17:38:10
-------------------------------------------

-------------------

-------------------------------------------
Time: 2019-01-25 17:39:05
-------------------------------------------
('neutral', 1)

-------------------------------------------
Time: 2019-01-25 17:39:06
-------------------------------------------

-------------------------------------------
Time: 2019-01-25 17:39:07
-------------------------------------------
('neutral', 13)

-------------------------------------------
Time: 2019-01-25 17:39:08
-------------------------------------------
('positive', 2)
('neutral', 10)

-------------------------------------------
Time: 2019-01-25 17:39:09
-------------------------------------------
('neutral', 13)

-------------------------------------------
Time: 2019-01-25 17:39:10
-------------------------------------------
('negative', 1)
('neutral', 14)

-------------------------------------------
Time: 2019-01-25 17:39:11
-------------------------------------------
('neutral', 13)

-------------------------------------------
Time: 2019-01-25 17:39:1

-------------------------------------------
Time: 2019-01-25 17:40:05
-------------------------------------------



# *SPARK STUCTURED STREAMING*

In [46]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, lower, udf, window, min, max, avg

# 1 aplikacja

In [47]:
spark = SparkSession.builder.appName("Structured").getOrCreate()

In [48]:
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).load()

In [49]:
words = lines.select(explode(split(lines.value, " ")).alias("word"))
wordCounts = words.groupBy("word").count()

In [50]:
query = wordCounts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()

> **TODO**: Zmodyfikuj powyższą aplikację tak aby zwracała liczbę wystąpień oczyszczonych **słów** (konieczne będzie zastosowanie udf)

In [51]:
spark = SparkSession.builder.appName("Structured").getOrCreate()

In [52]:
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).load()

In [53]:
def cleaning(string):
    x = string.strip("?!,.-;:")
    if x.isdigit():
        return ""
    else:
        return x
    
cleaning = udf(cleaning)

In [54]:
one = lines.select(explode(split(lower(lines.value), " ")).alias("word"))
two = one.select(explode(split(one.word, "—")).alias("word"))
three = two.select(cleaning(two.word).alias("word"))
words = three.filter(three.word != "")

wordCounts = words.groupBy("word").count()

In [55]:
query = wordCounts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()

# 2 aplikacja

Tak wyglądać powinien schemat danych:

In [None]:
#from pyspark.sql.types import *
#schemat = StructType([
#    StructField("age", DoubleType(), True),
#    StructField("workclass", StringType(), True),
#    StructField("fnlwgt", DoubleType(), True),
#    StructField("education", StringType(), True),
#    StructField("education-num", DoubleType(), True),
#    StructField("marital-status", StringType(), True),
#    StructField("occupation", StringType(), True),
#    StructField("relationship", StringType(), True),
#    StructField("race", StringType(), True),
#    StructField("sex", StringType(), True),
#    StructField("capital-gain", DoubleType(), True),
#    StructField("capital-loss", DoubleType(), True),
#    StructField("hours-per-week", DoubleType(), True),
#    StructField("native-country", StringType(), True),
#    StructField("target", StringType(), True)#
#])

In [56]:
spark = SparkSession.builder.appName("Structured").getOrCreate()

In [57]:
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9998).load()
lin = lines.withColumn("value", split(lines.value, ", "))
linn = lin.select(
    lin.value.getItem(0).cast("float").alias("age"),
    lin.value.getItem(1).alias("workclass"),
    lin.value.getItem(2).cast("float").alias("fnlwgt"),
    lin.value.getItem(3).alias("education"),
    lin.value.getItem(4).cast("float").alias("education-num"),
    lin.value.getItem(5).alias("marital-status"),
    lin.value.getItem(6).alias("occupation"),
    lin.value.getItem(7).alias("relationship"),
    lin.value.getItem(8).alias("race"),
    lin.value.getItem(9).alias("sex"),
    lin.value.getItem(10).cast("float").alias("capital-gain"),
    lin.value.getItem(11).cast("float").alias("capital-loss"),
    lin.value.getItem(12).cast("float").alias("hours-per-week"),
    lin.value.getItem(13).alias("native-country"),
    lin.value.getItem(14).alias("target"))
lineCounts = linn.groupBy('workclass').avg("age", "hours-per-week")

In [58]:
query = lineCounts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()

> **TODO**: Napisz aplikację liczącą średnią "capital-gain" i "capital-loss" dla płci i edukacji

In [59]:
spark = SparkSession.builder.appName("Structured").getOrCreate()

In [60]:
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9998).load()
lin = lines.withColumn("value", split(lines.value, ", "))
linn = lin.select(
    lin.value.getItem(0).cast("float").alias("age"),
    lin.value.getItem(1).alias("workclass"),
    lin.value.getItem(2).cast("float").alias("fnlwgt"),
    lin.value.getItem(3).alias("education"),
    lin.value.getItem(4).cast("float").alias("education-num"),
    lin.value.getItem(5).alias("marital-status"),
    lin.value.getItem(6).alias("occupation"),
    lin.value.getItem(7).alias("relationship"),
    lin.value.getItem(8).alias("race"),
    lin.value.getItem(9).alias("sex"),
    lin.value.getItem(10).cast("float").alias("capital-gain"),
    lin.value.getItem(11).cast("float").alias("capital-loss"),
    lin.value.getItem(12).cast("float").alias("hours-per-week"),
    lin.value.getItem(13).alias("native-country"),
    lin.value.getItem(14).alias("target"))
lineCounts = linn.groupBy(["sex","education"]).avg("capital-gain", "capital-loss")

In [61]:
query = lineCounts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()

# 3 aplikacja

In [62]:
spark = SparkSession.builder.appName("Structured").getOrCreate()

In [63]:
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9998)\
.option("includeTimestamp", True).load()
lin = lines.withColumn("value", split(lines.value, ", "))
linn = lin.select(
    lin.value.getItem(0).cast("float").alias("age"),
    lin.value.getItem(1).alias("workclass"),
    lin.value.getItem(2).cast("float").alias("fnlwgt"),
    lin.value.getItem(3).alias("education"),
    lin.value.getItem(4).cast("float").alias("education-num"),
    lin.value.getItem(5).alias("marital-status"),
    lin.value.getItem(6).alias("occupation"),
    lin.value.getItem(7).alias("relationship"),
    lin.value.getItem(8).alias("race"),
    lin.value.getItem(9).alias("sex"),
    lin.value.getItem(10).cast("float").alias("capital-gain"),
    lin.value.getItem(11).cast("float").alias("capital-loss"),
    lin.value.getItem(12).cast("float").alias("hours-per-week"),
    lin.value.getItem(13).alias("native-country"),
    lin.value.getItem(14).alias("target"),
    lin.timestamp.alias("timestamp"))
lineCounts = linn.groupBy(window("timestamp", "20 seconds", "10 seconds")).avg("hours-per-week")

In [64]:
query = lineCounts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()

> **TODO**: Napisz aplikację liczącą średnią, maximum i minimum "hours-per-week" dla rasy w oknach 20 sekundowych z krokiem 10 sekundowym

In [65]:
spark = SparkSession.builder.appName("Structured").getOrCreate()

In [66]:
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9998)\
.option("includeTimestamp", True).load()
lin = lines.withColumn("value", split(lines.value, ", "))
linn = lin.select(
    lin.value.getItem(0).cast("float").alias("age"),
    lin.value.getItem(1).alias("workclass"),
    lin.value.getItem(2).cast("float").alias("fnlwgt"),
    lin.value.getItem(3).alias("education"),
    lin.value.getItem(4).cast("float").alias("education-num"),
    lin.value.getItem(5).alias("marital-status"),
    lin.value.getItem(6).alias("occupation"),
    lin.value.getItem(7).alias("relationship"),
    lin.value.getItem(8).alias("race"),
    lin.value.getItem(9).alias("sex"),
    lin.value.getItem(10).cast("float").alias("capital-gain"),
    lin.value.getItem(11).cast("float").alias("capital-loss"),
    lin.value.getItem(12).cast("float").alias("hours-per-week"),
    lin.value.getItem(13).alias("native-country"),
    lin.value.getItem(14).alias("target"),
    lin.timestamp.alias("timestamp"))
lineCounts = linn.groupBy(window("timestamp", "20 seconds", "10 seconds"), "race")\
.agg(avg("hours-per-week"),max("hours-per-week"),min("hours-per-week"))

In [67]:
query = lineCounts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()