In [1]:
# Example weather station data
#
# 1419408015	0R1,Dn=059D,Dm=066D,Dx=080D,Sn=8.5M,Sm=9.5M,Sx=10.3M
# 1419408016	0R1,Dn=059D,Dm=065D,Dx=078D,Sn=8.5M,Sm=9.5M,Sx=10.3M
# 1419408016	0R2,Ta=13.9C,Ua=28.5P,Pa=889.9H
# 1419408017	0R1,Dn=059D,Dm=064D,Dx=075D,Sn=8.7M,Sm=9.6M,Sx=10.3M
# 1419408018	0R1,Dn=059D,Dm=064D,Dx=075D,Sn=8.9M,Sm=9.6M,Sx=10.3M
# 1419408019	0R1,Dn=059D,Dm=065D,Dx=075D,Sn=8.8M,Sm=9.5M,Sx=10.3M

In [2]:
# Key for measurements:
#
# Sn      Wind speed minimum m/s, km/h, mph, knots #,M, K, S, N
# Sm      Wind speed average m/s, km/h, mph, knots #,M, K, S, N
# Sx      Wind speed maximum m/s, km/h, mph, knots #,M, K, S, N
# Dn      Wind direction minimum deg #, D
# Dm      Wind direction average deg #, D
# Dx      Wind direction maximum deg #, D
# Pa      Air pressure hPa, Pa, bar, mmHg, inHg #, H, P, B, M, I
# Ta      Air temperature °C, °F #, C, F
# Tp      Internal temperature °C, °F #, C, F
# Ua      Relative humidity %RH #, P
# Rc      Rain accumulation mm, in #, M, I
# Rd      Rain duration s #, S
# Ri      Rain intensity mm/h, in/h #, M, I
# Rp      Rain peak intensity mm/h, in/h #, M, I
# Hc      Hail accumulation hits/cm2, hits/in2, hits #, M, I, H
# Hd      Hail duration s #, S
# Hi      Hail intensity hits/cm2h, hits/in2h, hits/ h #, M, I, H
# Hp      Hail peak intensity hits/cm2h, hits/in2h, hits/ h #, M, I, H
# Th      Heating temperature °C, °F #, C, F
# Vh      Heating voltage V #, N, V, W, F2
# Vs      Supply voltage V V
# Vr      3.5 V ref. voltage V V

In [3]:
# Parse a line of weather station data, returning the average wind direction (Dm) as a one-element list, or an empty list when the line has no Dm field
#
import re
def parse(line):
    """Extract the average wind direction (``Dm``) from one weather-station line.

    Example input line::

        1419408015\\t0R1,Dn=059D,Dm=066D,Dx=080D,Sn=8.5M,Sm=9.5M,Sx=10.3M

    Args:
        line: One raw text line from the weather station feed.

    Returns:
        ``[value]`` (a one-element list holding the integer that follows
        ``Dm=``) when the line contains a ``Dm`` field, otherwise ``[]``.
        Returning a list lets the function be used with ``flatMap`` so that
        lines without the field simply contribute nothing.
    """
    # Raw string literal: in a normal string "\d" is an invalid escape sequence
    # that newer Python versions warn about.
    #   (   start a capture group
    #   \d  shorthand character class matching a decimal digit
    #   +   one or more of the preceding expression
    #   )   end the capture group
    match = re.search(r"Dm=(\d+)", line)
    if match is None:
        return []
    # group(1) is the run of digits captured after "Dm="
    return [int(match.group(1))]

In [4]:
from pyspark.streaming import StreamingContext

In [5]:
# Streaming context on top of the existing SparkContext `sc`, with a batch
# duration of 1 (seconds, per the PySpark StreamingContext API).
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

In [6]:
# DStream of text lines read from the weather station's TCP feed.
lines = ssc.socketTextStream(hostname="rtd.hpwren.ucsd.edu", port=12028)

In [7]:
# parse() returns [] or [value], so flatMap drops lines without a Dm field and
# yields a flat stream of integer wind-direction values.
vals = lines.flatMap(parse)

In [8]:
# Sliding window over the parsed values: window length 10, sliding every 5, so
# consecutive windows overlap (visible in the printed output).
window = vals.window(windowDuration=10, slideDuration=5)

In [9]:
def stats(rdd):
    """Print the contents of an RDD, followed by its max and min when non-empty.

    Args:
        rdd: An RDD-like object whose ``collect()`` returns a list of
            comparable values (here, the integers produced by ``parse``).

    Returns:
        None. Output is written to stdout: first the collected list, then a
        ``max = ..., min = ...`` line if the list has at least one element.
    """
    # Collect once and derive everything locally. collect(), count(), max()
    # and min() are each a separate Spark action, so calling all four would
    # evaluate the same windowed RDD four times.
    values = rdd.collect()
    print(values)
    if values:
        print("max = {}, min = {}".format(max(values), min(values)))

In [10]:
# Run stats() on the RDD of every window; stats takes exactly one argument
# (the RDD), so it can be registered directly.
window.foreachRDD(stats)

In [11]:
# Start the streaming computation; from here on, stats() prints one block of
# output per window (see the output below).
ssc.start()

[]
[106, 106, 108, 108]
max = 108, min = 106
[106, 106, 108, 108, 110, 110, 112, 113, 115]
max = 115, min = 106
[110, 110, 112, 113, 115, 116, 116, 115, 113, 111]
max = 116, min = 110
[116, 116, 115, 113, 111, 110, 108, 107, 105, 103]
max = 116, min = 103
[110, 108, 107, 105, 103, 104, 106, 108, 107, 112]
max = 112, min = 103
[104, 106, 108, 107, 112, 112, 115, 116, 117, 118]
max = 118, min = 104
[112, 115, 116, 117, 118, 115, 113, 114, 115, 110]
max = 118, min = 110
[115, 113, 114, 115, 110, 111, 110, 109, 112, 114]
max = 115, min = 109
[111, 110, 109, 112, 114, 116, 118, 119, 121, 123]
max = 123, min = 109
[116, 118, 119, 121, 123, 123, 123, 122, 118, 114]
max = 123, min = 114
[123, 123, 122, 118, 114, 113, 113, 111, 111, 111]
max = 123, min = 111
[113, 113, 111, 111, 111, 112, 114, 117, 120, 120, 123]
max = 123, min = 111
[112, 114, 117, 120, 120, 123, 124, 126, 128, 127, 126]
max = 128, min = 112
[124, 126, 128, 127, 126, 126, 126, 122, 119, 116]
max = 128, min = 116
[126, 126, 122

In [12]:
# Stop the streaming computation.
# NOTE(review): per the PySpark API, stop() defaults to stopSparkContext=True,
# which also shuts down `sc`; pass stopSparkContext=False if the notebook
# should keep using the SparkContext afterwards — confirm intent.
ssc.stop()

[112, 112, 112, 114, 115, 116, 118, 120, 123, 125]
max = 125, min = 112
