In [1]:
# Example weather station data
#
# 1419408015	0R1,Dn=059D,Dm=066D,Dx=080D,Sn=8.5M,Sm=9.5M,Sx=10.3M
# 1419408016	0R1,Dn=059D,Dm=065D,Dx=078D,Sn=8.5M,Sm=9.5M,Sx=10.3M
# 1419408016	0R2,Ta=13.9C,Ua=28.5P,Pa=889.9H
# 1419408017	0R1,Dn=059D,Dm=064D,Dx=075D,Sn=8.7M,Sm=9.6M,Sx=10.3M
# 1419408018	0R1,Dn=059D,Dm=064D,Dx=075D,Sn=8.9M,Sm=9.6M,Sx=10.3M
# 1419408019	0R1,Dn=059D,Dm=065D,Dx=075D,Sn=8.8M,Sm=9.5M,Sx=10.3M

In [2]:
# Key for measurements:
#
# Sn      Wind speed minimum m/s, km/h, mph, knots #,M, K, S, N
# Sm      Wind speed average m/s, km/h, mph, knots #,M, K, S, N
# Sx      Wind speed maximum m/s, km/h, mph, knots #,M, K, S, N
# Dn      Wind direction minimum deg #, D
# Dm      Wind direction average deg #, D
# Dx      Wind direction maximum deg #, D
# Pa      Air pressure hPa, Pa, bar, mmHg, inHg #, H, P, B, M, I
# Ta      Air temperature °C, °F #, C, F
# Tp      Internal temperature °C, °F #, C, F
# Ua      Relative humidity %RH #, P
# Rc      Rain accumulation mm, in #, M, I
# Rd      Rain duration s #, S
# Ri      Rain intensity mm/h, in/h #, M, I
# Rp      Rain peak intensity mm/h, in/h #, M, I
# Hc      Hail accumulation hits/cm2, hits/in2, hits #, M, I, H
# Hd      Hail duration s #, S
# Hi      Hail intensity hits/cm2h, hits/in2h, hits/h #, M, I, H
# Hp      Hail peak intensity hits/cm2h, hits/in2h, hits/h #, M, I, H
# Th      Heating temperature °C, °F #, C, F
# Vh      Heating voltage V #, N, V, W, F2
# Vs      Supply voltage V V
# Vr      3.5 V ref. voltage V V

In [3]:
# Parse a line of weather station data, returning the average wind direction (Dm)
# as a one-element list, or an empty list if the line has no Dm field.
#
import re
def parse(line):
    """Extract the average wind direction (``Dm``) reading from one station line.

    Args:
        line: One raw text record from the weather station, e.g.
            ``"0R1,Dn=059D,Dm=066D,Dx=080D,Sn=8.5M,Sm=9.5M,Sx=10.3M"``.

    Returns:
        ``[value]`` with the digits following ``Dm=`` converted to ``int``
        (leading zeros dropped, so ``066`` becomes ``66``), or ``[]`` when the
        line contains no ``Dm=<digits>`` field. Returning a list lets the
        function be used with ``flatMap`` so non-matching lines vanish.
    """
    # Raw string: in a plain literal "\d" is an invalid string escape, which
    # Python reports as a DeprecationWarning/SyntaxWarning.
    match = re.search(r"Dm=(\d+)", line)
    return [int(match.group(1))] if match else []

In [4]:
from pyspark.streaming import StreamingContext
# Streaming context on top of the existing SparkContext `sc`, using 1-second batches.
ssc = StreamingContext(sc, batchDuration=1)

In [6]:
# DStream of raw text lines read from the weather station's TCP socket.
lines = ssc.socketTextStream(hostname="rtd.hpwren.ucsd.edu", port=12028)

In [7]:
# flatMap flattens the 0- or 1-element lists returned by parse(), so lines
# without a Dm field contribute nothing to the resulting stream of ints.
vals = lines.flatMap(parse)

In [8]:
# Sliding window: each RDD covers the last 10 seconds of readings and a new
# window is produced every 5 seconds.
window = vals.window(windowDuration=10, slideDuration=5)

In [9]:
def stats(rdd):
    """Print the contents of one RDD and, when it is non-empty, its max and min.

    Args:
        rdd: An RDD (or any object with a ``collect()`` method) whose elements
            are mutually comparable; in this notebook, the ints produced by
            ``parse``.

    Returns:
        None. The values and the summary line are written to stdout.
    """
    # Collect once and compute the extremes locally: calling count(), max()
    # and min() on the RDD would each trigger another Spark job per window.
    values = rdd.collect()
    print(values)
    if values:
        print("max = {}, min = {}".format(max(values), min(values)))


In [10]:
# Register stats() to run on every RDD the windowed stream produces.
# The lambda resolves `stats` when it is called, not when it is registered.
window.foreachRDD(lambda rdd: stats(rdd))

In [11]:
# Start the streaming computation; the registered foreachRDD callback begins
# printing each window's values from this point on.
ssc.start()

[]
[151, 154, 157, 155, 153]
max = 157, min = 151
[151, 154, 157, 155, 153, 154, 163, 185, 167, 162]
max = 185, min = 151
[154, 163, 185, 167, 162, 162, 167, 172, 184, 190]
max = 190, min = 154
[162, 167, 172, 184, 190, 194, 196, 179, 186, 186, 179]
max = 196, min = 162
[194, 196, 179, 186, 186, 179, 185, 181, 177, 171, 170]
max = 196, min = 170
[185, 181, 177, 171, 170, 170, 170, 170, 170, 178]
max = 185, min = 170
[170, 170, 170, 170, 178, 167, 166, 166, 166, 167]
max = 178, min = 166
[167, 166, 166, 166, 167, 167, 161, 172, 183, 184]
max = 184, min = 161
[167, 161, 172, 183, 184, 187, 196, 199, 198, 207]
max = 207, min = 161
[187, 196, 199, 198, 207, 224, 233, 233, 221, 213]
max = 233, min = 187
[224, 233, 233, 221, 213, 216, 220, 234, 252, 260]
max = 260, min = 213
[216, 220, 234, 252, 260, 260, 257, 256, 259, 258]
max = 260, min = 216
[260, 257, 256, 259, 258, 257, 248, 240, 223, 206]
max = 260, min = 206
[257, 248, 240, 223, 206, 189, 183, 175, 172, 172]
max = 257, min = 172
[189

In [12]:
# Stop the streaming computation.
# NOTE(review): StreamingContext.stop() defaults to stopSparkContext=True, so
# `sc` is presumably unusable after this cell — confirm before re-running cells.
ssc.stop()