## Init Spark and Python

In [1]:
# Import Spark bindings
# execfile (a Python 2 builtin) runs the bindings script inside this notebook's namespace.
# NOTE(review): "binings" looks like a typo of "bindings" - confirm the file name on disk before changing it.
execfile("/etc/spark/conf/spark_1.6.0_binings.py")

In [2]:
import os
import ais
import time
import numpy as np
import threading
import Queue
from datetime import datetime
import happybase
from Geohash import geohash
from scipy.spatial import distance
from geopy.distance import vincenty

#plotly
import plotly
import plotly.plotly as py  
import plotly.tools as tls   
import plotly.graph_objs as go
import randomcolor
import IPython

In [5]:
# Set distribution mode, application name, and claim resources.
# These values are the only settings meant to be tuned; the section below
# copies them into a SparkConf and creates the SparkContext `sc`.
master='yarn-client' #"yarn-client" to run distributed on YARN, "local" to run locally
#dmode='client' #spark2.0 only
AppName="AIS - streaming Kystverket"
num_executors=2
exec_memory=1 #in gigabytes per executor. Total memory = num_executors*exec_memory
driver_memory=1 #in gigabytes



#############--==DO NOT EDIT==--###############
# Translate the settings above into Spark configuration keys.
from pyspark import SparkConf
sconf=SparkConf()

sconf.set('spark.master',master)
#sconf.set('spark.submit.deployMode',dmode) #spark2.0 only
sconf.set('spark.executor.instances',str(num_executors)) # number of executors
#sconf.set('spark.shuffle.service.enabled',True)
#sconf.set('spark.dynamicAllocation.enabled',True)
# Memory sizes are passed as strings with a 'g' suffix, e.g. '1g'
sconf.set('spark.executor.memory',str(exec_memory)+'g')
sconf.set('spark.driver.memory',str(driver_memory)+'g')
#sconf.set('spark.executor.cores','2') # number of cores on same worker
sconf.set('spark.app.name',AppName) # application name
# The application name is also used as the application id
sconf.set('spark.app.id',AppName)

# StreamingContext is imported here for the streaming cell; only the
# SparkContext is created in this cell.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
sc = SparkContext(conf=sconf)
###############################################

## Streaming Context

In [6]:
#STREAM DATA
# Starts the streaming job in a background thread. Decoded batches are handed
# over to the notebook through the queue `q`.
# NOTE: spark_stream() must already be defined (run its cell first).
threads = []
q = Queue.Queue()

# Drop handles left over from a previous run of this cell. Each name is checked
# on its own: deleting both whenever either one exists raises NameError when
# only one of them is defined.
if 't_list' in locals():
    del t_list
if 'ssc' in locals():
    del ssc
batch_interval=1#Seconds
ssc = StreamingContext(sc, batch_interval)

#Kystverket's open streaming connection:
streaming_host = "153.44.253.27"
streaming_port = 5631

#Set region of interest
#bbox=[lllat,lllon,urlat,urlon]
#bbox=[59.0, 10.224365,59.881444, 11.728791]# <- Oslofjorden 
bbox=[0,0,100,100]

#Dump data or not
dump_data=False

threads.append(threading.Thread(target=spark_stream,\
                            args=(sc, ssc,streaming_host,streaming_port,q,bbox,dump_data)))

# Thread.start() returns None, so t_list only records that the threads were
# started; the Thread objects themselves stay in `threads`.
t_list=[t.start() for t in threads]





In [3]:
def spark_stream(sc, ssc,streaming_host,streaming_port,q,bbox,dump_data):
    """Build and start the AIS streaming pipeline.

    Reads NMEA sentences from a TCP socket, decodes them, enriches every
    message with vessel metadata from HBase and pushes each collected batch
    onto the queue ``q``.

    Parameters:
        sc: SparkContext (not used inside this function).
        ssc: StreamingContext the pipeline is attached to.
        streaming_host, streaming_port: address of the NMEA text stream.
        q: queue that receives one list of decoded rows per batch.
        bbox: [lllat, lllon, urlat, urlon] region filter passed to try_decode.
        dump_data: when True, every batch is also written to the local
            file system through save_to_local.

    Each row put on the queue is the list produced by try_decode with
    [imo, name, type] appended, so index 9 holds the vessel name.
    """
    def _append_vessel_meta(record):
        # Look the vessel up once and reuse the row for all three fields;
        # every call to get_meta_from_mmsi opens its own HBase connection.
        meta = get_meta_from_mmsi(str(record[0]))
        return record + [meta["P:imo"], meta["P:name"], meta["P:type"]]

    #Connect to stream
    nmea = ssc.socketTextStream(streaming_host, streaming_port)

    # Decode and filter bad messages (try_decode returns [] for those)
    nmea_decoded = nmea.map(lambda x: try_decode(x,bbox))
    nmea_decoded = nmea_decoded.filter(lambda x:x!=[])

    # Connect to HBase and add metadata about vessel
    nmea_decoded = nmea_decoded.map(_append_vessel_meta)

    # Drop vessels that are unknown to HBase (index 9 is the vessel name)
    nmea_decoded = nmea_decoded.filter(lambda x:x[9]!='not_found')

    # Collect every batch and put it in the queue for the plotting cells
    nmea_decoded.foreachRDD(lambda rdd: q.put(rdd.collect()))

    if dump_data==True:
        # Since each batch is small, collect it and save it to the local FS too
        nmea_decoded.map(lambda x: rdd_list_to_str(x))\
        .foreachRDD(lambda rdd: save_to_local(rdd.collect()))

    # Run!
    ssc.start()
    # Returns after the timeout at the latest; the stream is stopped
    # explicitly with ssc.stop() in a separate cell.
    ssc.awaitTermination(timeout=10)

In [13]:
# Stop the StreamingContext but keep the SparkContext alive
# (stopSparkContext=False), so a new StreamingContext can be created on `sc`.
ssc.stop(stopSparkContext=False)

In [14]:
# Stop the SparkContext `sc` created in the configuration cell
sc.stop()

## Find vessels to follow on the map (view it live on your laptop at https://plot.ly/~kentt/181)

In [None]:
# Number of most recent positions kept per vessel; also handed to plotly as
# the `maxpoints` of every stream.
maxpoints=10
# Number of vessels to pick from the live stream
num_ship=25
# Blocks on the queue `q` until enough vessels with a known name are seen
mmsi_dict=follow_mmsi(q,maxpoints,num_ship)
# Create the empty map figure with one trace per selected vessel
init_streaming_plot('AISstream',mmsi_dict)

# Embed the hosted plot in the notebook output
iframe = '<iframe width="1000" height="800" frameborder="0" scrolling="no" src="//plot.ly/~kentt/181.embed"></iframe>'
IPython.display.HTML(iframe)

## Start stream to map

In [None]:
# Open one plotly stream per followed vessel. This happens before the try
# block so that plotly_stream_dict is always bound when the cleanup runs.
ship_info_dict,plotly_stream_dict=plotly_stream_init(mmsi_dict)

try:
    while True:
        # Work on a copy so the previous state stays available for comparison
        ship_info_dict_new=acc_points(ship_info_dict.copy(),q.get(),maxpoints)

        # Only push vessels whose accumulated points changed in this batch
        for key in mmsi_dict.keys():
            if ship_info_dict_new[key]!=ship_info_dict[key]:
                ship_info_dict[key]=ship_info_dict_new[key]
                lets_stream({key:ship_info_dict[key]},{key:plotly_stream_dict[key]},{key:mmsi_dict[key]})

        time.sleep(1)

except KeyboardInterrupt:
    print('Aborting on Ctrl-C, goodbye!')
finally:
    # Close the streams on every exit path, not only on Ctrl-C
    plotly_stream_close(plotly_stream_dict)

## User defined functions

In [4]:
#INIT STREAMING PLOT
def init_streaming_plot(figname,mmsi_dict):
    """Create the initially empty Mapbox figure that the plotly streams write to.

    Parameters:
        figname: file name handed to py.iplot.
        mmsi_dict: dict keyed by mmsi; every value must provide the keys
            'color', 'stream_id' and 'name' (as built by follow_mmsi).

    One empty 'scattermapbox' trace is created per entry of ``mmsi_dict``.
    """
    # SECURITY: access tokens should not live in the notebook. The token is
    # read from the MAPBOX_ACCESS_TOKEN environment variable when it is set;
    # the literal below is kept only as a fallback so existing setups keep
    # working. NOTE(review): consider rotating and removing the fallback.
    mapbox_access_token = os.environ.get(
        "MAPBOX_ACCESS_TOKEN",
        "pk.eyJ1Ijoia2VudHQiLCJhIjoiY2l1dHIyMmsyMDAwZTJ5czlwNTY4c3E2ZCJ9.IcUEo9TXPyTiwMmrEiikvQ")

    # One empty trace per vessel, bound to that vessel's stream
    data = [
        dict(
            type='scattermapbox',
            lon=[],
            lat=[],
            mode='markers',
            marker={'size':10,'color':vessel['color']},
            stream=vessel['stream_id'],
            name=vessel["name"])
        for vessel in mmsi_dict.values()
    ]

    layout = go.Layout(
        autosize=True,
        hovermode='closest',
        mapbox=dict(
            accesstoken=mapbox_access_token,
            bearing=0,
            center=dict(
                lon=10,
                lat=59
            ),
            pitch=60,
            zoom=7
        ),
        width  = '1000',
        height = '800',
    )

    fig = dict(data=data, layout=layout)
    py.iplot(fig, filename=figname)

def plotly_stream_init(mmsi_dict):
    """Open a plotly stream and create an empty data record for every vessel.

    Parameters:
        mmsi_dict: dict keyed by mmsi; every value must provide a "token" key.

    Returns:
        (ship_info_dict, plotly_stream_dict), both keyed like ``mmsi_dict``.
        ship_info_dict maps to a list of 11 independent empty lists,
        plotly_stream_dict maps to the opened stream object.
    """
    field_count = 11  # one list per field of an enriched AIS row
    ship_info_dict = {}
    plotly_stream_dict = {}

    for mmsi, vessel in mmsi_dict.items():
        # Fresh empty record for the accumulated points
        ship_info_dict[mmsi] = [[] for _ in range(field_count)]
        # Stream object for this vessel, opened right away
        stream = py.Stream(stream_id=vessel["token"])
        plotly_stream_dict[mmsi] = stream
        stream.open()

    return ship_info_dict, plotly_stream_dict

def plotly_stream_close(plotly_stream_dict):
    """Close every stream object stored as a value of ``plotly_stream_dict``."""
    for stream in plotly_stream_dict.values():
        stream.close()


#Get list of mmsi's to follow
def follow_mmsi(q,mpoints,num_ship):
    """Pick vessels from the live stream and give each one its own plotly stream token.

    Batches are read from ``q`` until ``num_ship`` vessels with a known name
    have been selected, or until every configured stream token is in use,
    whichever comes first.

    Parameters:
        q: queue whose get() returns one list of decoded rows per batch;
            row[0] is the vessel's mmsi.
        mpoints: value stored as 'maxpoints' in each vessel's stream settings.
        num_ship: number of vessels wanted.

    Returns:
        dict keyed by mmsi. Each value is a dict with the keys "token",
        "name", "mmsi", "stream_id" and "color".
    """
    stream_tokens = tls.get_credentials_file()['stream_ids']
    # Never wait for more vessels than there are tokens to hand out,
    # otherwise the loop below could block on the queue forever.
    wanted = min(num_ship, len(stream_tokens))

    mmsi_stream_token = {}
    name_by_mmsi = {}  # cache: one HBase lookup per mmsi

    while len(mmsi_stream_token) < wanted:
        batch = q.get()
        time.sleep(0.3)

        for mmsi in sorted(set(row[0] for row in batch)):
            #Check HBase if ship name is found:
            if mmsi not in name_by_mmsi:
                name_by_mmsi[mmsi] = get_meta_from_mmsi(str(mmsi))['P:name']
            name = name_by_mmsi[mmsi]

            if name == 'not_found' or mmsi in mmsi_stream_token:
                continue

            # The next unused token: every selected vessel gets a distinct
            # token and no token is spent on a vessel that is skipped.
            token = stream_tokens[len(mmsi_stream_token)]
            mmsi_stream_token[mmsi] = {
                "token": token,
                "name": name,
                "mmsi": mmsi,
                "stream_id": dict(token=token, maxpoints=mpoints),
                "color": randomcolor.RandomColor().generate()[0],
            }

            if len(mmsi_stream_token) >= wanted:
                break

    return mmsi_stream_token

def acc_points(ship_info_dict,q,maxpoints):
    """Append the rows of one batch to the per-vessel records and cap their length.

    Parameters:
        ship_info_dict: dict keyed by mmsi; each value is a list of
            per-field lists. The dict is updated in place and returned.
        q: iterable of rows (one batch); row[0] is the mmsi and row[i]
            is appended to the i-th field list of that vessel.
        maxpoints: once a vessel holds more than this many points, only the
            last ``maxpoints`` entries of every field list are kept.

    Rows for vessels that are not tracked, and rows with fewer fields than
    the record, are skipped without modifying the record.
    """
    for row in q:
        try:
            mmsi = row[0]
            columns = ship_info_dict[mmsi]
            # Build new lists first so a short row never leaves a record
            # half updated.
            updated = [columns[i] + [row[i]] for i in range(len(columns))]
        except (KeyError, IndexError, TypeError):
            # Untracked vessel or malformed row: skip it. Unlike a bare
            # except this still lets Ctrl-C (KeyboardInterrupt) through.
            continue
        ship_info_dict[mmsi] = updated

    #Limit the number of accumulated points
    for columns in ship_info_dict.values():
        if len(columns[0]) > maxpoints:
            for i in range(len(columns)):
                columns[i] = columns[i][-maxpoints:]

    return ship_info_dict

def lets_stream(ship_info_dict,plotly_stream_dict,mmsi_dict):
    """Write the accumulated points of every vessel to its plotly stream.

    Parameters:
        ship_info_dict: dict keyed by mmsi; each value is a list of per-field
            lists. The fields read here are 1 (time), 2 (lon), 3 (lat),
            5 (SOG), 7 (COG), 8 (IMO) and 10 (type).
        plotly_stream_dict: dict keyed by mmsi holding objects with write().
        mmsi_dict: dict keyed by mmsi; each value provides "color".

    Vessels without any accumulated point are skipped. The newest point of a
    vessel is drawn in black, all older ones in the vessel's own colour.
    """
    def _hover_text(imo, stype, dtime, cog, sog):
        # Hover label for one point
        stamp = datetime.fromtimestamp(int(dtime)).strftime('%Y-%m-%d %H:%M:%S')
        return ('IMO: ' + str(imo) + '<br>'
                + 'Type: ' + str(stype) + '<br>'
                + 'Time: ' + stamp + '<br>'
                + 'COG: ' + str(int(cog)) + ' deg' + '<br>'
                + 'SOG: ' + str(int(sog)) + ' kn')

    for mmsi in mmsi_dict.keys():
        track = ship_info_dict[mmsi]
        if track[0] == []:
            continue

        colour = mmsi_dict[mmsi]["color"]

        imo_col = track[8]
        type_col = track[10]
        sog_col = track[5]
        cog_col = track[7]
        time_col = track[1]
        lon_col = track[2]
        lat_col = track[3]

        n_points = len(imo_col)
        # All points in the vessel colour except the most recent one
        point_colours = [colour] * (n_points - 1) + ["black"]
        labels = [_hover_text(imo_col[i], type_col[i], time_col[i], cog_col[i], sog_col[i])
                  for i in range(n_points)]

        plotly_stream_dict[mmsi].write(go.Scattermapbox(lon=lon_col,
                                                        lat=lat_col,
                                                        marker=go.Marker(color=point_colours),
                                                        text=labels))
    

    

def identify_nearest_neighbour(rdd):
    """Append the name of, and distance to, the nearest other vessel to every row.

    Parameters:
        rdd: list of rows; row[2] is the longitude, row[3] the latitude and
            row[9] the vessel name (the layout produced by spark_stream).

    Returns:
        A new list in which every row is extended by
        [nearest_name, distance_in_km]. If anything fails (for example when
        a row has no neighbour at a different position) the input ``rdd``
        is returned unchanged.

    The nearest neighbour is selected by Euclidean distance in degree space;
    the reported distance is the vincenty distance in kilometres.
    """
    try:
        rdd_nn=[]
        for line in rdd:
            lon, lat = line[2], line[3]

            #Make list of all neighbours' positions (rows at the very same
            #position, including the row itself, are left out)
            neighbour_positions=[]
            neighbour_names=[]
            for row in rdd:
                neighbour_position=[row[2],row[3]]
                neighbour_name=row[9]
                if [lon,lat] != neighbour_position:
                    neighbour_positions.append(neighbour_position)
                    neighbour_names.append(neighbour_name)

            #Find closest point:
            nearest=int(distance.cdist([(lon,lat)], np.array(neighbour_positions)).argmin())
            nn_lon, nn_lat = neighbour_positions[nearest]

            # vincenty expects (latitude, longitude), while the rows store
            # (longitude, latitude) - swap the order here.
            nn_distance=vincenty((lat,lon),(nn_lat,nn_lon)).kilometers

            #Append to RDD
            rdd_nn.append(line+[neighbour_names[nearest],nn_distance])

        return rdd_nn
    except Exception:
        # Best effort: fall back to the unmodified input
        return rdd

def save_to_local(rdd_collected, dirpath="/STAGING/DATASETS/AIS/"):
    """Write one collected batch to a time-stamped file on the local file system.

    Parameters:
        rdd_collected: iterable of strings; each is written as one line.
        dirpath: root directory of the dumps (defaults to the staging area).

    The file is called 'ais_<unix seconds>' and is placed in a per-day
    directory 'KV_ais_dump_<YYYYMMDD>' below ``dirpath``. The directory
    (when it is created here) and the file get mode 777.
    """
    now = time.time()
    filen = 'ais_' + str(int(now))
    dir_name = 'KV_ais_dump_' + datetime.fromtimestamp(now).strftime('%Y%m%d')

    target_dir = os.path.join(dirpath, dir_name)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        # os.chmod instead of a shell "chmod": no subprocess and no shell
        # parsing of the path.
        os.chmod(target_dir, 0o777)

    target_file = os.path.join(target_dir, filen)
    with open(target_file, 'w') as file_handler:
        for item in rdd_collected:
            file_handler.write(item + '\n')
    os.chmod(target_file, 0o777)
    

def rdd_list_to_str(rdd_list):
    """Return the elements of ``rdd_list`` as one comma-separated string.

    Every element is converted with str(); an empty list gives ''.
    """
    return ','.join(str(element) for element in rdd_list)

def get_meta_from_mmsi(mmsi):
    """Fetch the metadata row of a vessel from the HBase table "mmsiShipInfo".

    Parameters:
        mmsi: row key of the vessel (callers in this notebook pass a string).

    Returns:
        The row as a dict. When the table has no row for ``mmsi`` a
        placeholder dict is returned in which 'P:imo', 'P:name' and 'P:type'
        are 'not_found' and 'P:mmsi' is the requested key.

    A new connection is opened for every call and is always closed again,
    also when the lookup raises.
    """
    #Create connection
    connection = happybase.Connection('2.sherpa.client.sysedata.no')
    try:
        connection.open()
        table = connection.table("mmsiShipInfo")
        info_dict = table.row(mmsi)
    finally:
        # Release the connection even if the lookup failed
        connection.close()

    if not info_dict:
        info_dict={'P:imo': 'not_found','P:mmsi': mmsi,'P:name': 'not_found','P:type': 'not_found'}

    return info_dict

def try_decode(nmea,bbox):
    """Decode one NMEA sentence into a flat list, or [] when it is unusable.

    Parameters:
        nmea: raw sentence, handed to decode_nmea_no_prefix.
        bbox: [lllat, lllon, urlat, urlon]; the position must lie strictly
            inside this box.

    Returns:
        [mmsi, unixtime, lon, lat, geohash, sog, rot, cog] for a message with
        a position inside the box; [] when the position is outside the box
        or when decoding fails for any reason.
    """
    try:
        msg = decode_nmea_no_prefix(nmea)
        lat = msg['y']
        lon = msg['x']

        inside_box = bbox[0] < lat < bbox[2] and bbox[1] < lon < bbox[3]
        if not inside_box:
            return []

        return [int(msg['mmsi']),
                msg['unixtime'],
                float(msg['x']),
                float(msg['y']),
                msg['geohash'],
                float(msg['sog']),
                float(msg['rot']),
                float(msg['cog'])]
    except:
        # Any failure (undecodable sentence, missing field) marks the message as bad
        return []

def decode_nmea_no_prefix(nmea):
    """Decode the AIS payload of one NMEA sentence and attach NMEA bookkeeping fields.

    The sentence is split on ','. The payload is the second to last field and
    the number of fill bits is the first character of the last field.

    Decoding is attempted with the fill bits from the sentence and, if that
    fails, once more with 2 fill bits. If both attempts fail the result is a
    placeholder message {'id': 30}.

    Returns:
        The decoded message dict extended with 'unixtime' (time of decoding),
        'n_talkerid', 'n_fragmentno', 'n_seqmsg', 'n_aispayload' and
        'n_fillbits'. When the message has both 'x' (longitude) and 'y'
        (latitude) a 13 character 'geohash' is added, or '0' if the position
        cannot be encoded.
    """
    fields = nmea.split(',')

    talker_id = fields[1].split('\\')[-1]
    fragment_no = fields[3]
    seq_message_id = fields[4]
    payload = fields[-2]
    declared_fill_bits = int(fields[-1][0])

    #Decode ais payload: first with the declared fill bits, then with 2
    for fill_bits in (declared_fill_bits, 2):
        try:
            aisdata = ais.decode(payload, fill_bits)
            msg_type = int(aisdata['id'])
            break
        except:
            continue
    else:
        # Neither attempt worked (fill_bits is left at 2 in this case)
        msg_type = 30
        aisdata = {'id': msg_type}

    if msg_type == 20:
        # NOTE(review): unroll_msg20 is not defined in the visible cells of this
        # notebook - presumably it comes from the bindings script; confirm.
        aisdata = unroll_msg20(aisdata)

    # x - longitude, y - latitude
    if 'x' in aisdata and 'y' in aisdata:
        try:
            aisdata[u'geohash'] = geohash.encode(aisdata['y'], aisdata['x'], 13)
        except:
            aisdata[u'geohash'] = '0'

    #Append NMEA Tag Blocks
    # The sentence carries no timestamp of its own, so the time of decoding is used
    tag_fields = (
        (u'unixtime', int(time.time())),
        (u'n_talkerid', talker_id),
        (u'n_fragmentno', fragment_no),
        (u'n_seqmsg', seq_message_id),
        (u'n_aispayload', payload),
        (u'n_fillbits', fill_bits),
    )
    for key, value in tag_fields:
        aisdata[key] = value

    return aisdata