# Problem 5

In [6]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.ticker as ticker   


# setup spark
conf = SparkConf().setAppName('BigGraph')
sc = SparkContext(conf=conf)
#from P5_bfs import *

import pandas as pd

In [27]:
import sys
import functools

# put the whole process into a function, given an rdd which holds as elements
# (v, [v_1, ..., v_d]),  a start node v0 and the spark context corresponding to the rdd
def sparkSSSP(context, rdd, v0, vT, max_iter):
    
    # prepare data structure for single source shortest paths
    imaxvalue = sys.maxint

    # currently our rdd looks like (v, [v1, v2, ...]). now we map it to a tuple with
    # (v, <adj. list.>, disttov0, color, <pred. list>) where disttov0 is 0 for v = v0 and imaxvalue else,
    # color = GRAY for v0 and BLACK else
    # WHITE means vertex not visited yet
    # GRAY means vertex is visited in the next hop
    # BLACK mean vertex already visited

    # to speedup we use
    # WHITE = 2
    # GRAY = 1
    # BLACK = 0
    rdd = rdd.map(lambda x: (x[0], x[1], 0 if x[0] == v0 else imaxvalue, 1 if x[0] == v0 else 2))
    
    # map to (K, V) form
    rdd = rdd.map(lambda x: (x[0], (x[1], x[2], x[3], [])))
    
    # helper functions for one hop START
    
    # say we have given (v, [v_1, ..., v_d], d, 'GRAY',  [...])
    # this will be expanded to 
    # (v_1, NULL, d+1, 'GRAY', [..., v])
    # ...
    # (v_d, NULL, d+1, 'GRAY', [..., v])
    # (v, [v_1, ..., v_d], d, 'BLACK', [..., v])
    # in the next step we can then call a reducebykey to update distances/adjacency lists
    def expandNode(x):
        if x[1][2] == 1: # 'GRAY'
        # set current node to visited
            res = []
            res.append( (x[0], (x[1][0], x[1][1], 0, [x[1][3]])) ) # 'BLACK'

            # spawn new GRAY nodes
            for i in range(0, len(x[1][0])):
                res.append( (x[1][0][i], ([], x[1][1] + 1, 1, [x[1][3] + [x[0]]])) ) # 'GRAY'

            return tuple(res)
        else: 
            return [x]

    # in the next step we combine all tuples for the same key returning
    # the minimum distance, longest adjacency list and darkest color
    # and minimum path
    # the algorithm will determine if there is no gray node left
    def reduceNodes(a, b):

        # simple solution for only some shortest path
        #res = (a[0] if len(a[0]) > len(b[0]) else b[0], min(a[1], b[1]), min(a[2], b[2]), a[3] if a[1] < b[1] else b[3])

        #solution for multiple shortest paths is more complicated ==> we store them in a list!
        spath_list = [];

        spath_lista = a[3]
        spath_listb = b[3]

        min_dist = 0

        if a[1] < b[1]:
            spath_list = spath_lista
            min_dist = a[1]
        elif a[1] > b[1]:
            spath_list = spath_listb
            min_dist = b[1]
        else:
            # both are at the same distance --> merge shortest path lists!
            spath_list = spath_lista + spath_listb
            min_dist = a[1]

        # construct tuple
        res = (a[0] if len(a[0]) > len(b[0]) else b[0], min_dist, min(a[2], b[2]), spath_list)

        # return a tuple of 3 entries
        return res
    
    def countGrayNodes(x, gray_accum):
        # inc count of remaining gray nodes by 1!
        if x[1][2] == 1:
            gray_accum.add(1) 

        return x

    # helper functions END
    
    # set num_gray_nodes to 1 to start loop (finished when all nodes are visited)
    num_remaining_gray_nodes = 1;
    num_visited_nodes = 0;
    
    counter = 0
    gray_accum = 0
    while num_remaining_gray_nodes > 0 and counter < max_iter:
    
        # (1) set accumulator for gray nodes to zero
        gray_accum = context.accumulator(0)

        # filter for gray nodes to make it faster
        # ==> look for http://datascience.stackexchange.com/questions/5667/spark-optimally-splitting-a-single-rdd-into-two

        # split dataset into one of all the gray nodes (the ones to visit next) and the remaining ones
        rddGray = rdd.filter(lambda x: x[1][2] == 1)
        rddRest = rdd.filter(lambda x: x[1][2] != 1)

        # (2) start map process
        rddGray = rddGray.flatMap(expandNode)

        rdd = rddRest.union(rddGray)

        # (3) then reduce by key
        rdd = rdd.reduceByKey(reduceNodes)

        # (4) map to count gray nodes
        rdd = rdd.map(functools.partial(countGrayNodes, gray_accum=gray_accum))
        
        # (5) use here a flatmap to solve for all shortest paths (applies only)
        rdd = rdd.flatMap(lambda x: [(x[0], (x[1][0], x[1][1], x[1][2], y)) for y in x[1][3]])
        
        # (6) check if target node vT was reached
        rdd_containing_vT = rdd.filter(lambda x: x[0] == vT and x[1][2] == 1)
        
        # call count to evaluate accumulator correctly
        rdd.count()
        
        # save value of gray node accumulator
        num_remaining_gray_nodes = gray_accum.value
        num_visited_nodes += num_remaining_gray_nodes 
        
        # leave if target was reached
        if rdd_containing_vT.count() > 0:
            counter = max_iter
            break
            
        counter += 1
    
    # map rdd to shortest paths, i.e. return list of paths
    #(first is the start, last is the target (can be also left away, as user has this info already))
    
    rdd = rdd.filter(lambda x: x[0] == vT).map(lambda x: x[1][3] + [x[0]])
    
    # return number of visited nodes and the rdd
    return num_visited_nodes, rdd

## Test Data section

In [28]:
# use the datasets from P4 as test run...

# this function prepares the rdd
def prepare_rdd(context, filename):
    rdd = context.textFile(filename)

    # map string to tuples
    rdd = rdd.map(lambda x: x.split(','))
    rdd = rdd.map(lambda x: (int(x[0]), int(x[1])))
    
    # now group s.t. we have for each vertex an adjacency list of nodes
    rdd = rdd.groupByKey().map(lambda x: (x[0], list(x[1])))
    
    return rdd

In [29]:
import csv

# load vertex dictionary (is basically the character dictionary inverted)
reader = csv.reader(open('characters.csv', 'rb'))
vertexDict = dict(reader)

keys = vertexDict.values()
values = [int(k) for k in vertexDict.keys()]
characterDict = dict(zip(keys, values))



In [30]:
# test code
filename = 'edge_list_simple.csv'#'edge_list_simple.csv' # 'edge_list.csv'

rdd = prepare_rdd(sc, filename)

num_visited_nodes, rdd = sparkSSSP(sc, rdd, 1, 2, 20)

rdd.collect()

[[1, 2]]

In [31]:
num_visited_nodes

2

In [6]:
imaxvalue = sys.maxint
v0 = 1
filename = 'edge_list_simple.csv'#'edge_list_simple.csv' # 'edge_list.csv'
context = sc

rdd = prepare_rdd(sc, filename)

rdd = rdd.map(lambda x: (x[0], x[1], 0 if x[0] == v0 else imaxvalue, 1 if x[0] == v0 else 2))
    
# map to (K, V) form
rdd = rdd.map(lambda x: (x[0], (x[1], x[2], x[3], [])))
    
def expandNode(x):
    if x[1][2] == 1: # 'GRAY'
    # set current node to visited
        res = []
        res.append( (x[0], (x[1][0], x[1][1], 0, [x[1][3]])) ) # 'BLACK'

        # spawn new GRAY nodes
        for i in range(0, len(x[1][0])):
            res.append( (x[1][0][i], ([], x[1][1] + 1, 1, [x[1][3] + [x[0]]])) ) # 'GRAY'

        return tuple(res)
    else: 
        return [x]

# in the next step we combine all tuples for the same key returning
# the minimum distance, longest adjacency list and darkest color
# and minimum path
# the algorithm will determine if there is no gray node left
def reduceNodes(a, b):
    
    # simple solution for only some shortest path
    #res = (a[0] if len(a[0]) > len(b[0]) else b[0], min(a[1], b[1]), min(a[2], b[2]), a[3] if a[1] < b[1] else b[3])

    #solution for multiple shortest paths is more complicated ==> we store them in a list!
    spath_list = [];
    
    spath_lista = a[3]
    spath_listb = b[3]
    
    min_dist = 0
    
    if a[1] < b[1]:
        spath_list = spath_lista
        min_dist = a[1]
    elif a[1] > b[1]:
        spath_list = spath_listb
        min_dist = b[1]
    else:
        # both are at the same distance --> merge shortest path lists!
        spath_list = spath_lista + spath_listb
        min_dist = a[1]
        
    # construct tuple
    res = (a[0] if len(a[0]) > len(b[0]) else b[0], min_dist, min(a[2], b[2]), spath_list)

    # return a tuple of 3 entries
    return res

def countGrayNodes(x, gray_accum):
    # inc count of remaining gray nodes by 1!
    if x[1][2] == 1:
        gray_accum.add(1) 

    return x

# helper functions END

# set num_gray_nodes to 1 to start loop (finished when all nodes are visited)
num_remaining_gray_nodes = 1;
num_visited_nodes = 0;

counter = 0
gray_accum = 0


# (1) set accumulator for gray nodes to zero
gray_accum = context.accumulator(0)

# filter for gray nodes to make it faster
# ==> look for http://datascience.stackexchange.com/questions/5667/spark-optimally-splitting-a-single-rdd-into-two

# split dataset into one of all the gray nodes (the ones to visit next) and the remaining ones
rddGray = rdd.filter(lambda x: x[1][2] == 1).cache()
rddRest = rdd.filter(lambda x: x[1][2] != 1)

# (2) start map process
rddGray = rddGray.flatMap(expandNode)

rdd = rddRest.union(rddGray).cache()

# (3) then reduce by key
rdd = rdd.reduceByKey(reduceNodes).cache()

# (4) map to count gray nodes
rdd = rdd.map(functools.partial(countGrayNodes, gray_accum=gray_accum))

# (5) use here a flatmap to solve for all shortest paths (applies only)
rdd = rdd.flatMap(lambda x: [(x[0], (x[1][0], x[1][1], x[1][2], y)) for y in x[1][3]])



# call count to evaluate accumulator correctly
rdd.count()

# save value of gray node accumulator
num_remaining_gray_nodes = gray_accum.value
num_visited_nodes += num_remaining_gray_nodes 
        
rdd.collect()
#test =rdd.take(5)[3]
#test

#expandNode(test)

[(1, ([2, 5], 0, 0, [])), (5, ([], 1, 1, [1])), (2, ([], 1, 1, [1]))]

In [27]:
num_visited_nodes

6

In [28]:
num_visited_nodes, rdd = sparkBFS(sc, rdd, 1)

In [29]:
rdd.collect()

[(2, ([1, 5, 3, 4], 1, 0, 1)),
 (3, ([], 2, 0, [1, 5])),
 (3, ([], 2, 0, [1, 2])),
 (4, ([], 2, 0, [1, 5])),
 (4, ([], 2, 0, [1, 2])),
 (5, ([4, 1, 2, 3], 1, 0, 1))]

In [10]:
v0 = [characterDict['CAPTAIN AMERICA'], characterDict['MISS THING/MARY'], characterDict['ORWELL']]

In [11]:
v0

[15, 1621, 3430]

In [None]:
v0 = [characterDict['CAPTAIN AMERICA'], characterDict['MISS THING/MARY'], characterDict['ORWELL']]
filename = 'edge_list.csv'
#for i in range(0, len(v0)):
i = 0
rdd = prepare_rdd(sc, filename)
num_visited_nodes, rdd = sparkBFS(sc, rdd, v0[i])

print('%s : %d nodes visited' % (vertexDict[str(v0[i])], num_visited_nodes))

In [None]:
rdd.collect()