# Problem 4

In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.ticker as ticker   


# Configure the application name and start the Spark context that every RDD
# in this notebook is created from.
conf = SparkConf()
conf.setAppName('Graph Processing')
sc = SparkContext(conf=conf)

In [2]:
import pandas as pd

In [3]:
# Load the (character, comic issue) pairs. Column names are supplied via
# `names=`, so pandas treats the first line of the file as data, not a header.
comicdf = pd.read_csv('source.csv', names=['Character', 'ComicIssue'])
# NOTE(review): the GroupBy object is only displayed here, it is never stored.
comicdf.groupby('ComicIssue')

<pandas.core.groupby.DataFrameGroupBy object at 0x10d8b47d0>

In [4]:
# distinct character names (one entry per character)
keys = comicdf['Character'].unique()

In [5]:
# integer vertex ids 1..len(keys), one per distinct character
values = range(1, len(keys)+1) # start indices with 1

In [6]:
# lookup table: character name -> integer vertex id
characterDict = {name: vertex_id for name, vertex_id in zip(keys, values)}

In [7]:
# sanity check: show the vertex id assigned to one character
characterDict['FROST, CARMILLA']

1

In [8]:
# Build the edge list: two characters are connected when they appear in the
# same comic issue, so self-join comicdf on the issue column. The merge
# suffixes the two name columns as Character_x / Character_y.
# NOTE(review): a pair that shares several issues yields one row per shared
# issue, so the edge list can contain duplicate edges.
mergeddf = pd.merge(comicdf, comicdf, how='inner', on='ComicIssue')

# Drop the join key and remove self-loops (Character_x == Character_y).
# The axis is passed by keyword: newer pandas releases reject the positional
# form `drop('ComicIssue', 1)`.
filtereddf = mergeddf.drop('ComicIssue', axis=1)
filtereddf = filtereddf[filtereddf['Character_x'] != filtereddf['Character_y']]

In [9]:
# Replace every character name by its integer vertex id; integers are cheaper
# to ship and compare than strings.
edgedf = filtereddf.applymap(lambda name: characterDict[name])

In [10]:
import csv

# save the edge list and the dictionary as two separate csv's
edgedf.to_csv('edge_list.csv', header=False, index=False)

# Write one (vertex id, character name) row per character, ordered by id, so
# the vertex id becomes the key of the stored table.
entries = sorted(characterDict.items(), key=lambda item: item[1])

# The with-block closes the file, which guarantees all rows are flushed to
# disk before a later cell reads 'characters.csv' back in.
# 'wb' is the mode Python 2's csv module expects for writing.
with open('characters.csv', 'wb') as characters_file:
    writer = csv.writer(characters_file)
    for key, value in entries:
        writer.writerow([value, key]) # flip it (so the vertex index is now the key)

In [11]:
# Load the vertex dictionary (basically the character dictionary inverted:
# vertex id -> character name). The with-block closes the file handle once
# the rows have been consumed.
# NOTE(review): csv yields strings, so the keys here are '1', '2', ... and not
# the integers used as vertex ids in the RDD -- confirm that is intended.
with open('characters.csv', 'rb') as characters_file:
    reader = csv.reader(characters_file)
    vertexDict = dict(reader)

In [12]:
# BFS as iterative map/reduce, implemented after
# http://www.johnandcailin.com/blog/cailin/breadth-first-graph-search-using-iterative-map-reduce-algorithm
# Load the edge list into Spark (one text line per record).
# NOTE(review): this reads 'edge_list_simple.csv', not the 'edge_list.csv'
# written above -- presumably a small hand-made test graph; confirm.
rdd = sc.textFile('edge_list_simple.csv')

# To avoid the overhead of checking for gray nodes via collect() on every
# iteration, gray nodes are counted with an accumulator.
num_gray_nodes = sc.accumulator(0)

In [13]:
# split each text line at the commas into a list of string fields
rdd = rdd.map(lambda line: line.split(','))

In [14]:
# turn the first two fields into an integer (source, target) edge tuple
rdd = rdd.map(lambda fields: (int(fields[0]), int(fields[1])))

In [15]:
# peek at the first five parsed edges
rdd.take(5)

[(1, 2), (1, 5), (2, 1), (2, 5), (2, 3)]

In [16]:
# now group s.t. we have for each vertex an adjacency list of nodes
rdd = rdd.groupByKey().map(lambda x: (x[0], list(x[1])))

In [17]:
# peek at the first five adjacency records
rdd.take(5)

[(2, [1, 5, 3, 4]), (4, [2, 5, 3]), (6, [7]), (1, [2, 5]), (3, [2, 4])]

In [18]:
import sys
import functools

# --- BFS initialisation ---
v0 = 1 # start vertex

# Sentinel "infinite" distance for vertices not reached yet. sys.maxint only
# exists on Python 2; fall back to sys.maxsize so the cell also runs on
# Python 3.
imaxvalue = getattr(sys, 'maxint', sys.maxsize)

# Currently the rdd holds (v, [v1, v2, ...]). Map it to
#   (v, <adj. list>, disttov0, color)
# where disttov0 is 0 for v == v0 and imaxvalue otherwise, and color is GRAY
# for v0 and WHITE for every other vertex.
#   WHITE: vertex not visited yet
#   GRAY:  vertex is visited in the next hop
#   BLACK: vertex already visited
#
# Colors are encoded as integers so the "darkest" color is simply the minimum:
#   WHITE = 2
#   GRAY  = 1
#   BLACK = 0
rdd = rdd.map(lambda x: (x[0], x[1], 0 if x[0] == v0 else imaxvalue, 1 if x[0] == v0 else 2))

In [19]:
# peek at the initialised (vertex, adjacency, distance, color) records
rdd.take(5)

[(2, [1, 5, 3, 4], 9223372036854775807, 2),
 (4, [2, 5, 3], 9223372036854775807, 2),
 (6, [7], 9223372036854775807, 2),
 (1, [2, 5], 0, 1),
 (3, [2, 4], 9223372036854775807, 2)]

In [20]:
# One BFS hop: every GRAY record is expanded.
# Given (v, [v_1, ..., v_d], dist, GRAY) the expansion emits
#   (v,   [v_1, ..., v_d], dist,     BLACK)
#   (v_i, [],              dist + 1, GRAY)     for every neighbour v_i
# Records of any other color pass through untouched. A following reduceByKey
# merges the records that share a vertex (distances / adjacency lists).
def expandNode(x):
    """Expand one (vertex, adjacency, distance, color) record for a BFS hop.

    Returns a tuple holding the blackened record followed by one gray record
    per neighbour when `x` is gray (color 1); otherwise returns `[x]`.
    """
    if x[3] != 1:
        # not on the frontier: keep the record as it is
        return [x]

    vertex, neighbours, dist = x[0], x[1], x[2]

    # the vertex itself is now visited ...
    visited = (vertex, neighbours, dist, 0)  # 'BLACK'
    # ... and each neighbour becomes a frontier candidate one step further out
    frontier = [(neighbour, [], dist + 1, 1) for neighbour in neighbours]  # 'GRAY'

    return tuple([visited] + frontier)
    
# apply one hop: flatten the per-record expansions into a single RDD
rdd = rdd.flatMap(expandNode)

In [21]:
# peek at the records after one expansion step
rdd.take(10)

[(2, [1, 5, 3, 4], 9223372036854775807, 2),
 (4, [2, 5, 3], 9223372036854775807, 2),
 (6, [7], 9223372036854775807, 2),
 (1, [2, 5], 0, 0),
 (2, [], 1, 1),
 (5, [], 1, 1),
 (3, [2, 4], 9223372036854775807, 2),
 (5, [4, 1, 2], 9223372036854775807, 2),
 (7, [6], 9223372036854775807, 2)]

In [22]:
# Merge step: all records of one vertex are combined into a single record
# holding the longest adjacency list, the minimum distance and the darkest
# (numerically smallest) color. Gray results are tallied in the global
# accumulator so the driver can tell whether any gray node is left.
def reduceNodes(a, b):
    """Combine two (adjacency, distance, color) values of the same vertex.

    Side effect: adds 1 to the global `num_gray_nodes` accumulator whenever
    the merged color is gray (1).

    NOTE(review): reduceByKey calls this once per *pair* of values, so a
    vertex with three records can be tallied twice and a vertex with a single
    record is never tallied -- treat the accumulator as "some gray left"
    rather than an exact count.
    """
    # ties (equal length) resolve to b's adjacency list
    adjacency = b[0] if len(a[0]) <= len(b[0]) else a[0]
    distance = min(a[1], b[1])
    color = min(a[2], b[2])
    merged = (adjacency, distance, color)

    if merged[2] == 1:
        num_gray_nodes.add(1) # one more gray node seen

    # a tuple of 3 entries
    return merged
    
    
# Re-key every record as (vertex, (adjacency, distance, color)) and merge all
# records that share a vertex.
rdd = (rdd
       .map(lambda rec: (rec[0], (rec[1], rec[2], rec[3])))
       .reduceByKey(reduceNodes))

In [23]:
# peek at the merged records (this action also runs the reduce step)
rdd.take(5)

[(2, ([1, 5, 3, 4], 1, 1)),
 (4, ([2, 5, 3], 9223372036854775807, 2)),
 (6, ([7], 9223372036854775807, 2)),
 (1, ([2, 5], 0, 0)),
 (3, ([2, 4], 9223372036854775807, 2))]

In [24]:
# gray-node tally collected by reduceNodes during the action above
num_gray_nodes.value

2

In [38]:
# The whole BFS as one function: given an rdd whose elements are
# (v, [v_1, ..., v_d]) and a start node v0, hop until no gray node is left.
def sparkBFS(context, rdd, v0):
    """Breadth-first search from `v0` by iterative map/reduce.

    Parameters
    ----------
    context : the SparkContext that owns `rdd`. Kept for interface
        compatibility; the frontier is now counted with an RDD action instead
        of an accumulator, so the context is not used.
    rdd : RDD of (vertex, [neighbour, ...]) adjacency records.
    v0 : start vertex.

    Returns
    -------
    (num_visited_nodes, rdd)
        num_visited_nodes is the number of vertices reached from `v0`,
        including `v0` itself when it occurs in `rdd`.
        rdd holds (vertex, (adjacency, distance, color)) records; unreached
        vertices keep the sentinel distance and color WHITE.
    """
    # Colors are integers so that "darkest color" is just min():
    #   WHITE: not visited yet, GRAY: visited in the next hop, BLACK: visited
    WHITE, GRAY, BLACK = 2, 1, 0

    # Sentinel distance for unreached vertices. sys.maxint is Python 2 only,
    # sys.maxsize is the portable fallback.
    imaxvalue = getattr(sys, 'maxint', sys.maxsize)

    def initNode(x):
        # (v, adj) -> (v, (adj, dist, color)); only v0 starts gray at distance 0
        is_start = x[0] == v0
        return (x[0], (x[1],
                       0 if is_start else imaxvalue,
                       GRAY if is_start else WHITE))

    def expandNode(x):
        # (v, ([v_1..v_d], d, GRAY)) becomes
        #   (v,   ([v_1..v_d], d,   BLACK))
        #   (v_i, ([],         d+1, GRAY))   for every neighbour v_i
        # every other record passes through unchanged
        vertex, (neighbours, dist, color) = x
        if color != GRAY:
            return [x]
        expanded = [(vertex, (neighbours, dist, BLACK))]
        expanded.extend((n, ([], dist + 1, GRAY)) for n in neighbours)
        return expanded

    def reduceNodes(a, b):
        # longest adjacency list, minimum distance, darkest color
        return (a[0] if len(a[0]) > len(b[0]) else b[0],
                min(a[1], b[1]),
                min(a[2], b[2]))

    def isGray(x):
        return x[1][2] == GRAY

    # Cache each generation: it is read twice (frontier count + next hop) and
    # would otherwise be recomputed from the very first RDD every iteration.
    rdd = rdd.map(initNode).cache()

    # The frontier is counted with filter().count() on the reduced RDD. An
    # accumulator inside the reduce function miscounts: the function runs
    # once per pair of values, not once per key, and never for keys that
    # have a single record.
    num_remaining_gray_nodes = rdd.filter(isGray).count()

    # Every reached vertex is gray in exactly one generation, so summing the
    # frontier sizes (start vertex included) yields the number of visited nodes.
    num_visited_nodes = num_remaining_gray_nodes

    while num_remaining_gray_nodes > 0:
        previous = rdd

        # one hop: expand the frontier, then merge the records per vertex
        rdd = previous.flatMap(expandNode).reduceByKey(reduceNodes).cache()

        # the count materialises (and caches) the new generation
        num_remaining_gray_nodes = rdd.filter(isGray).count()
        num_visited_nodes += num_remaining_gray_nodes

        # the old generation is no longer needed
        previous.unpersist()

    # number of visited nodes and the final rdd
    return num_visited_nodes, rdd

In [39]:
# this function prepares the rdd
def prepare_rdd(filename, context=None):
    """Load an edge-list file and turn it into adjacency records.

    Parameters
    ----------
    filename : path of a text file with one edge per line, written as two
        comma separated integers (source, target).
    context : SparkContext used to read the file. Defaults to the
        notebook-global `sc`, which keeps existing one-argument calls working.

    Returns
    -------
    RDD of (vertex, [neighbour, ...]) records, one per source vertex.
    """
    if context is None:
        context = sc  # fall back to the notebook's global SparkContext

    def parseEdge(line):
        # "src,dst" -> (int(src), int(dst))
        fields = line.split(',')
        return (int(fields[0]), int(fields[1]))

    edges = (context.textFile(filename)
             .filter(lambda line: line.strip())  # skip blank lines, int('') would raise
             .map(parseEdge))

    # group so that each vertex carries the list of its neighbours
    return edges.groupByKey().map(lambda x: (x[0], list(x[1])))

In [40]:
# Test run on the small hand-made graph; switch to 'edge_list.csv' for the
# full comic graph.
filename = 'edge_list_simple.csv' # 'edge_list.csv'

# (vertex, [neighbours]) records
rdd = prepare_rdd(filename)

# The former mid-cell `rdd.take(5)` is dropped: its result was neither
# displayed nor stored, it only triggered an extra Spark job.

# BFS from vertex 1; `rdd` is rebound to the BFS result records
num_visited_nodes, rdd = sparkBFS(sc, rdd, 1)

In [41]:
# show every (vertex, (adjacency, distance, color)) record of the BFS result;
# collect() pulls the whole RDD to the driver, fine for the small test graph
rdd.collect()

[(2, ([1, 5, 3, 4], 1, 0)),
 (4, ([2, 5, 3], 2, 0)),
 (6, ([7], 9223372036854775807, 2)),
 (1, ([2, 5], 0, 0)),
 (3, ([2, 4], 2, 0)),
 (5, ([4, 1, 2], 1, 0)),
 (7, ([6], 9223372036854775807, 2))]

In [42]:
# visited-node count returned by sparkBFS
num_visited_nodes

5