In [1]:
import numpy as np
import findspark
findspark.init()

import pyspark
# Reuse the already-running context when this cell is executed again:
# constructing a second SparkContext in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("Spark1"))


In [3]:
# Parameters
partitions = 4

# (K, V) is (issue, Character)
# Each line is split on the double quote once; field 1 is used as the
# character and field 3 as the issue.
comics = sc.textFile('source.csv')\
    .map(lambda line: line.split('"'))\
    .map(lambda fields: (fields[3], fields[1]))

# Join version
# Joining the rdd with itself on the issue pairs up the characters sharing
# an issue; the issue key is then dropped and self-pairs are removed.
pairs = comics.join(comics).map(lambda joined: joined[1])
edges = pairs.filter(lambda pair: pair[0] != pair[1])
# One record per character, holding the set of its distinct neighbors.
graph = edges.groupByKey().mapValues(set)\
    .partitionBy(partitions).cache()


In [6]:
# Version 3
# SS-BFS Algorithm: only one rdd without collecting/diameter
# Shuffle optimization: the per-node merge uses reduceByKey() instead of
# groupByKey()
def ss_bfs3(bfs_graph, root):
    '''
    Single-source breadth-first search from root.

    Return a (K, (d, neighbors)) rdd with the nodes as keys, d the distance
    from the root (d = 0 for the root itself, d = inf if the node is not
    connected to the root) and neighbors the adjacency collection of the
    node, carried over unchanged from the entry graph.
    Entry graph should be formatted as (K, (d, neighbors)), with d = inf for
    all the nodes on a first call to ss_bfs3.
    A node that only appears as a neighbor (no record of its own in the
    entry graph) is returned with None as neighbors.
    '''
    def closest(left, right):
        # Keep the smallest distance seen for a node. Records emitted while
        # visiting carry None instead of neighbors, so the real adjacency is
        # kept whatever order the two records are merged in.
        neighbors = left[1] if left[1] is not None else right[1]
        return (min(left[0], right[0]), neighbors)

    # Record the root at distance 0 in the graph itself; otherwise it is
    # only labelled when one of its neighbors visits it back (distance 2).
    bfs_graph = bfs_graph.map(
        lambda kv: (kv[0], (0, kv[1][1])) if kv[0] == root else kv)
    # The frontier starts with the root alone.
    current = bfs_graph.filter(lambda kv: kv[0] == root)
    i = 0
    while True:
        # Visiting the neighbors of the nodes in the frontier: each one is
        # offered the distance of its visitor plus one.
        visiting = current.flatMap(
            lambda kv: [(n, (kv[1][0] + 1, None)) for n in (kv[1][1] or ())])
        # Updating the graph: one record per node, with its smallest distance.
        # Duplicated offers are merged by the same reduction.
        bfs_graph = bfs_graph.union(visiting).reduceByKey(closest)
        i += 1
        # Getting the newly reached nodes (not previously visited). The depth
        # is bound as a default argument so the lambda keeps this value.
        current = bfs_graph.filter(lambda kv, depth=i: kv[1][0] == depth)
        # Stop once no new node has been reached
        if not current.count():
            break
    return bfs_graph

In [13]:
# Version3
roots = ['CAPTAIN AMERICA', 'MISS THING/MARY', 'ORWELL']
# Initialization
# Adding a distance in the entry graph
bfs_graph = graph.mapValues(lambda v: (float('inf'), v))
for root in roots:
    %time bfs = ss_bfs3(bfs_graph, root)
    distance = bfs.filter(lambda x: x[1][0] < float('inf'))
    num_node_visited = distance.count()
    # Substract the root
    print('{} nodes visited for the character {}'.format(num_node_visited - 1,
          root))

[(u'QUESADA, JOE', (1, set([u'ZURI', u'BLACK WIDOW/NATASHA ', u'SMITH, KEVIN', u'OKOYE', u'RALF', u'PALMIOTTI, JIMMY', u'BLAKE, BECKY', u'MYSTERIO/QUENTIN BEC', u'WATSON-PARKER, MARY ', u'SPIDER-MAN/PETER PAR', u'MCKENZIE, LYDIA', u'DR. STRANGE/STEPHEN ', u'LAMY, KELLY', u'URICH, BEN', u"BLACK PANTHER/T'CHAL", u'MR. FANTASTIC/REED R', u'THING/BENJAMIN J. GR', u'QUINN, ASHLEY', u'CAPTAIN AMERICA', u'MALICE V/NAKIA', u'NELSON, CANDACE', u'BUTCH', u'EIGHTBALL', u'DAREDEVIL/MATT MURDO', u'ROSS, EVERETT KENNET', u'POTTER, BETSY BEATTY', u'GLADIATOR/MELVIN POT', u'EVERETT, BILL', u'DAKESIAN, NANCI', u'SHARPE, ROSALINDE', u'NELSON, FRANKLIN FOG', u'OSBORN, LIZ ALLAN', u'HUMAN TORCH/JOHNNY S', u'MARTINEZ, ALITHA', u'MILLER, FRANK', u'DARLA', u'INVISIBLE WOMAN/SUE ', u'PAGE, KAREN', u'LEE, STAN'])))]
[(u'SARACEN', (2, set([u'HOFFMAN, ALICE', u'MICROCHIP/LINUS LIEB', u'PUNISHER II/FRANK CA'])))]
[(u'24-HOUR MAN/EMMANUEL', (3, set([u'KILLRAVEN/JONATHAN R', u'OLD SKULL', u"M'SHULLA", u'FROST, CARM

In [14]:
def ss_bfs3(bfs_graph, root, partitions=4):
    '''
    Single-source breadth-first search from root.

    Return a (K, (d, neighbors)) rdd with the nodes as keys, d the distance
    from the root (d = 0 for the root itself, d = inf if the node is not
    connected to the root) and neighbors the adjacency collection of the
    node, carried over unchanged from the entry graph.
    Entry graph should be formatted as (K, (d, neighbors)), with d = inf for
    all the nodes on a first call to ss_bfs3.
    partitions is the number of partitions requested for the per-node merge
    performed at each step.
    A node that only appears as a neighbor (no record of its own in the
    entry graph) is returned with None as neighbors.
    '''
    def closest(left, right):
        # Keep the smallest distance seen for a node. Records emitted while
        # visiting carry None instead of neighbors, so the real adjacency is
        # kept whatever order the two records are merged in.
        neighbors = left[1] if left[1] is not None else right[1]
        return (min(left[0], right[0]), neighbors)

    def seed(kv):
        # The root is stored at distance 0; every other record is untouched.
        if kv[0] == root:
            return (kv[0], (0, kv[1][1]))
        return kv

    # Record the root at distance 0 in the graph itself; otherwise it is
    # only labelled when one of its neighbors visits it back (distance 2).
    # Keys are unchanged, so the existing partitioning stays valid.
    bfs_graph = bfs_graph.map(seed, preservesPartitioning=True)
    # The frontier starts with the root alone.
    current = bfs_graph.filter(lambda kv: kv[0] == root)
    i = 0
    while True:
        # Visiting the neighbors of the nodes in the frontier: each one is
        # offered the distance of its visitor plus one.
        visiting = current.flatMap(
            lambda kv: [(n, (kv[1][0] + 1, None)) for n in (kv[1][1] or ())])
        # Updating the graph: one record per node, with its smallest distance.
        # Duplicated offers are merged by the same reduction.
        bfs_graph = bfs_graph.union(visiting)\
            .reduceByKey(closest, partitions)
        i += 1
        # Getting the newly reached nodes (not previously visited). The depth
        # is bound as a default argument so the lambda keeps this value.
        current = bfs_graph.filter(lambda kv, depth=i: kv[1][0] == depth)
        # Stop once no new node has been reached
        if not current.count():
            break
    return bfs_graph

In [87]:
def get_path(x):
    '''
    Reduce a grouped (node, paths) record to (node, shortest path).

    x[1] is an iterable of candidate paths; the one with the fewest elements
    is kept, the first such candidate winning ties.
    '''
    node, candidates = x[0], x[1]
    return (node, min(candidates, key=len))


def next_path(kv, target, acc):
    '''
    Expand one visited node into the records of its neighbors.

    kv is (node, (path, neighbors)). Every neighbor is returned as
    (neighbor, path + [node]), and 1 is added to the accumulator acc each
    time a neighbor equals target.
    '''
    trail, adjacent = kv[1]
    origin = kv[0]
    expanded = []
    for neighbor in adjacent:
        if neighbor == target:
            acc.add(1)
        expanded.append((neighbor, trail + [origin]))
    return expanded


def check_target(x, acc, wanted=None):
    '''
    Add 1 to the accumulator acc when the record x is keyed by the wanted
    node.

    x is a (node, value) record; only x[0] is read.
    wanted is the node to look for. When it is omitted, the notebook-level
    variable target is used, which is what the two-argument form has always
    compared against.
    '''
    if wanted is None:
        # Legacy call check_target(x, acc): fall back on the global target
        wanted = target
    if x[0] == wanted:
        acc.add(1)

def shortest_path(graph, root, target, partitions=64):
    '''
    Search the shortest path between the root and the target of the graph.

    graph is a (node, neighbors) rdd. The result is an rdd, so that it can
    be read with collect():
    - if the two nodes are connected it holds the pair
      (target, [root, ..., last node before the target]);
    - if root and target are the same node it holds (root, []);
    - otherwise it is empty.
    partitions is the number of partitions used for the intermediate rdds.
    '''
    context = graph.context
    # Nothing to search: the path from a node to itself is empty.
    if root == target:
        return context.parallelize([(root, [])])

    # Initialization
    i = 0
    target_found = context.accumulator(0)
    response = context.parallelize([])

    def flag_target(kv):
        # Compare against the target given to this call, not against a
        # notebook-level variable.
        if kv[0] == target:
            target_found.add(1)

    # Graph used to store the next nodes to visit.
    # Format is (name, (path_from_root, neighbors))
    next_nodes = graph.filter(lambda kv: kv[0] == root)\
        .mapValues(lambda neighbors: ([], neighbors))
    # Graph used to store the path from the root for all visited nodes.
    # Format is (name, path_from_root)
    path = context.parallelize([(root, [])]).partitionBy(partitions)
    while target_found.value == 0:
        print("Loop number {}".format(i))
        # Visiting the neighbors of the nodes to visit and extending their
        # path from the root.
        visiting = next_nodes\
            .flatMap(lambda kv: next_path(kv, target, target_found))\
            .partitionBy(partitions)
        print('visiting computed')
        # Keep, for every node, the shortest path known so far.
        path = visiting.union(path)\
            .groupByKey().map(get_path).partitionBy(partitions)
        print('path computed')
        i += 1
        # Getting only the current visited nodes (not previously visited).
        # The depth is bound as a default argument so the lambda keeps this
        # value; the rdd is cached because it is read several times below.
        visiting_ = path\
            .filter(lambda kv, depth=i: len(kv[1]) == depth).cache()
        # Check if target hit (this action also runs the steps above)
        visiting_.foreach(flag_target)
        # No new node and no hit: the target cannot be reached from the
        # root, so stop instead of looping forever.
        if target_found.value == 0 and visiting_.count() == 0:
            return context.parallelize([])
        # Getting the list of the neighbors of the current nodes as values.
        next_nodes = visiting_.join(graph)
        print('next_nodes computed')
        # Path stored for the target; only meaningful once it has been hit
        response = path.filter(lambda kv: kv[0] == target)
        print('response computed')
    return response


In [88]:
# End points of the shortest-path search
root, target = 'CAPTAIN AMERICA', '24-HOUR MAN/EMMANUEL'

In [89]:
# Time one shortest-path search between the root and target chosen above.
%time path = shortest_path(graph, root, target, partitions=4)

Loop number 0
visiting computed
path computed
next_nodes computed
response computed
Loop number 1
visiting computed
path computed
next_nodes computed
response computed
Loop number 2
visiting computed
path computed
next_nodes computed
response computed
CPU times: user 86 ms, sys: 16.1 ms, total: 102 ms
Wall time: 3.06 s


In [90]:
# Bring the result of the search back to the driver to display it.
path.collect()

[(u'24-HOUR MAN/EMMANUEL', [u'CAPTAIN AMERICA', u'HAWK', u'FROST, CARMILLA'])]