In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
sc = pyspark.SparkContext(appName="Spark1")

In [3]:
# (K, V) is (issue, Character)
comics = sc.textFile('source.csv').map(lambda line: (line.split('"')[3], line.split('"')[1]))


In [4]:
# Join version
edges = comics.join(comics).map(lambda x: (x[1][0],
                                x[1][1])).filter(lambda x: x[0] != x[1])
graph = edges.groupByKey().map(lambda x: (x[0], set(list(x[1]))))


In [5]:
graph.take(1)

[(u'QUESADA, JOE',
  {u"BLACK PANTHER/T'CHAL",
   u'BLACK WIDOW/NATASHA ',
   u'BLAKE, BECKY',
   u'BUTCH',
   u'CAPTAIN AMERICA',
   u'DAKESIAN, NANCI',
   u'DAREDEVIL/MATT MURDO',
   u'DARLA',
   u'DR. STRANGE/STEPHEN ',
   u'EIGHTBALL',
   u'EVERETT, BILL',
   u'GLADIATOR/MELVIN POT',
   u'HUMAN TORCH/JOHNNY S',
   u'INVISIBLE WOMAN/SUE ',
   u'LAMY, KELLY',
   u'LEE, STAN',
   u'MALICE V/NAKIA',
   u'MARTINEZ, ALITHA',
   u'MCKENZIE, LYDIA',
   u'MILLER, FRANK',
   u'MR. FANTASTIC/REED R',
   u'MYSTERIO/QUENTIN BEC',
   u'NELSON, CANDACE',
   u'NELSON, FRANKLIN FOG',
   u'OKOYE',
   u'OSBORN, LIZ ALLAN',
   u'PAGE, KAREN',
   u'PALMIOTTI, JIMMY',
   u'POTTER, BETSY BEATTY',
   u'QUINN, ASHLEY',
   u'RALF',
   u'ROSS, EVERETT KENNET',
   u'SHARPE, ROSALINDE',
   u'SMITH, KEVIN',
   u'SPIDER-MAN/PETER PAR',
   u'THING/BENJAMIN J. GR',
   u'URICH, BEN',
   u'WATSON-PARKER, MARY ',
   u'ZURI'})]

In [6]:
def ss_bfs3(graph, root):
    '''
    Return a (K, (d, [neighbors])) rdd with the nodes as keys and d distance
    from the root as value (d = inf is not connected to the root) and its
    neighbors list in the graph [neighbors].
    '''
    # Initialization
    # Adding a distance in the entry graph
    bfs_graph = graph.map(lambda x: (x[0], (float('inf'), x[1])))
    # WARNING: the value need to be a tuple
    current = bfs_graph.filter(lambda x: x[0] == root)\
        .map(lambda x: (x[0], (0, x[1][1])))
    i = 0
    while True:
        # Visiting the neighbors of the Nodes in the subgraph current and
        # updating their distance.
        # Redundancy may happen, hence the use of distinct()
        visiting = current.flatMap(lambda x: [(n, (x[1][0] + 1, 1))
                                   for n in list(x[1][1])]).distinct()
        # Updating the current graph
        bfs_graph = bfs_graph.union(visiting).groupByKey()\
            .map(lambda x: (x[0], (min([v[0] for v in list(x[1])]),
                 list(x[1])[0][1])))
        # Need to update the indices before using it in spark
        i += 1
        # Getting the actual visiting nodes (not previously visited)
        current = bfs_graph.filter(lambda x: x[1][0] == i)
        # Check that nodes are left
        if not current.count():
            break
    return bfs_graph


In [7]:
roots = ['CAPTAIN AMERICA', 'MISS THING/MARY', 'ORWELL']

In [9]:
bfs = ss_bfs3(graph, roots[0])

In [16]:
import time
start = time.time()
short = bfs.filter(lambda x: x[1][0] < float('inf'))
sh = short.collect()
print len(sh)
end = time.time() - start

6408


In [18]:
roots = ['CAPTAIN AMERICA', 'MISS THING/MARY', 'ORWELL']
for root in roots:
    %time bfs = ss_bfs3(graph, root)
    %time distance = bfs.filter(lambda x: x[1][0] < float('inf'))
    %time num_node_visited = len(distance.collect())
    # Substract the root
    print('{} nodes visited for the character {}'.format(num_node_visited - 1,
          root))


CPU times: user 129 ms, sys: 28.1 ms, total: 157 ms
Wall time: 7.92 s
CPU times: user 31 µs, sys: 2 µs, total: 33 µs
Wall time: 42 µs
CPU times: user 230 ms, sys: 26.9 ms, total: 257 ms
Wall time: 945 ms
6407 nodes visited for the character CAPTAIN AMERICA
CPU times: user 81.8 ms, sys: 16.3 ms, total: 98 ms
Wall time: 4.1 s
CPU times: user 51 µs, sys: 3 µs, total: 54 µs
Wall time: 61 µs
CPU times: user 9.13 ms, sys: 2.01 ms, total: 11.1 ms
Wall time: 386 ms
6 nodes visited for the character MISS THING/MARY
CPU times: user 94.4 ms, sys: 16.9 ms, total: 111 ms
Wall time: 4.33 s
CPU times: user 48 µs, sys: 2 µs, total: 50 µs
Wall time: 58.9 µs
CPU times: user 8.64 ms, sys: 2.13 ms, total: 10.8 ms
Wall time: 333 ms
8 nodes visited for the character ORWELL


In [21]:
def connected_components(graph):
    '''
    Computes the number of connected components in the graph.
    Return (N, top) with N the number of connected components and top the
    number of nodes in the largest connected component.
    '''
    # Copy the graph
    current = graph.filter(lambda x: True)
    # Debugg
    print('current subgraph count is {}'.format(current.count()))

    # Initialization of the return variables
    N = 0
    top = 0

    while current.count():
        root = current.keys().take(1)[0]
        # Debugg
        print('current root is {}'.format(root))
        # Computing next connected component
        graph_with_component = ss_bfs3(current, root)
        component = graph_with_component\
                  .filter(lambda x: x[1][0] < float('inf'))
        # Updating return
        N += 1
        m = component.count()
        top = max(top, m)
        # Debugg
        print('current component count is {}'.format(m))

        # Filtering the visited nodes: filter return false for the node x in
        # the component subgraph
        current = graph_with_component.filter(lambda x: x[1][0] == float('inf'))
        # Debugg
        print('current subgraph count is {}'.format(current.count()))

    return (N, top)

In [22]:
%time comp = connected_components(graph)

current subgraph count is 6426
current root is [u'QUESADA, JOE']
current component count is 0
current subgraph count is 6426
current root is [u'QUESADA, JOE']
current component count is 0
current subgraph count is 6426
current root is [u'QUESADA, JOE']
current component count is 0
current subgraph count is 6426
current root is [u'QUESADA, JOE']
current component count is 0
current subgraph count is 6426
current root is [u'QUESADA, JOE']


KeyboardInterrupt: 