In [1]:
import findspark
# Must run before the `import pyspark` cell below.
# NOTE(review): presumably this locates the local Spark install (SPARK_HOME)
# and puts pyspark on sys.path -- confirm against the environment.
findspark.init()

In [2]:
import pyspark
# Reuse the running context when this cell is executed again: building a
# second SparkContext in the same kernel raises an error, which makes the
# notebook impossible to re-run cell by cell.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("Spark1"))

In [3]:
# Parameters
partitions = 4

# Every line is cut on double quotes: field 3 is the issue, field 1 the
# character, giving a pair RDD keyed by issue -> (issue, character).
comics = sc.textFile('source.csv')\
    .map(lambda line: line.split('"'))\
    .map(lambda fields: (fields[3], fields[1]))

# Join version: the self-join on the issue pairs up characters that share an
# issue; pairs made of a character with itself are discarded.
edges = comics.join(comics)\
    .map(lambda joined: (joined[1][0], joined[1][1]))\
    .filter(lambda pair: pair[0] != pair[1])
# Adjacency sets: (character, {characters sharing at least one issue}).
graph = edges.groupByKey()\
    .map(lambda entry: (entry[0], set(entry[1])))\
    .partitionBy(partitions)\
    .cache()

In [4]:
# Peek at one (character, set of neighbours) record to sanity-check the graph.
graph.take(1)

[(u'QUESADA, JOE',
  {u"BLACK PANTHER/T'CHAL",
   u'BLACK WIDOW/NATASHA ',
   u'BLAKE, BECKY',
   u'BUTCH',
   u'CAPTAIN AMERICA',
   u'DAKESIAN, NANCI',
   u'DAREDEVIL/MATT MURDO',
   u'DARLA',
   u'DR. STRANGE/STEPHEN ',
   u'EIGHTBALL',
   u'EVERETT, BILL',
   u'GLADIATOR/MELVIN POT',
   u'HUMAN TORCH/JOHNNY S',
   u'INVISIBLE WOMAN/SUE ',
   u'LAMY, KELLY',
   u'LEE, STAN',
   u'MALICE V/NAKIA',
   u'MARTINEZ, ALITHA',
   u'MCKENZIE, LYDIA',
   u'MILLER, FRANK',
   u'MR. FANTASTIC/REED R',
   u'MYSTERIO/QUENTIN BEC',
   u'NELSON, CANDACE',
   u'NELSON, FRANKLIN FOG',
   u'OKOYE',
   u'OSBORN, LIZ ALLAN',
   u'PAGE, KAREN',
   u'PALMIOTTI, JIMMY',
   u'POTTER, BETSY BEATTY',
   u'QUINN, ASHLEY',
   u'RALF',
   u'ROSS, EVERETT KENNET',
   u'SHARPE, ROSALINDE',
   u'SMITH, KEVIN',
   u'SPIDER-MAN/PETER PAR',
   u'THING/BENJAMIN J. GR',
   u'URICH, BEN',
   u'WATSON-PARKER, MARY ',
   u'ZURI'})]

In [37]:
# Version 3
# SS-BFS algorithm: a single RDD carries the distances; nothing is collected
# on the driver and the diameter does not need to be known in advance.
def ss_bfs3(bfs_graph, root):
    '''
    Single-source breadth-first search on an adjacency-list RDD.

    Parameters:
        bfs_graph: RDD of (K, (d, neighbors)) records. d is the distance
            known so far (float('inf') for every node on a first call) and
            neighbors is an iterable of node keys.
        root: key of the node the search starts from.

    Return an RDD with the same (K, (d, neighbors)) layout where d is the
    distance from the root: 0 for the root itself, and unchanged (inf on a
    first call) for nodes that cannot be reached. A node that only appears
    in somebody's neighbors, without a record of its own, comes back with
    neighbors set to None.
    '''
    def merge(left, right):
        # Keep the smallest distance. Frontier records carry None instead of
        # an adjacency, so the real neighbors win whatever order the shuffle
        # delivers the two records in.
        neighbors = left[1] if left[1] is not None else right[1]
        return (min(left[0], right[0]), neighbors)

    # Pin the root at distance 0 in the graph itself; otherwise it would only
    # be reached back through its neighbors (distance 2), or never at all if
    # it has no neighbors.
    bfs_graph = bfs_graph.map(
        lambda x: (x[0], (0, x[1][1])) if x[0] == root else x)
    current = bfs_graph.filter(lambda x: x[0] == root)
    i = 0
    while True:
        # Visiting the neighbors of the nodes in the subgraph current and
        # proposing distance + 1 for them. Duplicates are harmless: they are
        # folded together by the reduceByKey below.
        visiting = current.flatMap(
            lambda x: [(n, (x[1][0] + 1, None)) for n in (x[1][1] or ())])
        # Updating the current graph
        bfs_graph = bfs_graph.union(visiting).reduceByKey(merge)
        i += 1
        # Getting the actual visiting nodes (not previously visited). The
        # level is bound as a default argument so the filter does not depend
        # on when Spark happens to serialize the closure.
        current = bfs_graph.filter(lambda x, level=i: x[1][0] == level)
        # Check that nodes are left
        if not current.count():
            break
    return bfs_graph

In [31]:
# Characters used as BFS starting points (the Version 3 cell below assigns
# the same list again).
roots = ['CAPTAIN AMERICA', 'MISS THING/MARY', 'ORWELL']

In [6]:
# Version 3: time ss_bfs3 from each root and report how many nodes it reaches
roots = ['CAPTAIN AMERICA', 'MISS THING/MARY', 'ORWELL']
# Initialization
# Wrap every adjacency set as (inf, neighbors), the record layout ss_bfs3
# takes as input
bfs_graph = graph.mapValues(lambda v: (float('inf'), v))
for root in roots:
    # Every root starts from the same all-inf graph
    %time bfs = ss_bfs3(bfs_graph, root)
    # Nodes whose distance is still infinite were never reached
    distance = bfs.filter(lambda x: x[1][0] < float('inf'))
    num_node_visited = distance.count()
    # Subtract the root
    print('{} nodes visited for the character {}'.format(num_node_visited - 1,
          root))



CPU times: user 118 ms, sys: 21.2 ms, total: 139 ms
Wall time: 7.8 s
6407 nodes visited for the character CAPTAIN AMERICA
CPU times: user 73 ms, sys: 12.2 ms, total: 85.2 ms
Wall time: 3.78 s
6 nodes visited for the character MISS THING/MARY
CPU times: user 80.3 ms, sys: 14.8 ms, total: 95.1 ms
Wall time: 3.87 s
8 nodes visited for the character ORWELL


In [7]:
# Version 2
# SS-BFS algorithm: nothing is collected on the driver; the distances live in
# a second RDD that is joined back to the graph at every level.
def ss_bfs2(graph, root, partitions=4):
    '''
    Single-source breadth-first search returning only the visited nodes.

    Parameters:
        graph: RDD of (K, neighbors) records, neighbors being an iterable of
            node keys.
        root: key of the node the search starts from.
        partitions: number of partitions of the distance RDD.

    Return a (K, V) rdd with the visited nodes as keys and their distance
    from the root as value. The root is always present with distance 0.
    Each level costs one count() action and one join with the graph.
    '''
    # Initialization
    # WARNING: the value need to be a tuple
    current = graph.filter(lambda x: x[0] == root)\
        .mapValues(lambda v: (0, v))
    # Build the seed on the context that owns `graph` instead of relying on
    # a notebook-level `sc` variable.
    distance = graph.context.parallelize([(root, 0)])
    i = 0
    while True:
        # Visiting the neighbors of the nodes in the subgraph current and
        # proposing distance + 1 for them. Duplicates are harmless: they are
        # folded together by the reduceByKey below.
        visiting = current.flatMap(lambda x: [(n, x[1][0] + 1)
                                   for n in x[1][1]])
        # Keep the smallest distance seen for every node; the reduce already
        # yields the requested number of partitions.
        distance = visiting.union(distance)\
            .reduceByKey(min, numPartitions=partitions)
        # Getting the actual visiting nodes (not previously visited). The
        # level is bound as a default argument so the filter does not depend
        # on when Spark happens to serialize the closure.
        visiting_ = distance.filter(lambda x, level=i + 1: x[1] == level)
        # Check that nodes are left
        if not visiting_.count():
            break
        # Getting the list of the neighbors of the current nodes as values.
        current = visiting_.join(graph)
        i += 1
    return distance
    

In [8]:
# Version 2: time ss_bfs2 from each root and report how many nodes it reaches
roots = ['CAPTAIN AMERICA', 'MISS THING/MARY', 'ORWELL']
for root in roots:
    # ss_bfs2 returns one (node, distance) record per visited node
    %time distance = ss_bfs2(graph, root)
    num_node_visited = distance.count()
    # Subtract the root
    print('{} nodes visited for the character {}'.format(num_node_visited - 1,
          root))



CPU times: user 167 ms, sys: 27.5 ms, total: 194 ms
Wall time: 3.53 s
6407 nodes visited for the character CAPTAIN AMERICA
CPU times: user 59.6 ms, sys: 11.4 ms, total: 71 ms
Wall time: 952 ms
6 nodes visited for the character MISS THING/MARY
CPU times: user 78.5 ms, sys: 13.6 ms, total: 92.1 ms
Wall time: 1.05 s
8 nodes visited for the character ORWELL


In [41]:
# ## Part B
def connected_components(graph):
    '''
    Computes the number of connected components in the graph.

    graph is an RDD of (K, neighbors) records. One ss_bfs3 search is run per
    component, each time from an arbitrary node that has not been reached
    yet, and the reached nodes are then removed.

    Return (N, top) with N the number of connected components and top the
    number of nodes in the largest connected component.
    '''
    inf = float('inf')
    # Initialization of the graph: formatting it to call ss_bfs3.
    # mapValues (rather than map) keeps the partitioner of the input graph.
    current = graph.mapValues(lambda v: (inf, v))

    # Initialization of the return variables
    N = 0
    top = 0

    while True:
        # An empty take() means every node has been assigned to a component
        seed = current.keys().take(1)
        if not seed:
            break
        root = seed[0]
        # Debug
        print('current root is {}'.format(root))
        # Computing next connected component
        graph_with_component = ss_bfs3(current, root)
        # Updating return. The root is counted explicitly and excluded from
        # the filter, so the size is right whatever distance the search left
        # on the root. root is bound as a default argument so the closure
        # does not depend on when Spark serializes it.
        m = 1 + graph_with_component.filter(
            lambda x, r=root: x[0] != r and x[1][0] < inf).count()
        N += 1
        top = max(top, m)
        # Debug
        print('current component count is {}'.format(m))

        # Filtering the visited nodes: only nodes still at infinite distance
        # stay. The root is always dropped, which guarantees that the loop
        # makes progress.
        # Call cache() to persist this stage in cache
        current = graph_with_component.filter(
            lambda x, r=root: x[0] != r and x[1][0] == inf).cache()
        # Debug
        print('current subgraph count is {}'.format(current.count()))

    return (N, top)

In [9]:
def connected_components(graph):
    '''
    Computes the number of connected components in the graph.

    graph is an RDD of (K, neighbors) records. One ss_bfs3 search is run per
    component, each time from an arbitrary node that has not been reached
    yet, and the reached nodes are then removed.

    Return (N, top) with N the number of connected components and top the
    number of nodes in the largest connected component.
    '''
    inf = float('inf')
    # Initialization of the graph: formatting it to call ss_bfs3
    current = graph.mapValues(lambda v: (inf, v))

    # Initialization of the return variables
    N = 0
    top = 0

    while True:
        # An empty take() means every node has been assigned to a component
        seed = current.keys().take(1)
        if not seed:
            break
        root = seed[0]
        # Debug
        print('current root is {}'.format(root))
        # Computing next connected component
        graph_with_component = ss_bfs3(current, root)
        # Updating return. The root is counted explicitly and excluded from
        # the filter, so the size is right whatever distance the search left
        # on the root. root is bound as a default argument so the closure
        # does not depend on when Spark serializes it.
        m = 1 + graph_with_component.filter(
            lambda x, r=root: x[0] != r and x[1][0] < inf).count()
        N += 1
        top = max(top, m)
        # Debug
        print('current component count is {}'.format(m))

        # Filtering the visited nodes: only nodes still at infinite distance
        # stay. The root is always dropped, which guarantees that the loop
        # makes progress.
        # Call cache() to persist this stage in cache
        current = graph_with_component.filter(
            lambda x, r=root: x[0] != r and x[1][0] == inf).cache()
        # Debug
        print('current subgraph count is {}'.format(current.count()))

    return (N, top)


In [10]:
# Time the full component search; comp receives the (N, top) tuple.
%time comp = connected_components(graph)

current root is QUESADA, JOE
current component count is 6408
current subgraph count is 18
current root is LUDLUM, ROSS
current component count is 9
current subgraph count is 9
current root is STERLING
current component count is 7
current subgraph count is 2
current root is MASTER OF VENGEANCE
current component count is 2
current subgraph count is 0
CPU times: user 582 ms, sys: 94.2 ms, total: 676 ms
Wall time: 18.9 s


In [11]:
# (number of connected components, size of the largest one)
comp

(4, 6408)

In [36]:
# Scratch copy of the update step from inside ss_bfs3.
# NOTE(review): `visiting` is only assigned inside functions in the visible
# cells, so this cell looks like leftover debugging that fails on a fresh
# kernel -- confirm and consider removing it.
bfs_graph = bfs_graph.union(visiting).groupByKey()\
            .map(lambda x: (x[0], (min([v[0] for v in list(x[1])]),
                 list(x[1])[0][1])))

In [39]:
# Scratch: BFS level read by the filter in the next cell.
i = 1

In [40]:
# Scratch: records of the notebook-level bfs_graph whose distance equals i.
current1 = bfs_graph.filter(lambda x: x[1][0] == i)

In [33]:
# NOTE(review): `current` is only assigned inside functions in the visible
# cells; this looks like leftover debugging that depends on hidden kernel
# state -- confirm and consider removing it.
current.count()

1

In [21]:
# Scratch check of the builtin min on two plain arguments.
min(3,4)

3

In [36]:
# Scratch check: concatenating an empty list leaves the list unchanged.
[3,4] + []

[3, 4]