In [1]:
import itertools

In [2]:
import findspark
import os
findspark.init()
import pyspark
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second context in the same kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate()
sc.setLogLevel("ERROR")

### Load the graph

In [3]:
# RDD of raw text lines from the source CSV; each line is parsed by split_csv below.
clist = sc.textFile('source.csv')

In [4]:
# print clist
# print clist.getNumPartitions()
# print clist.count()
# print clist.take(5)

In [5]:
def split_csv(val):
    """Parse one line of the source CSV into a ``(comic, name)`` tuple.

    The text after the last comma is the comic; everything before it is the
    character name, so names that themselves contain commas stay intact.
    Double quotes are removed and surrounding whitespace is stripped from
    both fields so that the same comic always yields the same key.

    A line without any comma returns the whole line as the comic and an
    empty name.
    """
    # rpartition splits on the LAST comma only.
    name_part, _, comic_part = val.rpartition(',')
    name = name_part.replace('"', '').strip()
    comic = comic_part.replace('"', '').strip()
    return (comic, name)

In [6]:
# Parse every line into a (comic, name) pair and drop duplicate pairs.
clist_split = clist.map(split_csv).distinct()

In [7]:
# print clist_split
# print clist_split.getNumPartitions()
# print clist_split.count()
# print clist_split.take(5)

In [8]:
# Collect, per comic, the list of character names that appear in it: (comic, [name, ...]).
comic_allchars = clist_split.groupByKey().mapValues(list)

In [9]:
# print comic_allchars
# print comic_allchars.getNumPartitions()
# print comic_allchars.count()
# print comic_allchars.take(1)

In [10]:
def perm_chars(val):
    """Return every ordered pair of distinct positions drawn from ``val[1]``.

    ``val`` is indexed at position 1 to get the collection of items; the
    result is a list of 2-tuples in the order produced by
    ``itertools.permutations``.
    """
    members = val[1]
    ordered_pairs = itertools.permutations(members, 2)
    return [pair for pair in ordered_pairs]

In [11]:
# Adjacency list of the graph: expand each comic into ordered character pairs,
# drop repeated pairs, then gather the neighbours of each character into a list.
char_allchars = (
    comic_allchars
    .flatMap(perm_chars)
    .distinct()
    .groupByKey()
    .mapValues(list)
)

In [12]:
# print char_allchars
# print char_allchars.getNumPartitions()
# print char_allchars.count()
# print char_allchars.take(1)

### Search the graph

**Original function:**

In [23]:
def bfss(charGraph, startNode, diameter):
    """Breadth-first search over `charGraph`, at most `diameter` rounds deep.

    charGraph -- pair RDD keyed by node; each value is iterated to obtain that
                 node's neighbours (e.g. `char_allchars`).
    startNode -- key of the node the search starts from.
    diameter  -- maximum number of expansion rounds.

    Returns the number of entries in the final (node, distance) frontier,
    i.e. the start node plus every node reached within the limit.
    """
    
    dist = 0
    # Nodes to expand in the next round; initially only the start node at distance 0.
    searchNodes = sc.parallelize([startNode]).map(lambda x: (x, 0))
    # Every node seen so far, paired with its distance from the start node.
    frontier = sc.parallelize([startNode]).map(lambda x: (x, 0))
    
    # Stop at the depth limit or as soon as a round discovers no new node.
    while dist < diameter and searchNodes.count()>0:
        dist += 1
        # Neighbours of the current search nodes, tagged with the new distance.
        # NOTE(review): the lambdas capture `dist` by reference and the RDDs are
        # lazy; do not reorder these statements without re-checking the result.
        children = charGraph.join(searchNodes).flatMap(lambda x: x[1][0]).map(lambda x: (x, dist))
        # Merge with what is already known, keeping the smallest distance per node.
        frontier = frontier.union(children).reduceByKey(min).cache()
        # Only nodes whose smallest distance equals `dist` were first reached this round.
        searchNodes = frontier.filter(lambda x: x[1]==dist).cache()
        
    return frontier.count()

In [14]:
# Time the baseline search (at most 10 rounds) from three different start characters.
char = 'CAPTAIN AMERICA'
%time print char, bfss(char_allchars, char, 10)

char = 'MISS THING/MARY'
%time print char, bfss(char_allchars, char, 10)

char = 'ORWELL'
%time print char, bfss(char_allchars, char, 10)

CAPTAIN AMERICA 6403
CPU times: user 233 ms, sys: 51.7 ms, total: 285 ms
Wall time: 20.9 s
MISS THING/MARY 7
CPU times: user 104 ms, sys: 19.9 ms, total: 124 ms
Wall time: 3.21 s
ORWELL 9
CPU times: user 99 ms, sys: 20.5 ms, total: 120 ms
Wall time: 2.97 s


**Optimized function:**

In [15]:
def nodes_update(x, explored):
    """Add one to the `explored` counter and hand `x` back unchanged.

    `explored` only needs an ``add`` method (e.g. a Spark accumulator).
    Note: Spark guarantees exactly-once accumulator updates only inside
    actions; inside a transformation the increment may be applied again
    if the task is re-executed.
    """
    increment = 1
    explored.add(increment)
    return x

In [16]:
def bfssOpt(charGraph, startNode, numPartitions=20):
    """Breadth-first search over `charGraph` until no new node is found.

    charGraph     -- pair RDD keyed by node; each value is iterated to obtain
                     that node's neighbours (e.g. `char_allchars`).
    startNode     -- key of the node the search starts from.
    numPartitions -- number of partitions used for the intermediate RDDs
                     (defaults to 20, the value previously hard-coded).

    Returns the number of nodes reached, including the start node.

    The count is accumulated on the driver from `count()` actions. Counting
    with an accumulator inside a `map` transformation is not reliable: Spark
    only guarantees exactly-once accumulator updates inside actions, so a
    re-executed task would inflate the result.
    """

    dist = 0
    # Nodes to expand in the next round; initially only the start node at distance 0.
    searchNodes = sc.parallelize([startNode]).map(lambda x: (x, 0))
    # Every node seen so far, paired with its distance from the start node.
    frontier = sc.parallelize([startNode]).map(lambda x: (x, 0)).partitionBy(numPartitions)
    explored = 1    # the start node itself
    newlyFound = 1  # size of the current search set

    while newlyFound > 0:
        dist += 1
        # `d=dist` binds the current distance by value, so the result does not
        # depend on when Spark serializes the closure.
        children = (charGraph.join(searchNodes)
                    .flatMap(lambda x: x[1][0])
                    .map(lambda x, d=dist: (x, d))
                    .partitionBy(numPartitions))
        # Merge with what is already known, keeping the smallest distance per node.
        frontier = (frontier.union(children)
                    .reduceByKey(min)
                    .partitionBy(numPartitions)
                    .cache())
        # Only nodes whose smallest distance equals `dist` were first reached this round.
        searchNodes = (frontier.filter(lambda x, d=dist: x[1] == d)
                       .partitionBy(numPartitions)
                       .cache())
        # count() is an action: it materializes the cached RDDs and gives an exact size.
        newlyFound = searchNodes.count()
        explored += newlyFound

    return explored

In [18]:
# Time the optimized search from the same three start characters for comparison.
char = 'CAPTAIN AMERICA'
%time print char, bfssOpt(char_allchars, char)

char = 'MISS THING/MARY'
%time print char, bfssOpt(char_allchars, char)

char = 'ORWELL'
%time print char, bfssOpt(char_allchars, char)

CAPTAIN AMERICA 6403
CPU times: user 190 ms, sys: 42.3 ms, total: 233 ms
Wall time: 7.41 s
MISS THING/MARY 7
CPU times: user 109 ms, sys: 23.7 ms, total: 133 ms
Wall time: 2.97 s
ORWELL 9
CPU times: user 116 ms, sys: 25.3 ms, total: 141 ms
Wall time: 2.9 s
