In [56]:
# Notebook setup. findspark.init() is called before `import pyspark`
# (findspark's documented job is to make a local Spark install importable),
# so keep this statement order.
import findspark
findspark.init()
import pyspark
import json
# Single SparkContext for the notebook; `source` below is read through it.
# NOTE(review): appName is an empty string -- consider a descriptive name so
# the job is identifiable; confirm the launcher supplies spark.app.name.
sc = pyspark.SparkContext(appName="")
# from P4_bfs import bfs

In [182]:
## Import the character-comic pairs as an RDD of raw text lines.
## NOTE(review): 'source.csv' is resolved relative to the working directory;
## each line is presumably `"character","comic"` (see split_raw) -- confirm.
source = sc.textFile('source.csv')

## For each comic (key), group the comic's characters into a list, then remove the comic key.
def split_raw(x):
    """Turn one raw text line into a ``(second_field, first_field)`` pair.

    The outer double quotes are stripped and the line is split on the
    ``","`` separator. Per the surrounding notebook comments the first field
    is the character and the second the comic, so the result is keyed by comic.

    Returns ``None`` when the line does not split into exactly two fields.
    """
    fields = x.strip('"').split('","')
    if len(fields) != 2:
        # Malformed line: the caller receives None.
        return None
    first, second = fields
    return (second, first)
## split_raw returns None for lines that do not hold exactly two fields.
## Those records must be dropped before groupByKey(), which expects
## (key, value) pairs and cannot shuffle a bare None.
source1 = (
    source.map(split_raw)
    .filter(lambda pair: pair is not None)
    .groupByKey()
    .map(lambda x: list(x[1]))
)
## We are now left with lists of characters in each comic.

## In each list of characters, use each character as the key to create a list of character's neighbors.
## Then combine different neighbor lists for each character.
def comic_to_neighbor(vlist):
    """Expand one comic's character list into per-character neighbor rows.

    For every position ``i`` in ``vlist`` a pair
    ``(vlist[i], everything in vlist except position i)`` is produced, in the
    original order. An empty ``vlist`` yields an empty list.
    """
    return [
        (character, vlist[:position] + vlist[position + 1:])
        for position, character in enumerate(vlist)
    ]
## Build the adjacency RDD in four chained stages:
##   1. emit (character, co-stars) rows for every comic,
##   2. union the co-star lists of the same character (set() removes repeats),
##   3. attach 1000 as the starting distance of every character,
##   4. cache the result, since it is reused by every bfs() call below.
source2 = (
    source1
    .flatMap(comic_to_neighbor)
    .reduceByKey(lambda left, right: list(set(left + right)))
    .map(lambda entry: (entry[0], (1000, entry[1])))
    .cache()
)
## Now we have (k, v) for k=character and v=(1000, neighbors).

In [191]:
def bfs(sourceRDD, root, step_limit = False):
    """Breadth-first search over an adjacency RDD, starting from ``root``.

    Parameters
    ----------
    sourceRDD : RDD of ``(node, (dist, neighbors))`` rows. ``dist`` is the
        starting distance of each node and must be larger than any depth the
        search can reach (the notebook uses 1000), because a node's distance
        is only ever lowered via ``min``.
    root : key of the node the search starts from; its distance is set to 0.
    step_limit : ``False`` (default) or ``None`` runs until an expansion
        reaches no new node. An integer runs exactly that many expansions;
        ``0`` or a negative value performs none.

    Returns
    -------
    ``(steps, visit_count, nodeRDD)`` where ``steps`` is the number of
    expansions performed, ``visit_count`` is a Spark accumulator holding
    1 (for the root) plus the nodes newly reached at each step, and
    ``nodeRDD`` has the same row layout as ``sourceRDD`` with updated
    distances.

    Side effects: prints ``root`` and one progress line per expansion.
    """
    ## Set the root
    def set_root(row):
        node, (dist, neighbors) = row
        if node == root:
            return (node, (0, neighbors))
        return (node, (dist, neighbors))

    ## Update the node RDD: character, (distance, [list of neighbors])
    def update_node(inputRDD, step):
        ## Every node on the current frontier (dist == step) proposes
        ## distance step+1 to each neighbor, with an empty neighbor list so
        ## the merge below leaves the real adjacency list unchanged.
        def find_neighbor(row):
            node, (dist, neighbors) = row
            next_list = [(node, (dist, neighbors))]
            if dist == step:
                for neighbor in neighbors:
                    next_list.append((neighbor, (step + 1, [])))
            return next_list

        ## Keep the smallest distance seen for a node and concatenate lists.
        def merge_rows(left, right):
            return (min(left[0], right[0]), left[1] + right[1])

        return inputRDD.flatMap(find_neighbor).reduceByKey(merge_rows)

    ## Number of nodes whose distance is exactly `distance`.
    def count_at(rdd, distance):
        return rdd.filter(lambda row: row[1][0] == distance).count()

    ## Initialize root and count variables. The accumulator is taken from the
    ## RDD's own SparkContext, so bfs does not depend on a global `sc`.
    nodeRDD = sourceRDD.map(set_root)
    visit_count = sourceRDD.context.accumulator(1)
    print(root)

    ## `is False` rather than `!= False`: 0 == False in Python, so the old
    ## comparison silently turned step_limit=0 into an unlimited search.
    unlimited = step_limit is False or step_limit is None

    ## `step` is bound before the loop, so a zero or negative limit returns
    ## cleanly instead of raising NameError at the return statement.
    step = 0
    while unlimited or step < step_limit:
        nodeRDD = update_node(nodeRDD, step)
        new_count = count_at(nodeRDD, step + 1)
        visit_count += new_count
        step += 1
        print("Step:  %s ; New Nodes:  %s ; Total Nodes:  %s ." % (step, new_count, visit_count))
        ## Only the unlimited search stops on an empty frontier; a limited
        ## search always performs exactly `step_limit` expansions, as before.
        if unlimited and new_count == 0:
            break
    return (step, visit_count, nodeRDD)

In [192]:
## Run the search from three different roots, in this order; each result is
## the (steps, visit_count, nodeRDD) tuple returned by bfs.
result1, result2, result3 = (
    bfs(source2, start)
    for start in ('CAPTAIN AMERICA', 'MISS THING/MARY', 'ORWELL')
)

CAPTAIN AMERICA
Step:  1 ; New Nodes:  1906 ; Total Nodes:  1907 .
Step:  2 ; New Nodes:  4463 ; Total Nodes:  6370 .
Step:  3 ; New Nodes:  38 ; Total Nodes:  6408 .
Step:  4 ; New Nodes:  0 ; Total Nodes:  6408 .
MISS THING/MARY
Step:  1 ; New Nodes:  6 ; Total Nodes:  7 .
Step:  2 ; New Nodes:  0 ; Total Nodes:  7 .
ORWELL
Step:  1 ; New Nodes:  8 ; Total Nodes:  9 .
Step:  2 ; New Nodes:  0 ; Total Nodes:  9 .
