In [2]:
# References:
# http://hadooptutorial.wikispaces.com/Iterative+MapReduce+and+Counters
# http://www.slideshare.net/jhammerb/lec5-pagerank
# The CSV and Spark manual pages
import csv
import time

# Sentinel distance for nodes the search has not reached yet.
INF = 999999999

# Spark-side settings: the number of partitions for the graph RDD and an
# accumulator used to count nodes that are still GRAY after each pass.
numPart = 2
updates = sc.accumulator(0)

# Driver-side tables filled in while reading the CSV and building the graph.
heroCount = 0        # number of distinct character names seen so far
charLookup = dict()  # character name -> integer id
revLookup = list()   # integer id -> character name
issues = dict()      # issue -> set of character names appearing in it
adjList = dict()     # character id -> set of ids sharing at least one issue

# Read CSV file
with open('source.csv','rb') as f:
    reader = csv.reader(f)
    for row in reader:
        name, issue = row[0].strip(), row[1].strip() 
        print name, issue
        if issue not in issues:
            issues[issue] = set()
        if name not in charLookup:
            charLookup[name] = heroCount
            revLookup.append(name)
            heroCount += 1
        issues[issue].add(name)

# Create adjacency list
for cs in issues.values():
    chars = [charLookup[c] for c in cs]
    for i in xrange(len(chars)):
        c1 = chars[i]
        if c1 not in adjList:
            adjList[c1] = set()
        for j in xrange(i+1, len(chars)):
            c2 = chars[j]
            if c2 not in adjList:
                adjList[c2] = set()
            adjList[c1].add(c2)
            adjList[c2].add(c1)

print heroCount
            
# Map step: a GRAY node emits a GRAY record for each neighbour plus a BLACK
# copy of itself; WHITE and BLACK nodes are passed through unchanged.
def processNode(kv):
    """Expand one (node, state) record for a single search pass.

    The record is taken as one ordinary parameter and unpacked in the body.
    Tuple parameters in the signature (``def f((k, v))``) are Python 2-only
    syntax, so this form parses on both Python 2 and Python 3 while still
    being called with a single tuple argument.

    Args:
        kv: a ``(k, (curDist, adj, col))`` tuple, where ``col`` is
            0 (WHITE), 1 (GRAY) or 2 (BLACK).

    Returns:
        A list of ``(key, (dist, adj, colour))`` records. For a GRAY node
        this is one ``(nb, (curDist + 1, [], 1))`` per neighbour followed
        by the node itself with colour 2. For any other colour it is a
        one-element list holding the input record.
    """
    k, v = kv
    curDist, adj, col = v

    if col != 1:
        # WHITE or BLACK: nothing to expand, emit the same thing.
        return [(k, v)]

    # Neighbour records carry an empty adjacency list; the reduce step
    # keeps the longer list when records for the same key are merged.
    ret = [(nb, (curDist + 1, [], 1)) for nb in adj]
    # The node itself has now been expanded, so mark it BLACK.
    ret.append((k, (curDist, adj, 2)))
    return ret

# Reduce step: merge two records that share the same node key.
def combineNode(a, b):
    """Merge two (dist, adj, colour) records into one.

    The result keeps the smaller distance, the longer adjacency list
    (b's list when both have the same length) and the higher colour.
    """
    if len(a[1]) > len(b[1]):
        adj = a[1]
    else:
        adj = b[1]

    return (min(a[0], b[0]), adj, max(a[2], b[2]))

# Format of the RDD:
# node_index | current_dist | adj_list | color
# color = {WHITE=0, GRAY=1, BLACK=2}
def nTouched(graphRDD, startN = 0):
    """Count the nodes reached by an iterative search starting at startN.

    The lambdas index into their single (key, value) argument instead of
    using Python 2-only tuple parameters (``lambda (k, v): ...``), so the
    function parses on both Python 2 and Python 3.

    Args:
        graphRDD: RDD of ``(node_index, (current_dist, adj_list, color))``
            records.
        startN: index of the node to start from.

    Returns:
        The number of records whose distance is no longer INF once no GRAY
        nodes remain.
    """
    # Seed the search: the start node gets distance 0 and becomes GRAY;
    # every other record is passed through untouched.
    graphRDD = graphRDD.map(
        lambda kv: kv if kv[0] != startN else (kv[0], (0, kv[1][1], 1)),
        preservesPartitioning=True)

    while True: # While gray nodes still exist
        # Reset the shared accumulator before counting this pass.
        updates.value = 0
        graphRDD = graphRDD.flatMap(processNode)\
                        .reduceByKey(combineNode)

        # Count the GRAY records produced by this pass.
        graphRDD.foreach(lambda kv: updates.add(int(kv[1][2] == 1)))
        if updates.value == 0:
            break

    touched = graphRDD.filter(lambda kv: kv[1][0] != INF)\
                        .count()

    return touched


rddList = []
    
for node, adj in adjList.iteritems():    
    dist = INF
    rddList.append( (node, (dist, list(adj), 0) ) )

graphRDD = sc.parallelize(rddList, numPart)\
                .cache()

sT = time.time()
sList = ['CAPTAIN AMERICA', 'MISS THING/MARY', 'ORWELL']
for char in sList:
    if char not in charLookup:
        print char,'not found!'
        continue
    print 'Source =', char
    print 'Touched', nTouched(graphRDD, charLookup[char]),'nodes'
print time.time() - sT, 'seconds'



FROST, CARMILLA AA2 35
KILLRAVEN/JONATHAN R AA2 35
M'SHULLA AA2 35
24-HOUR MAN/EMMANUEL AA2 35
OLD SKULL AA2 35
G'RATH AA2 35
3-D MAN/CHARLES CHAN M/PRM 35
3-D MAN/CHARLES CHAN M/PRM 36
3-D MAN/CHARLES CHAN M/PRM 37
HUMAN ROBOT WI? 9
MARVEL BOY III/ROBER WI? 9
GORILLA-MAN WI? 9
3-D MAN/CHARLES CHAN WI? 9
VENUS II WI? 9
HUMAN ROBOT AVF 4
GORILLA-MAN AVF 4
JONES, RICHARD MILHO AVF 4
3-D MAN/CHARLES CHAN AVF 4
WASP/JANET VAN DYNE AVF 4
LIBRA/GUSTAV BRANDT AVF 4
CAPTAIN AMERICA AVF 4
VENUS II AVF 4
HAWK AVF 4
ANT-MAN/DR. HENRY J. AVF 4
MARVEL BOY III/ROBER AVF 4
MARVEL BOY III/ROBER AVF 5
VENUS II AVF 5
JONES, RICHARD MILHO AVF 5
PHARAOH RAMA-TUT AVF 5
GORILLA-MAN AVF 5
HUMAN ROBOT AVF 5
CAPTAIN AMERICA AVF 5
HAWK AVF 5
3-D MAN/CHARLES CHAN AVF 5
ANT-MAN/DR. HENRY J. AVF 5
WASP/JANET VAN DYNE AVF 5
3-D MAN/CHARLES CHAN H2 251
BANNER, BETTY ROSS T H2 251
SLOAN, FRED H2 251
DOC SAMSON/DR. LEONA H2 251
ROSS, GEN. THADDEUS H2 251
JONES, RICHARD MILHO H2 251
WOODGOD H2 251
HULK/DR. ROBERT BRUC 