In [1]:
import findspark
findspark.init('/home/shenjeffrey/spark/')
import pyspark
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import seaborn as sns

In [3]:
# Initiate Spark. getOrCreate() returns the context that is already running
# when this cell is executed again; calling SparkContext() a second time in the
# same kernel fails with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

In [4]:
# Read in data: every element of the RDD is one raw text line of source.csv,
# i.e. a quoted "CHARACTER","COMIC BOOK" pair (see the sample printed below).
data = sc.textFile("source.csv")
data.take(10)

[u'"FROST, CARMILLA","AA2 35"',
 u'"KILLRAVEN/JONATHAN R","AA2 35"',
 u'"M\'SHULLA","AA2 35"',
 u'"24-HOUR MAN/EMMANUEL","AA2 35"',
 u'"OLD SKULL","AA2 35"',
 u'"G\'RATH","AA2 35"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 35"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 36"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 37"',
 u'"HUMAN ROBOT","WI? 9"']

In [5]:
def clean_csv(line):
    """Split one raw '"CHARACTER","COMIC BOOK"' line into its fields.

    The line is cut on the '","' separator, so commas inside a field are kept.
    Double quotes are then removed from the first two fields only; any further
    fields are returned untouched.

    Raises:
        IndexError: if the line does not contain the '","' separator.
    """
    fields = line.split('","')
    # Exactly the first two columns are unquoted; index 1 is expected to exist.
    for position in (0, 1):
        fields[position] = fields[position].replace('"', '')
    return fields

In [6]:
# Sanity check on one raw line: the quotes are stripped and the comma inside the name is kept.
clean_csv('"FROST, CARMILLA","AA2 35"')

['FROST, CARMILLA', 'AA2 35']

In [7]:
# Clean data: parse every raw line, then swap the two fields so that the
# comic book becomes the key of the pair.
# Element layout: (Comic Book, Comic Book Character)
clean_data = data.map(clean_csv).map(lambda x: (x[1], x[0]))
clean_data.take(10)

[(u'AA2 35', u'FROST, CARMILLA'),
 (u'AA2 35', u'KILLRAVEN/JONATHAN R'),
 (u'AA2 35', u"M'SHULLA"),
 (u'AA2 35', u'24-HOUR MAN/EMMANUEL'),
 (u'AA2 35', u'OLD SKULL'),
 (u'AA2 35', u"G'RATH"),
 (u'M/PRM 35', u'3-D MAN/CHARLES CHAN'),
 (u'M/PRM 36', u'3-D MAN/CHARLES CHAN'),
 (u'M/PRM 37', u'3-D MAN/CHARLES CHAN'),
 (u'WI? 9', u'HUMAN ROBOT')]

In [8]:
# Create symmetric edges for a bidirectional graph.
# Step 1 (this cell): inner self-join on the comic book key. For a book that
# features characters A and B this yields every ordered pair: A-A, A-B, B-A, B-B.
# Step 2 (dropping the A-A / B-B self-pairs so only A-B and B-A remain) is done
# in a later cell.
edges = clean_data.join(clean_data)


In [9]:
# Peek at the joined rows: (comic book, (character, character)); self-pairs are still present here.
edges.take(5)

[(u'W2 44', (u'WOLVERINE/LOGAN ', u'WOLVERINE/LOGAN ')),
 (u'M/CP 103/4', (u'DR. STRANGE/STEPHEN ', u'DR. STRANGE/STEPHEN ')),
 (u'M/CP 103/4', (u'DR. STRANGE/STEPHEN ', u'WOLFE, SARA')),
 (u'M/CP 103/4', (u'DR. STRANGE/STEPHEN ', u'RINTRAH')),
 (u'M/CP 103/4', (u'WOLFE, SARA', u'DR. STRANGE/STEPHEN '))]

In [10]:
# 2) Delete all self-pairs (drop A-A and B-B, keep A-B and B-A edges) and
#    discard the comic book key, so each element is (character, character).
# The pipeline is built from clean_data rather than by re-assigning edges from
# itself, so this cell can be executed repeatedly: filtering an already-reduced
# edges RDD a second time would silently corrupt it.
# Pairs are indexed instead of unpacked in the lambda signature because
# tuple parameters are not valid syntax in Python 3.
edges = (
    clean_data.join(clean_data)
    .values()
    .filter(lambda pair: pair[0] != pair[1])
)

In [16]:
# Number of directed edges. No de-duplication is applied, so a pair of
# characters is counted once per comic book they share, in each direction.
edges.count()

1144470

In [11]:
# Peek at the final edge list: (character, character) pairs, present in both directions.
edges.take(10)

[(u'DR. STRANGE/STEPHEN ', u'WOLFE, SARA'),
 (u'DR. STRANGE/STEPHEN ', u'RINTRAH'),
 (u'WOLFE, SARA', u'DR. STRANGE/STEPHEN '),
 (u'WOLFE, SARA', u'RINTRAH'),
 (u'RINTRAH', u'DR. STRANGE/STEPHEN '),
 (u'RINTRAH', u'WOLFE, SARA'),
 (u'SPIDER-MAN/PETER PAR', u'VENOM/EDDIE BROCK'),
 (u'SPIDER-MAN/PETER PAR', u'RAINTREE, VIVIAN'),
 (u'VENOM/EDDIE BROCK', u'SPIDER-MAN/PETER PAR'),
 (u'VENOM/EDDIE BROCK', u'RAINTREE, VIVIAN')]

In [12]:
# Create graph from edges: group on the first character of every pair, so each
# element becomes (character, [characters it shares a comic book with]).
# Edges are not de-duplicated beforehand, so a neighbour is listed once per
# shared comic book and can appear several times in the list.
graph = edges.groupByKey().mapValues(list)

In [17]:
# Number of keys in the adjacency list, i.e. characters that have at least one edge.
graph.count()

6426

In [16]:
# Sort the adjacency list by character name and keep the result in memory.
# sortByKey() is lazy like every other transformation, and an RDD that is not
# persisted may be recomputed by each action run on it. graph is reused by
# collect() and by every graph.lookup() issued during the traversal, so it is
# cached once here.
graph = graph.sortByKey().cache()

In [22]:
# Pull the whole adjacency list to the driver as a Python list of
# (character, neighbours) tuples; this action also forces the lazy pipeline to run.
# NOTE(review): results is not referenced again in the visible cells - confirm it is still needed.
results = graph.collect()

In [21]:
# Neighbours of a single character. lookup() returns a list holding one entry
# per matching key, hence the nested list in the output below.
graph.lookup('CAPTAIN AMERICA')

[[u'SPRITE',
  u'CYBELE [ETERNAL]',
  u'PHASTOS [ETERNAL]',
  u'KHORYPHOS [ETERNAL]',
  u'VOLSTAGG',
  u'LOKI [ASGARDIAN]',
  u'KARKAS [DEVIANT]',
  u'KINGO SUNEN [ETERNAL',
  u'HOGUN [ASGARDIAN]',
  u'SERSI/SYLVIA',
  u'REJECT/RAN-SAK [DEVI',
  u'SHE-HULK/JENNIFER WA',
  u'MAKKARI/MIKE KHARY/I',
  u'MUNIN',
  u'BLASTAAR',
  u'FANDRAL [ASGARDIAN]',
  u'SUB-MARINER/NAMOR MA',
  u'FORGOTTEN ONE/GILGAM',
  u'IKARIS/IKE HARRIS [E',
  u'VIZIER',
  u'HUGIN',
  u'THENA',
  u'ODIN [ASGARDIAN]',
  u'THOR/DR. DONALD BLAK',
  u'CARTER, SHARON',
  u'KLEIN, STANLEY',
  u'FURY, COL. NICHOLAS ',
  u'FALCON/SAM WILSON',
  u'KEMEL, MEHMET',
  u'DUGAN, TIMOTHY ALOYI',
  u'BRANSON',
  u'HATE-MONGER/ADOLF HI',
  u'BILLUPS, AGENT',
  u'KLEIN, SHIRLEY',
  u'JARVIS, EDWIN ',
  u'ALANYA',
  u'KLAW/ULYSSES KLAW',
  u'FERRARI, CONNIE',
  u'CARTER, SHARON',
  u'BUSIEK, KURT',
  u'CHAKARA, KANU',
  u'IRON MAN/TONY STARK ',
  u'RAMIREZ, JAVIER',
  u'ANT-MAN/DR. HENRY J.',
  u'WASP/JANET VAN DYNE ',
  u'DAVID, PETE

In [13]:
def shortest_path(graph, root_node, iteration):
    """Breadth-first search from root_node, limited to `iteration` hops.

    Args:
        graph: adjacency-list pair RDD of (node, [neighbour, ...]). Only
            ``graph.lookup(key)`` and ``graph.map(func)`` are called on it.
        root_node: key of the node the search starts from (distance 0).
        iteration: largest distance from root_node that is still recorded.

    Returns:
        A ``(num_nodes, result)`` tuple. ``num_nodes`` is the number of nodes
        reached, not counting root_node. ``result`` is ``graph.map(...)``
        producing ``(node, neighbours, distance)`` for every element of graph,
        with distance ``-1`` for nodes that were not reached.

    A node that is absent from graph (for example a mistyped root_node) is
    treated as having no neighbours instead of raising IndexError.
    """
    frontier = [root_node]
    distances = {}  # node -> number of hops from root_node
    for depth in range(iteration + 1):
        next_frontier = []
        for node in frontier:
            # The frontier may contain repeats and already-visited nodes;
            # the first visit fixes the distance.
            if node in distances:
                continue
            distances[node] = depth
            # NOTE: every lookup() is a separate Spark job run from the driver.
            found = graph.lookup(node)
            if found:
                # In-place extend avoids copying the whole list per node.
                next_frontier.extend(found[0])
        if not next_frontier:
            # Nothing left to expand: further iterations would be no-ops.
            break
        frontier = next_frontier

    def _with_distance(entry):
        # Indexing-free unpacking inside the body keeps this valid on
        # Python 2 and Python 3 (tuple parameters were removed in Python 3).
        node, neighbours = entry
        return (node, neighbours, distances.get(node, -1))

    result = graph.map(_with_distance)
    # Number of nodes touched, excluding the root node
    num_nodes = len(distances) - 1
    # Single pre-formatted string prints identically on Python 2 and 3.
    print("Num of nodes touched:  %s" % num_nodes)
    return num_nodes, result

In [14]:
# Search from "ORWELL" with a limit of 10; orwell holds the (num_nodes, result) tuple returned by shortest_path.
orwell = shortest_path(graph, "ORWELL", 10)

Num of nodes touched:  8


In [15]:
# Search from "MISS THING/MARY" with a limit of 10; miss_thing holds the (num_nodes, result) tuple.
miss_thing = shortest_path(graph, "MISS THING/MARY", 10)

Num of nodes touched:  6


In [31]:
# Search from "CAPTAIN AMERICA" with a limit of 10; captain_america holds the (num_nodes, result) tuple.
captain_america = shortest_path(graph, "CAPTAIN AMERICA", 10)

Num of nodes touched:  6407
