In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import seaborn as sns

In [5]:
sns.set_context('poster', font_scale=1.25)

In [6]:
import findspark as fs

In [7]:
fs.init()

In [8]:
import pyspark as ps

In [9]:
import multiprocessing as mp

In [10]:
mp.cpu_count()

12

In [11]:
# Local-mode Spark configuration: the master string asks for twice as many
# local worker threads as mp.cpu_count() reports, and the application is
# registered under the name 'marvel_solver'.
config = (
    ps.SparkConf()
    .setMaster('local[' + str(2 * mp.cpu_count()) + ']')
    .setAppName('marvel_solver')
)

In [12]:
sc = ps.SparkContext(conf=config)

# Graph work

In [13]:
marvel_raw_data = sc.textFile('marvel_data.csv')

In [14]:
marvel_raw_data.take(10)

[u'"FROST, CARMILLA","AA2 35"',
 u'"KILLRAVEN/JONATHAN R","AA2 35"',
 u'"M\'SHULLA","AA2 35"',
 u'"24-HOUR MAN/EMMANUEL","AA2 35"',
 u'"OLD SKULL","AA2 35"',
 u'"G\'RATH","AA2 35"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 35"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 36"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 37"',
 u'"HUMAN ROBOT","WI? 9"']

In [15]:
def get_issue_character(raw_line):
    """Turn one raw text line into an ``(issue, character_name)`` pair.

    The line is cut on double-quote characters; for input of the form
    ``"NAME","ISSUE"`` the pieces at index 1 and index 3 are the character
    name and the issue respectively.

    Raises IndexError when the line yields fewer than four pieces.
    """
    pieces = raw_line.split('"')
    name, issue_id = pieces[1], pieces[3]
    return (issue_id, name)

In [16]:
issue_character_rdd = marvel_raw_data.map(get_issue_character)

Links are symmetric: if one character links to another, the other character *must* link back to them! We can therefore just create key/value pairs from the issue groups, with a character as the key and everyone they link to as the values.

In [17]:
issue_groups_rdd = issue_character_rdd.groupByKey()

In [18]:
# Scratch list of two pairs; no other cell visible in this notebook reads it.
muffin = [(2,3),(4,5)]

In [19]:
def get_links_from_groups(x):
    """Build the symmetric character links implied by one issue group.

    Parameters
    ----------
    x : pair
        ``x[1]`` is an iterable of character names; ``x[0]`` is not used.

    Returns
    -------
    tuple of (str, str)
        For every unordered pair of distinct names in the group, both
        ``(a, b)`` and ``(b, a)``. A group with fewer than two distinct
        names yields an empty tuple.

    Each unordered pair is visited exactly once. Visiting every ordered
    pair *and* appending both directions would emit each link twice.
    """
    list_of_characters = list(x[1])
    links = []
    for position, cur_character in enumerate(list_of_characters):
        # Only look at characters later in the list so a pair is seen once.
        for other_character in list_of_characters[position + 1:]:
            # A name repeated within the group must not link to itself.
            if cur_character != other_character:
                # Ensure that links are symmetric
                links.append((cur_character, other_character))
                links.append((other_character, cur_character))
    return tuple(links)

In [20]:
all_links_rdd = issue_groups_rdd.flatMap(get_links_from_groups)

We now group by key again, which gives us all the links for a given individual. *But* we have to make sure they are unique.

In [21]:
all_links_rdd.take(2)

[(u'PRINCESS ZANDA', u"BLACK PANTHER/T'CHAL"),
 (u"BLACK PANTHER/T'CHAL", u'PRINCESS ZANDA')]

In [22]:
character_and_links_rdd = all_links_rdd.groupByKey()

In [23]:
def cleanup_links(x):
    """Collapse repeated links for one character.

    ``x`` is a ``(key, iterable_of_links)`` pair. The result is
    ``(key, links)`` where ``links`` is a tuple holding each distinct link
    once; the order of the tuple is whatever order the intermediate set
    iterates in.
    """
    distinct_links = set(x[1])
    return (x[0], tuple(distinct_links))

In [24]:
character_and_links_rdd.take(1)

[(u'QUESADA, JOE', <pyspark.resultiterable.ResultIterable at 0x7f36b02f5f90>)]

In [25]:
network_rdd = character_and_links_rdd.map(cleanup_links)

Ok, awesome. We are all set.

In [26]:
network_result = network_rdd.collect()

In [27]:
network_result[0:5]

[(u'QUESADA, JOE',
  (u'ZURI',
   u'GLADIATOR/MELVIN POT',
   u'SMITH, KEVIN',
   u'OKOYE',
   u'PALMIOTTI, JIMMY',
   u'MARTINEZ, ALITHA',
   u'MYSTERIO/QUENTIN BEC',
   u'WATSON-PARKER, MARY ',
   u'SPIDER-MAN/PETER PAR',
   u'MCKENZIE, LYDIA',
   u'LAMY, KELLY',
   u'URICH, BEN',
   u"BLACK PANTHER/T'CHAL",
   u'MR. FANTASTIC/REED R',
   u'THING/BENJAMIN J. GR',
   u'QUINN, ASHLEY',
   u'CAPTAIN AMERICA',
   u'MALICE V/NAKIA',
   u'NELSON, CANDACE',
   u'INVISIBLE WOMAN/SUE ',
   u'BUTCH',
   u'EIGHTBALL',
   u'DAREDEVIL/MATT MURDO',
   u'ROSS, EVERETT KENNET',
   u'POTTER, BETSY BEATTY',
   u'BLACK WIDOW/NATASHA ',
   u'EVERETT, BILL',
   u'BLAKE, BECKY',
   u'DAKESIAN, NANCI',
   u'SHARPE, ROSALINDE',
   u'NELSON, FRANKLIN FOG',
   u'MILLER, FRANK',
   u'HUMAN TORCH/JOHNNY S',
   u'OSBORN, LIZ ALLAN',
   u'DR. STRANGE/STEPHEN ',
   u'DARLA',
   u'RALF',
   u'PAGE, KAREN',
   u'LEE, STAN')),
 (u'CRUSADER II/ARTHUR B',
  (u'HELA [ASGARDIAN]',
   u'X-MAN/NATHAN GREY ',
   u'HOGUN [AS

# First attempt

We now propagate.

In [60]:
# NOTE(review): this name is spelled "propogate"; the loop cell further down
# defines and reads num_to_propagate, and no visible cell reads this name.
num_to_propogate = 10

In [61]:
cur_time_point = 0

In [62]:
# Character used as the origin of the propagation.
start_point = 'CAPTAIN AMERICA'
# Keep only the network entries whose key equals the start character.
# The lambda reads start_point from the notebook namespace.
piece_of_rdd = network_rdd.filter(lambda x: x[0] == start_point)
# Re-shape each (character, links) entry into
# (character, (cur_time_point, links)); cur_time_point is likewise read from
# the notebook namespace rather than passed in.
touched_nodes = piece_of_rdd.map(lambda x: (x[0], (cur_time_point, x[1])))

In [63]:
get_neighbors_to_update = touched_nodes.flatMap(lambda x: x[1][1])

In [64]:
get_neighbors_to_update.take(5)

[u'QUESADA, JOE',
 u'SILVER SAMURAI/KENYU',
 u'CAPTAIN AMERICA IV/S',
 u'DREAMING CELESTIAL/T',
 u'VOGHT, AMELIA']

In [65]:
unique_neighbors_to_update = get_neighbors_to_update.distinct()

In [66]:
unique_neighbors_to_update.take(10)

[u'QUESADA, JOE',
 u'PARKER, MAY | TIMESL',
 u'SILVER SAMURAI/KENYU',
 u'STORM, CHILI',
 u'JESTER III',
 u'DREAMING CELESTIAL/T',
 u'VOGHT, AMELIA',
 u'DOPPELGANGER',
 u'SCHEMER/RICHARD FISK',
 u'KRO [DEVIANT]']

In [67]:
keys_to_update = unique_neighbors_to_update.map(lambda x: (x, cur_time_point + 1))

We now update. 

In [68]:
final_touched = keys_to_update.join(network_rdd)

In [69]:
final_touched.take(5)

[(u'QUESADA, JOE',
  (1,
   (u'ZURI',
    u'GLADIATOR/MELVIN POT',
    u'SMITH, KEVIN',
    u'OKOYE',
    u'PALMIOTTI, JIMMY',
    u'MARTINEZ, ALITHA',
    u'MYSTERIO/QUENTIN BEC',
    u'WATSON-PARKER, MARY ',
    u'SPIDER-MAN/PETER PAR',
    u'MCKENZIE, LYDIA',
    u'LAMY, KELLY',
    u'URICH, BEN',
    u"BLACK PANTHER/T'CHAL",
    u'MR. FANTASTIC/REED R',
    u'THING/BENJAMIN J. GR',
    u'QUINN, ASHLEY',
    u'CAPTAIN AMERICA',
    u'MALICE V/NAKIA',
    u'NELSON, CANDACE',
    u'INVISIBLE WOMAN/SUE ',
    u'BUTCH',
    u'EIGHTBALL',
    u'DAREDEVIL/MATT MURDO',
    u'ROSS, EVERETT KENNET',
    u'POTTER, BETSY BEATTY',
    u'BLACK WIDOW/NATASHA ',
    u'EVERETT, BILL',
    u'BLAKE, BECKY',
    u'DAKESIAN, NANCI',
    u'SHARPE, ROSALINDE',
    u'NELSON, FRANKLIN FOG',
    u'MILLER, FRANK',
    u'HUMAN TORCH/JOHNNY S',
    u'OSBORN, LIZ ALLAN',
    u'DR. STRANGE/STEPHEN ',
    u'DARLA',
    u'RALF',
    u'PAGE, KAREN',
    u'LEE, STAN'))),
 (u'SILVER SAMURAI/KENYU',
  (1,
   (u'DARKST

In [71]:
updated_touched_nodes = final_touched.union(touched_nodes)

We now put this in a function. I'll write out the function here to avoid problems...but actually it can't be put in a function. I'm so confused. 

In [93]:
num_to_propagate = 10

# Breadth-first propagation outward from start_point, driven by a plain
# Python loop on the driver. Every element of last_touched_nodes and
# distance_rdd has the shape (character, (distance, links)), the same shape
# as touched_nodes built in the earlier cell.
start_of_rdd = network_rdd.filter(lambda x: x[0] == start_point)
# The starting distance is bound as a default argument so the value in use
# is the one current when this cell runs (RDDs are evaluated lazily).
last_touched_nodes = start_of_rdd.map(
    lambda x, d=cur_time_point: (x[0], (d, x[1])))
distance_rdd = last_touched_nodes

nonvisited_neighbors = None
for cur_time in range(num_to_propagate):
    # Distance given to characters first reached in this round.
    next_distance = cur_time_point + cur_time + 1

    # Everyone linked to the characters reached in the previous round.
    get_neighbors_to_update = last_touched_nodes.flatMap(lambda x: x[1][1])
    unique_neighbors_to_update = get_neighbors_to_update.distinct()

    # Here is the clincher...if you get the same person again, you
    # *cannot* update them again! Drop every character that already has a
    # distance so the first distance assigned is the one that is kept.
    nonvisited_neighbors = unique_neighbors_to_update.subtract(
        distance_rdd.keys())

    # Bind next_distance as a default argument: the map is only evaluated
    # later, and a closure over the loop variable could otherwise observe a
    # value from a later iteration.
    keys_to_update = nonvisited_neighbors.map(
        lambda x, d=next_distance: (x, d))

    # Attach each newly reached character's own links: (name, (dist, links)).
    # Both RDDs are reused on the next round, so mark them for caching
    # instead of recomputing the whole lineage each time.
    last_touched_nodes = keys_to_update.join(network_rdd).cache()
    distance_rdd = distance_rdd.union(last_touched_nodes).cache()

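I don't think this is going to work very well... will it? I don't know. Probably not. I'm very confused: I'm not sure how to nest commands in Spark. (RDD operations cannot be called from inside the function passed to another RDD operation, so the iteration has to be a loop on the driver that builds a new RDD each round, as attempted above.)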

In [None]:
def propagate_from_node(start_node, num_to_propagate=10):
    
    