In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import seaborn as sns

In [5]:
sns.set_context('poster', font_scale=1.25)

In [6]:
import findspark as fs

In [7]:
fs.init()

In [8]:
import pyspark as ps

In [9]:
import multiprocessing as mp

In [10]:
mp.cpu_count()

4

In [11]:
# Build the Spark configuration in one chained expression: a local master
# sized at twice the CPU count reported by multiprocessing, plus the app name.
config = (
    ps.SparkConf()
    .setMaster('local[' + str(2 * mp.cpu_count()) + ']')
    .setAppName('marvel_solver')
)

In [12]:
# Create the SparkContext from the configuration object built in the previous cell.
sc = ps.SparkContext(conf=config)

# Testing caching

In [13]:
muffin = sc.parallelize(range(100))

In [14]:
muffin.cache()

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:391

Sadly, this does not do what we want. We'll have to apply a function on every iteration and collect the result; otherwise it's not going to work.

In [15]:
muffin.cache()

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:391

In [16]:
wow = muffin.collect()

In [17]:
wow

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]

# Graph work

In [18]:
# Read the CSV as an RDD of raw text lines (one record per line).
marvel_raw_data = sc.textFile('marvel_data.csv')

In [19]:
marvel_raw_data.take(10)

[u'"FROST, CARMILLA","AA2 35"',
 u'"KILLRAVEN/JONATHAN R","AA2 35"',
 u'"M\'SHULLA","AA2 35"',
 u'"24-HOUR MAN/EMMANUEL","AA2 35"',
 u'"OLD SKULL","AA2 35"',
 u'"G\'RATH","AA2 35"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 35"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 36"',
 u'"3-D MAN/CHARLES CHAN","M/PRM 37"',
 u'"HUMAN ROBOT","WI? 9"']

In [20]:
def get_issue_character(raw_line):
    """Parse one raw data line into an ``(issue, character_name)`` pair.

    The line is split on the double-quote character; for a line shaped like
    ``"NAME","ISSUE"`` that puts the character name at index 1 and the issue
    at index 3.

    Raises:
        IndexError: if the line has fewer than four quote-separated pieces.
    """
    pieces = raw_line.split('"')
    return (pieces[3], pieces[1])

In [21]:
# Parse every raw line into an (issue, character_name) pair.
issue_character_rdd = marvel_raw_data.map(get_issue_character)

Links are symmetric, so we can just create key/value pairs from this. We want each character as a key, with everyone they link to as the values. If one character links to another, the other character *must* link back to them!

In [22]:
# Group the pairs by issue: one (issue, iterable of character names) record per issue.
issue_groups_rdd = issue_character_rdd.groupByKey()

In [23]:
# NOTE(review): scratch value. This rebinds `muffin`, which previously held the
# cached test RDD, and no visible cell reads it afterwards; candidate for removal.
muffin = [(2,3),(4,5)]

In [24]:
def get_links_from_groups(x):
    """Expand one issue group into directed character-to-character links.

    Args:
        x: an ``(issue, characters)`` pair, where ``characters`` is any
            iterable of character names appearing in that issue.

    Returns:
        A tuple of ``(character, other_character)`` pairs covering every
        ordered pair of distinct names in the group. Because both orderings
        of each pair are visited, the result is symmetric: if ``(a, b)`` is
        present, so is ``(b, a)``.

    The nested iteration already reaches both ``(a, b)`` and ``(b, a)``, so
    each link is emitted exactly once here. Emitting the reverse link again on
    every visit would only duplicate every entry and double the data fed to
    the subsequent ``groupByKey``. If a name is repeated within the input
    group, its links are still repeated in the output.
    """
    # Materialise once: the input may be a one-shot iterable and is walked twice.
    list_of_characters = list(x[1])
    return tuple(
        (cur_character, other_character)
        for cur_character in list_of_characters
        for other_character in list_of_characters
        if cur_character != other_character  # no self-links
    )

In [25]:
# Flatten every issue group into individual (character, other_character) link pairs.
all_links_rdd = issue_groups_rdd.flatMap(get_links_from_groups)

We now group by key again, which gives us all the links for a given individual. *But* we have to make sure they are unique.

In [26]:
all_links_rdd.take(2)

[(u'PRINCESS ZANDA', u"BLACK PANTHER/T'CHAL"),
 (u"BLACK PANTHER/T'CHAL", u'PRINCESS ZANDA')]

In [27]:
# Gather all link targets under their source character. The same target can
# still appear several times here (e.g. characters sharing more than one issue).
character_and_links_rdd = all_links_rdd.groupByKey()

In [28]:
def cleanup_links(x):
    """Remove duplicate link targets from one ``(character, links)`` record.

    Args:
        x: a pair whose first item is the character and whose second item is
            an iterable of linked characters, possibly with repeats.

    Returns:
        ``(character, unique_links)`` where ``unique_links`` is a tuple holding
        each linked character once. The order comes from a ``set`` and is
        therefore not guaranteed.
    """
    character, linked_to = x[0], x[1]
    return (character, tuple(set(linked_to)))

In [29]:
character_and_links_rdd.take(1)

[(u'QUESADA, JOE', <pyspark.resultiterable.ResultIterable at 0x7f1f2c17ef50>)]

In [30]:
# Final adjacency structure: (character, tuple of unique linked characters).
network_rdd = character_and_links_rdd.map(cleanup_links)

Ok, awesome. We are all set.

In [31]:
# Materialise the whole network on the driver as a Python list for inspection.
network_result = network_rdd.collect()

In [32]:
network_result[0:5]

[(u'QUESADA, JOE',
  (u'ZURI',
   u'GLADIATOR/MELVIN POT',
   u'SMITH, KEVIN',
   u'OKOYE',
   u'PALMIOTTI, JIMMY',
   u'MARTINEZ, ALITHA',
   u'MYSTERIO/QUENTIN BEC',
   u'WATSON-PARKER, MARY ',
   u'SPIDER-MAN/PETER PAR',
   u'MCKENZIE, LYDIA',
   u'LAMY, KELLY',
   u'URICH, BEN',
   u"BLACK PANTHER/T'CHAL",
   u'MR. FANTASTIC/REED R',
   u'THING/BENJAMIN J. GR',
   u'QUINN, ASHLEY',
   u'CAPTAIN AMERICA',
   u'MALICE V/NAKIA',
   u'NELSON, CANDACE',
   u'INVISIBLE WOMAN/SUE ',
   u'BUTCH',
   u'EIGHTBALL',
   u'DAREDEVIL/MATT MURDO',
   u'ROSS, EVERETT KENNET',
   u'POTTER, BETSY BEATTY',
   u'BLACK WIDOW/NATASHA ',
   u'EVERETT, BILL',
   u'BLAKE, BECKY',
   u'DAKESIAN, NANCI',
   u'SHARPE, ROSALINDE',
   u'NELSON, FRANKLIN FOG',
   u'MILLER, FRANK',
   u'HUMAN TORCH/JOHNNY S',
   u'OSBORN, LIZ ALLAN',
   u'DR. STRANGE/STEPHEN ',
   u'DARLA',
   u'RALF',
   u'PAGE, KAREN',
   u'LEE, STAN')),
 (u'CRUSADER II/ARTHUR B',
  (u'HELA [ASGARDIAN]',
   u'X-MAN/NATHAN GREY ',
   u'HOGUN [AS

## One iteration

In [43]:
# Node that gets distance 0; the cells that follow measure distances from it.
start_node = 'CAPTAIN AMERICA'

In [44]:
# Keep only the adjacency record whose key is the start node.
network_to_touch = network_rdd.filter(lambda x: x[0] == start_node)
# Call form of print: with a single argument it prints the same text under
# Python 2 and is also valid Python 3, unlike the bare print statement.
print(network_to_touch.take(1))

[(u'CAPTAIN AMERICA', (u'QUESADA, JOE', u'SILVER SAMURAI/KENYU', u'CAPTAIN AMERICA IV/S', u'DREAMING CELESTIAL/T', u'VOGHT, AMELIA', u'KRO [DEVIANT]', u'PHASTOS [ETERNAL]', u'PUCK DOPPELGANGER', u'POPE', u'POWER PRINCESS/ZARDA', u'MODRED THE MYSTIC', u"JACK O'LANTERN II/MA", u'DR. FAUSTUS', u'GHOST GIRL/', u'HESCAMAR/MARNOT [ASG', u'LORD TEMPLAR/', u'RODGERS, MARIANNE', u'HIJACKER', u'HOCHBERG, ROGER', u'MARROW/SARAH ', u'EPOCH', u'MOTHER NIGHT/SUSAN S', u'PIECEMEAL/GILBERT BE', u'OMNIBUS/LT. DALLAS', u'SHAKTI', u'WHIZZER III/JAMES SA', u'KORATH THE PURSUER', u'WOLFE, HEDY', u'WOLF SPIRIT/OWAYODAT', u'PEREGRINE, LE/FRANCK', u'STRUCKER, ANDREA', u'MARTHA', u'MAGUS', u'DOLLAR BILL', u'MANTIS/? BRANDT', u'CARBON', u'KNIGHT ERRANT/MATSON', u'STUNT-MASTER/GEORGE ', u'NEKRA/ADRIENNE HATRO', u'WHITMAN, DEBRA', u'BRADDOCK, JAMES JR. ', u'DUNER, MAJOR', u'ACHEBE, REVEREND DOC', u'MALICE V/NAKIA', u'BERDITCHEV, ITZHAK', u'SANTINI, DR. JOSE', u'PIPER', u'CAT MAN III/', u'JAMESON, J. JONAH', u'CAP

In [45]:
# Seed the distance table: each record selected so far (the start node) gets distance 0.
distance_rdd = network_to_touch.map(lambda x: (x[0], 0))

In [46]:
# Flatten the link tuples of the selected records into one record per neighbour name.
nodes_to_touch = network_to_touch.flatMap(lambda x: x[1])

In [47]:
# Drop repeated neighbour names.
unique_nodes_to_touch = nodes_to_touch.distinct()

Now append these to the distance_rdd. 

In [48]:
# Tag every neighbour with distance 1 (hard-coded for this first iteration).
updated_touched_nodes = unique_nodes_to_touch.map(lambda x: (x, 1))

In [50]:
updated_touched_nodes.take(5)

[(u'QUESADA, JOE', 1),
 (u'PARKER, MAY | TIMESL', 1),
 (u'SILVER SAMURAI/KENYU', 1),
 (u'STORM, CHILI', 1),
 (u'JESTER III', 1)]

In [51]:
# Append the new (node, distance) pairs to the existing table. union() does not
# merge keys, so a node may appear more than once until the reduceByKey step.
updated_distance_rdd = distance_rdd.union(updated_touched_nodes)

In [52]:
updated_distance_rdd.take(5)

[(u'CAPTAIN AMERICA', 0),
 (u'QUESADA, JOE', 1),
 (u'PARKER, MAY | TIMESL', 1),
 (u'SILVER SAMURAI/KENYU', 1),
 (u'STORM, CHILI', 1)]

We now reduce by key!

In [54]:
def get_smaller_value(a, b):
    """Return ``a`` when it is strictly less than ``b``; otherwise return ``b``.

    Used as the combining function for ``reduceByKey`` so that each node keeps
    the smallest distance recorded for it.
    """
    return a if a < b else b

In [56]:
# Collapse duplicate keys, keeping the smallest distance seen for each node.
corrected_distance_rdd = updated_distance_rdd.reduceByKey(get_smaller_value)

In [64]:
# Bring the distances back to the driver as a {node: distance} dict.
# NOTE(review): `muffin` was bound to an RDD and then to a list of tuples in
# earlier cells; a descriptive name would make this result easier to follow.
muffin = corrected_distance_rdd.collectAsMap()

Now we have to apply this a bunch of times, and then we are all set. The piece I was missing was `reduceByKey`.

Let's make a class to handle this. We'll put that class in a separate file and import it; otherwise it's going to be very annoying.

Also, let's figure out how to join the distance rdd to what we need.

In [100]:
# Names of every node that already has a distance assigned...
already_touched_rdd = corrected_distance_rdd.map(lambda x: x[0])
# ...collected to the driver as a plain Python list.
already_touched = already_touched_rdd.collect()

In [92]:
# Convert to a set for fast membership tests.
already_touched_set = set(already_touched)

In [96]:
# Wrap the set in a Spark broadcast variable; tasks read it through `.value`.
broadcasted_touched = sc.broadcast(already_touched_set)

In [98]:
# Select the adjacency records of all already-touched nodes by testing each key
# against the broadcast set. This rebinds `network_to_touch` from the
# single-start-node selection made earlier.
network_to_touch = network_rdd.filter(lambda x: x[0] in broadcasted_touched.value)

This is a good way to do it and avoids a nasty join.

# Testing my class