# Problem 4

In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.ticker as ticker   

from P4_bfs import *

# setup spark
conf = SparkConf().setAppName('Graph Processing')
sc = SparkContext(conf=conf)
from P4_bfs import *

import pandas as pd

In [2]:
# Conversion from MarvelGraph to adj. list representation
comicRDD = sc.textFile('source.csv').map(lambda x: x.split('","')).map(lambda x: (x[0][1:], x[1][:len(x[1])-1]))

# create Dict (last part can be removed as it does only make indices 1-based)
dictRDD = comicRDD.map(lambda x: x[0]).distinct().sortBy(lambda x: x).zipWithIndex().map(lambda x: (x[0], x[1]+1))

# convert Character to integer for later join!
comicRDD = comicRDD.join(dictRDD).map(lambda x: (x[1][0], x[1][1])).cache()

# join on Comic Issue & remove all reflexive edges
graphRDD = comicRDD.join(comicRDD).map(lambda x: (x[1][0], x[1][1])).filter(lambda x: x[0] != x[1])

# now group s.t. we have for each vertex an adjacency list of nodes
graphRDD = graphRDD.groupByKey().map(lambda x: (x[0], list(x[1])))

In [98]:
# test code
#v0 = [characterDict['CAPTAIN AMERICA'], characterDict['MISS THING/MARY'], characterDict['ORWELL']]
v0 = [characterDict['MISS THING/MARY']]

for i in range(0, len(v0)):
    num_visited_nodes, rdd = sparkBFS(sc, graphRDD, v0[i])
    
    print('%s : %d nodes visited' % (vertexDict[str(v0[i])], num_visited_nodes))

MISS THING/MARY : 6407 nodes visited


In [8]:
# lookup code
def lookUpVertexID(dictRDD, character):
    return dictRDD.filter(lambda x: x[0] == character).collect()[0][1]

def lookUpCharacter(dictRDD, vertexID):
    return dictRDD.filter(lambda x: x[1] == vertexID).collect()[0][0]

lookUpCharacter(dictRDD, lookUpVertexID(dictRDD, 'MISS THING/MARY'))

u'MISS THING/MARY'

In [2]:
comicdf = pd.read_csv('source.csv', names=['Character', 'ComicIssue'])

In [3]:
keys = comicdf['Character'].unique()

In [4]:
values = range(1, len(keys)+1) # start indices with 1

In [5]:
characterDict = dict(zip(keys, values))

In [6]:
characterDict['FROST, CARMILLA']

1

In [7]:
# create graph by adjacency list (there is a connection between
# two characters if they appear in the same comic issue)

# therefore join the comicdf on comic issue!
mergeddf = pd.merge(comicdf, comicdf, how='inner', on='ComicIssue')

# now remove ComicIssue & all rows with Character_x == Character_y
filtereddf = mergeddf.drop('ComicIssue', 1)
filtereddf = filtereddf[filtereddf['Character_x'] != filtereddf['Character_y']]

In [8]:
# transform string to integers for performance reasons
edgedf = filtereddf.applymap(lambda x: characterDict[x])

In [9]:
import csv

# save the edge list and the dictionary as two separate csv's
edgedf.to_csv('edge_list.csv', header=False, index=False)

writer = csv.writer(open('characters.csv', 'wb'))
entries = sorted(characterDict.items(), key=lambda x: x[1]);
for key, value in entries:
    writer.writerow([value, key]) # flip it (so the vertex index is now the key)

In [10]:
# load vertex dictionary (is basically the character dictionary inverted)
reader = csv.reader(open('characters.csv', 'rb'))
vertexDict = dict(reader)

In [11]:
# this function prepares the rdd
def prepare_rdd(filename):
    rdd = sc.textFile(filename)

    # map string to tuples
    rdd = rdd.map(lambda x: x.split(','))
    rdd = rdd.map(lambda x: (int(x[0]), int(x[1])))
    
    # now group s.t. we have for each vertex an adjacency list of nodes
    rdd = rdd.groupByKey().map(lambda x: (x[0], list(x[1])))
    
    return rdd

In [12]:
# test code
filename = 'edge_list.csv'#'edge_list_simple.csv' # 'edge_list.csv'

rdd = prepare_rdd(filename)
#rdd.take(5)
v0 = [characterDict['CAPTAIN AMERICA'], characterDict['MISS THING/MARY'], characterDict['ORWELL']]



In [13]:
for i in range(0, len(v0)):
    rdd = prepare_rdd(filename)
    num_visited_nodes, rdd = sparkBFS(sc, rdd, v0[i])
    
    print('%s : %d nodes visited' % (vertexDict[str(v0[i])], num_visited_nodes))

CAPTAIN AMERICA : 6407 nodes visited
MISS THING/MARY : 6 nodes visited
ORWELL : 8 nodes visited


In [49]:
vertexDict[str(v0[0])]

'CAPTAIN AMERICA'

In [50]:
num_visited_nodes

8