In [12]:
# basics
import networkx as nx
import matplotlib.pyplot as plt

# PySpark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import split, col, lit, explode

# graph networks
from graphframes import *

### Big Data Platforms

**Goal of this notebook:** Generate meaningful insights using graph networks.

### 1. Read necessary data into spark

In [13]:
# start spark session
spark = SparkSession.builder.appName('GraphFramesAPI').getOrCreate()

# increase memory

# cores, executors

# spark context
sc = SparkContext.getOrCreate()

In [14]:
# define location variables
FILE_PATH = "hdfs://nameservice1/user/vvenkatesan/final_project/"
FILE_NAME = "yelp_academic_dataset_user.json"

# load data from hdfs
user = spark.read.json(str(FILE_PATH) + str(FILE_NAME))

# limit data
#user = user.limit(10)

### 2. Create graph

In [15]:
# convert friends to array
user = user.withColumn("friends_array", split(col("friends"), ","))

# if there are > 1 friends, select first friend in array 
user = user.withColumn("friends_one", F.col("friends_array")[0])

In [16]:
# define vertices
vertices = user.select("user_id", "name", "yelping_since", "review_count", "useful", "average_stars")

# rename columns
vertices = vertices.withColumnRenamed("user_id", "id")

# display
vertices.show(5)

+--------------------+--------+-------------------+------------+------+-------------+
|                  id|    name|      yelping_since|review_count|useful|average_stars|
+--------------------+--------+-------------------+------------+------+-------------+
|ntlvfPzc8eglqvk92...|  Rafael|2007-07-06 03:27:11|         553|   628|         3.57|
|FOBRPlBHa3WPHFB5q...|Michelle|2008-04-28 01:29:25|         564|   790|         3.84|
|zZUnPeh2hEp0WydbA...|  Martin|2008-08-28 23:40:05|          60|   151|         3.44|
|QaELAmRcDc5TfJEyl...|    John|2008-09-20 00:08:14|         206|   233|         3.08|
|xvu8G900tezTzbbfq...|    Anne|2008-08-09 00:30:27|         485|  1265|         4.37|
+--------------------+--------+-------------------+------------+------+-------------+
only showing top 5 rows



In [17]:
# define edges
edges = user.select("user_id", explode(split(col("friends"), ",\s*")).alias("friends_split"))

# create friend column
edges = edges.withColumn("type", lit("friend"))

# rename columns
edges = edges.withColumnRenamed("user_id", "src").withColumnRenamed("friends_split", "dst")

# display
edges.show(5)

+--------------------+--------------------+------+
|                 src|                 dst|  type|
+--------------------+--------------------+------+
|ntlvfPzc8eglqvk92...|oeMvJh94PiGQnx_6G...|friend|
|ntlvfPzc8eglqvk92...|wm1z1PaJKvHgSDRKf...|friend|
|ntlvfPzc8eglqvk92...|IkRib6Xs91PPW7pon...|friend|
|ntlvfPzc8eglqvk92...|A8Aq8f0-XvLBcyMk2...|friend|
|ntlvfPzc8eglqvk92...|eEZM1kogR7eL4GOBZ...|friend|
+--------------------+--------------------+------+
only showing top 5 rows



In [18]:
# create a graph using the vertices and edges dataframes
g = GraphFrame(vertices, edges)

### 3. Explore graph

In [19]:
# check edges
g.edges.show(5)

+--------------------+--------------------+------+
|                 src|                 dst|  type|
+--------------------+--------------------+------+
|ntlvfPzc8eglqvk92...|oeMvJh94PiGQnx_6G...|friend|
|ntlvfPzc8eglqvk92...|wm1z1PaJKvHgSDRKf...|friend|
|ntlvfPzc8eglqvk92...|IkRib6Xs91PPW7pon...|friend|
|ntlvfPzc8eglqvk92...|A8Aq8f0-XvLBcyMk2...|friend|
|ntlvfPzc8eglqvk92...|eEZM1kogR7eL4GOBZ...|friend|
+--------------------+--------------------+------+
only showing top 5 rows



In [20]:
# check vertices
g.vertices.show(5)

+--------------------+--------+-------------------+------------+------+-------------+
|                  id|    name|      yelping_since|review_count|useful|average_stars|
+--------------------+--------+-------------------+------------+------+-------------+
|ntlvfPzc8eglqvk92...|  Rafael|2007-07-06 03:27:11|         553|   628|         3.57|
|FOBRPlBHa3WPHFB5q...|Michelle|2008-04-28 01:29:25|         564|   790|         3.84|
|zZUnPeh2hEp0WydbA...|  Martin|2008-08-28 23:40:05|          60|   151|         3.44|
|QaELAmRcDc5TfJEyl...|    John|2008-09-20 00:08:14|         206|   233|         3.08|
|xvu8G900tezTzbbfq...|    Anne|2008-08-09 00:30:27|         485|  1265|         4.37|
+--------------------+--------+-------------------+------------+------+-------------+
only showing top 5 rows



In [21]:
# filter vertices to show users with > 10 reviews
g.vertices.filter("review_count > 10").orderBy("review_count", ascending=False).show(5)

+--------------------+------+-------------------+------------+------+-------------+
|                  id|  name|      yelping_since|review_count|useful|average_stars|
+--------------------+------+-------------------+------------+------+-------------+
|8k3aO-mPeyhbR5HUu...|Victor|2007-12-08 14:56:45|       14455|101960|         3.28|
|RtGqdDBvvBCjcu5dU...| Shila|2010-10-17 06:35:06|       12772| 10508|         3.87|
|hWDybu_KvYLSdEFzG...| Bruce|2009-03-08 21:47:44|       12487|108027|         3.64|
|Hi10sGSZNxQH3NLyW...|   Fox|2009-05-26 11:33:58|       11112|145838|          3.8|
|P5bUL3Engv-2z6kKo...|   Kim|2006-05-31 21:27:42|        9875| 26750|          3.8|
+--------------------+------+-------------------+------------+------+-------------+
only showing top 5 rows



In [104]:
# run triangle count
g.triangleCount().orderBy("count", ascending=False).show()

+------+--------------------+---------+-------------------+------------+------+-------------+
| count|                  id|     name|      yelping_since|review_count|useful|average_stars|
+------+--------------------+---------+-------------------+------------+------+-------------+
|342190|AbMjnKOwg736fcIu8...|  Michael|2010-05-16 19:27:37|        2867|100512|         3.64|
|339879|ZIOCmdFaMIF56FR-n...|    Randy|2011-01-07 15:20:32|        2922| 39030|         3.77|
|334130|WDpvs6U_TCx4799ox...|Cassandra|2009-02-20 22:43:13|        1590| 14856|         3.93|
|325467|QdfFtp430K_-BP1GT...|   Joseph|2010-01-06 21:21:48|         744| 10977|         3.68|
|304510|yLW8OrR8Ns4X1oXJm...| Kimquyen|2007-07-18 18:51:20|         891| 11398|         3.72|
|290329|vRjVhl3ONG2GfWY4N...|      Dan|2007-07-16 18:09:50|         641|  8538|         3.33|
|260811|rLjFbA_E3GLrBIKN-...| Michelle|2010-05-19 00:07:35|         596|  7421|         3.73|
|242893|EXKDLuYoZUKUNNNYA...|     Gene|2013-05-27 19:21:44| 

In [45]:
# filter and sort vertices with degree >=2 (vertex measurement quantifying the number of connected edges)
g.inDegrees.filter("inDegree >= 2").sort("inDegree", ascending=False).show(10, truncate=False)

+----------------------+--------+
|id                    |inDegree|
+----------------------+--------+
|None                  |834851  |
|ZIOCmdFaMIF56FR-nWr_2A|5266    |
|Oi1qbcz2m2SnwUeztGYcnQ|5059    |
|8DEyKVyplnOcSKx39vatbg|4957    |
|yLW8OrR8Ns4X1oXJmkKYgg|4475    |
|hizGc5W1tBHPghM5YKCAtg|4460    |
|djxnI8Ux8ZYQJhiOQkrRhA|4438    |
|YttDgOC9AlM4HcAlDsbB2A|4346    |
|qVc8ODYU5SZjKXVBgXdI7w|4186    |
|iLjMdZi0Tm7DQxX1C1_2dg|4168    |
+----------------------+--------+
only showing top 10 rows



In [84]:
# find mutual friends
mutualFriends =  g.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(b); (b)-[]->(a)").dropDuplicates()

# display
mutualFriends.show(5, truncate = False)

+-------------------------------------------------------------------+----------------------------------------------------------------------+-------------------------------------------------------------------+
|a                                                                  |b                                                                     |c                                                                  |
+-------------------------------------------------------------------+----------------------------------------------------------------------+-------------------------------------------------------------------+
|[xgPZlC7scdHMr5Reo3DEmg, Aileen, 2011-12-10 22:34:25, 8, 12, 3.22] |[viprlZujrzuKrPdzECUgSw, Joe, 2012-01-16 06:20:01, 94, 122, 2.91]     |[xgPZlC7scdHMr5Reo3DEmg, Aileen, 2011-12-10 22:34:25, 8, 12, 3.22] |
|[pCWTaritjAU4efFc1ZEOcA, Douglas, 2013-02-16 23:00:38, 21, 8, 4.5] |[GFvqdfp1-KA8v-5RoS6kvw, Stephanie, 2010-10-23 23:20:44, 21, 24, 3.52]|[pCWTaritjAU4efFc1ZEOcA,

In [43]:
# look at the pagerank score for every vertex
pr = g.pageRank(resetProbability=0.15, tol=0.01)
pr.vertices.show()

# look at the weight of every edge
pr.edges.show()

Py4JJavaError: An error occurred while calling o269.run.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 33 in stage 52.0 failed 4 times, most recent failure: Lost task 33.3 in stage 52.0 (TID 2611, hd03.rcc.local, executor 184): java.lang.ClassNotFoundException: org.graphframes.GraphFrame$$anonfun$5
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
	at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:88)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1408)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1890)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:929)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:929)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2111)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2049)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:740)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2081)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2178)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1035)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1017)
	at org.apache.spark.graphx.impl.VertexRDDImpl.count(VertexRDDImpl.scala:90)
	at org.apache.spark.graphx.Pregel$.apply(Pregel.scala:140)
	at org.apache.spark.graphx.lib.PageRank$.runUntilConvergenceWithOptions(PageRank.scala:355)
	at org.graphframes.lib.PageRank$.runUntilConvergence(PageRank.scala:152)
	at org.graphframes.lib.PageRank.run(PageRank.scala:102)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.graphframes.GraphFrame$$anonfun$5
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at org.apache.spark.serializer.JavaDeserializationStream$$anon$1.resolveClass(JavaSerializer.scala:67)
	at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1868)
	at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1751)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2042)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2287)
	at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2211)
	at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2069)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1573)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:431)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:75)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:114)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:88)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$11.apply(Executor.scala:407)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1408)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:413)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
