In [2]:
# For Google Colaboratory
!pip install pyspark py4j findspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
        .appName("GraphFrames in Colab") \
        .config("spark.jars.packages", "graphframes:graphframes:0.8.0-spark3.0-s_2.12") \
        .getOrCreate()

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [4]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/MyDrive/Big_Data/Practicals' # Please adjust the path accordingly
    os.chdir(path_to_file)
    !pwd

Mounted at /content/gdrive
/content/gdrive/MyDrive/Big_Data/Practicals


# Practical 4: Graph Analytics



## Load Data


In [5]:
import os

# Locate the data folder relative to the current working directory (the Colab
# setup cell changes into the practicals folder) instead of repeating that
# folder's absolute path here. Adjusting the path in the setup cell -- or
# running outside Colab from a folder that contains bike-data/ -- is then
# enough for the load to succeed.
# The path is made absolute and given an explicit "file:" scheme, matching the
# form of the original literal paths, so it is read from the local filesystem.
bike_data_dir = os.path.abspath("bike-data")

# header=true: take the column names from the first line of each CSV file.
bikeStations = spark.read.option("header", "true").csv(
    "file:" + os.path.join(bike_data_dir, "201508_station_data.csv")
)
tripData = spark.read.option("header", "true").csv(
    "file:" + os.path.join(bike_data_dir, "201508_trip_data.csv")
)


In [6]:
# Vertices: one row per distinct station, with the station name as the "id".
stationVertices = (
    bikeStations
    .withColumnRenamed("name", "id")
    .distinct()
)

# Edges: one row per trip, with start/end station names as "src"/"dst".
tripEdges = (
    tripData
    .withColumnRenamed("Start Station", "src")
    .withColumnRenamed("End Station", "dst")
)



## Building a Graph


In [7]:
from graphframes import GraphFrame

# Stations are the vertices; every individual trip is an edge.
stationGraph = GraphFrame(v=stationVertices, e=tripEdges)

# Cache the graph for the queries below; as the cell's last expression,
# the returned GraphFrame is also displayed.
stationGraph.cache()




GraphFrame(v:[id: string, station_id: string ... 5 more fields], e:[src: string, dst: string ... 9 more fields])

In [8]:
# Sanity check: the graph should hold every trip from the original data.
print(f"Total Number of Stations: {stationGraph.vertices.count()}")
print(f"Total Number of Trips in Graph: {stationGraph.edges.count()}")
print(f"Total Number of Trips in Original Data: {tripData.count()}")


Total Number of Stations: 70
Total Number of Trips in Graph: 354152
Total Number of Trips in Original Data: 354152



## Querying the Graph


In [9]:
from pyspark.sql.functions import desc

# Count trips per (start, end) station pair and list the ten busiest routes.
(
    stationGraph.edges
    .groupBy("src", "dst")
    .count()
    .orderBy(desc("count"))
    .show(10)
)


+--------------------+--------------------+-----+
|                 src|                 dst|count|
+--------------------+--------------------+-----+
|San Francisco Cal...|     Townsend at 7th| 3748|
|Harry Bridges Pla...|Embarcadero at Sa...| 3145|
|     2nd at Townsend|Harry Bridges Pla...| 2973|
|     Townsend at 7th|San Francisco Cal...| 2734|
|Harry Bridges Pla...|     2nd at Townsend| 2640|
|Embarcadero at Fo...|San Francisco Cal...| 2439|
|   Steuart at Market|     2nd at Townsend| 2356|
|Embarcadero at Sa...|   Steuart at Market| 2330|
|     Townsend at 7th|San Francisco Cal...| 2192|
|Temporary Transba...|San Francisco Cal...| 2184|
+--------------------+--------------------+-----+
only showing top 10 rows



In [10]:
# Same route ranking, restricted to trips that start or end at "Townsend at 7th".
(
    stationGraph.edges
    .where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")
    .groupBy("src", "dst")
    .count()
    .orderBy(desc("count"))
    .show(10)
)


+--------------------+--------------------+-----+
|                 src|                 dst|count|
+--------------------+--------------------+-----+
|San Francisco Cal...|     Townsend at 7th| 3748|
|     Townsend at 7th|San Francisco Cal...| 2734|
|     Townsend at 7th|San Francisco Cal...| 2192|
|     Townsend at 7th|Civic Center BART...| 1844|
|Civic Center BART...|     Townsend at 7th| 1765|
|San Francisco Cal...|     Townsend at 7th| 1198|
|Temporary Transba...|     Townsend at 7th|  834|
|     Townsend at 7th|Harry Bridges Pla...|  827|
|   Steuart at Market|     Townsend at 7th|  746|
|     Townsend at 7th|Temporary Transba...|  740|
+--------------------+--------------------+-----+
only showing top 10 rows




## Subgraphs


In [11]:
# Keep only the trips that start or end at "Townsend at 7th" ...
townAnd7thEdges = stationGraph.edges.where(
    "src = 'Townsend at 7th' OR dst = 'Townsend at 7th'"
)
# ... and pair them with the full vertex set to form the subgraph.
subgraph = GraphFrame(v=stationGraph.vertices, e=townAnd7thEdges)



## Graph Algorithms


In [12]:
#PageRank
from pyspark.sql.functions import desc

# Run PageRank for a fixed number of iterations over the whole station graph.
ranks = stationGraph.pageRank(resetProbability=0.15, maxIter=10)

# Show the ten stations with the highest resulting score.
(
    ranks.vertices
    .orderBy(desc("pagerank"))
    .select("id", "pagerank")
    .show(10)
)




+--------------------+------------------+
|                  id|          pagerank|
+--------------------+------------------+
|San Jose Diridon ...| 4.051504835990035|
|San Francisco Cal...|3.3511832964285686|
|Mountain View Cal...|2.5143907710154836|
|Redwood City Calt...|2.3263087713711736|
|San Francisco Cal...| 2.231144291369887|
|Harry Bridges Pla...|1.8251120118883386|
|     2nd at Townsend|1.5821217785038677|
|Santa Clara at Al...| 1.573007408490723|
|     Townsend at 7th|1.5684565805341752|
|Embarcadero at Sa...| 1.541424208774836|
+--------------------+------------------+
only showing top 10 rows



In [13]:
#In-Degree Metrics
# Incoming edge count per station (trips whose "dst" is that station).
inDeg = stationGraph.inDegrees

# Five largest in-degrees, with station names printed in full.
(
    inDeg
    .orderBy(desc("inDegree"))
    .show(5, truncate=False)
)


+----------------------------------------+--------+
|id                                      |inDegree|
+----------------------------------------+--------+
|San Francisco Caltrain (Townsend at 4th)|34810   |
|San Francisco Caltrain 2 (330 Townsend) |22523   |
|Harry Bridges Plaza (Ferry Building)    |17810   |
|2nd at Townsend                         |15463   |
|Townsend at 7th                         |15422   |
+----------------------------------------+--------+
only showing top 5 rows



In [14]:
#Out-Degree Metrics
# Outgoing edge count per station (trips whose "src" is that station).
outDeg = stationGraph.outDegrees

# Five largest out-degrees, with station names printed in full.
(
    outDeg
    .orderBy(desc("outDegree"))
    .show(5, truncate=False)
)


+---------------------------------------------+---------+
|id                                           |outDegree|
+---------------------------------------------+---------+
|San Francisco Caltrain (Townsend at 4th)     |26304    |
|San Francisco Caltrain 2 (330 Townsend)      |21758    |
|Harry Bridges Plaza (Ferry Building)         |17255    |
|Temporary Transbay Terminal (Howard at Beale)|14436    |
|Embarcadero at Sansome                       |14158    |
+---------------------------------------------+---------+
only showing top 5 rows



In [15]:
# Arrivals divided by departures for every station that appears in both
# degree tables (the join on "id" keeps only matching stations).
degreeRatio = (
    inDeg
    .join(outDeg, "id")
    .selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")
)

# Highest ratios first: stations that receive more trips than they send.
degreeRatio.orderBy(desc("degreeRatio")).show(10, truncate=False)
# Lowest ratios first: stations that send more trips than they receive.
degreeRatio.orderBy("degreeRatio").show(10, truncate=False)


+----------------------------------------+------------------+
|id                                      |degreeRatio       |
+----------------------------------------+------------------+
|Redwood City Medical Center             |1.5333333333333334|
|San Mateo County Center                 |1.4724409448818898|
|SJSU 4th at San Carlos                  |1.3621052631578947|
|San Francisco Caltrain (Townsend at 4th)|1.3233728710462287|
|Washington at Kearny                    |1.3086466165413533|
|Paseo de San Antonio                    |1.2535046728971964|
|California Ave Caltrain Station         |1.24              |
|Franklin at Maple                       |1.2345679012345678|
|Embarcadero at Vallejo                  |1.2201707365495336|
|Market at Sansome                       |1.2173913043478262|
+----------------------------------------+------------------+
only showing top 10 rows

+-------------------------------+------------------+
|id                             |degreeRatio       |


In [16]:
#Breadth-First Search
# Look for paths of at most two edges from "Townsend at 7th" to
# "Spear at Folsom" and print the first ten results.
stationGraph.bfs(
    fromExpr="id = 'Townsend at 7th'",
    toExpr="id = 'Spear at Folsom'",
    maxPathLength=2,
).show(10)


+--------------------+--------------------+--------------------+
|                from|                  e0|                  to|
+--------------------+--------------------+--------------------+
|{65, Townsend at ...|{913371, 663, 8/3...|{49, Spear at Fol...|
|{65, Townsend at ...|{913265, 658, 8/3...|{49, Spear at Fol...|
|{65, Townsend at ...|{911919, 722, 8/3...|{49, Spear at Fol...|
|{65, Townsend at ...|{910777, 704, 8/2...|{49, Spear at Fol...|
|{65, Townsend at ...|{908994, 1115, 8/...|{49, Spear at Fol...|
|{65, Townsend at ...|{906912, 892, 8/2...|{49, Spear at Fol...|
|{65, Townsend at ...|{905201, 980, 8/2...|{49, Spear at Fol...|
|{65, Townsend at ...|{904010, 969, 8/2...|{49, Spear at Fol...|
|{65, Townsend at ...|{903375, 850, 8/2...|{49, Spear at Fol...|
|{65, Townsend at ...|{899944, 910, 8/2...|{49, Spear at Fol...|
+--------------------+--------------------+--------------------+
only showing top 10 rows

