In [3]:
import os
os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11 pyspark-shell")

In [6]:
try:
    sc and spark
except NameError as e:
#     import findspark
#     findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession.builder.appName("test").getOrCreate()
# except Exception as e:
#     print(e)

In [4]:
print(pyspark.version)
print(spark.version)

<module 'pyspark.version' from '/usr/local/spark/python/pyspark/version.py'>
2.4.3


In [4]:
import graphframes as gf

In [3]:
# do something to prove it works
spark.sql('SELECT "Test" as c1').show()

+----+
|  c1|
+----+
|Test|
+----+



In [5]:
# import data
bikeStations = spark.read.option("header","true")\
  .csv("data/201508_station_data.csv")
tripData = spark.read.option("header","true")\
  .csv("data/201508_trip_data.csv")

In [6]:
# convert to nodes and edges
stationVertices = bikeStations.withColumnRenamed("name", "id").distinct()
tripEdges = tripData\
  .withColumnRenamed("Start Station", "src")\
  .withColumnRenamed("End Station", "dst")

In [21]:
for l in [bikeStations.first(), stationVertices.first()]: print(l)

Row(station_id='2', name='San Jose Diridon Caltrain Station', lat='37.329732', long='-121.901782', dockcount='27', landmark='San Jose', installation='8/6/2013')
Row(station_id='51', id='Embarcadero at Folsom', lat='37.791464', long='-122.391034', dockcount='19', landmark='San Francisco', installation='8/20/2013')


In [23]:
for l in [tripData.first(), tripEdges.first()]: print(l)

Row(Trip ID='913460', Duration='765', Start Date='8/31/2015 23:26', Start Station='Harry Bridges Plaza (Ferry Building)', Start Terminal='50', End Date='8/31/2015 23:39', End Station='San Francisco Caltrain (Townsend at 4th)', End Terminal='70', Bike #='288', Subscriber Type='Subscriber', Zip Code='2139')
Row(Trip ID='913460', Duration='765', Start Date='8/31/2015 23:26', src='Harry Bridges Plaza (Ferry Building)', Start Terminal='50', End Date='8/31/2015 23:39', dst='San Francisco Caltrain (Townsend at 4th)', End Terminal='70', Bike #='288', Subscriber Type='Subscriber', Zip Code='2139')


In [7]:
stationGraph = gf.GraphFrame(stationVertices, tripEdges)
stationGraph.cache()

GraphFrame(v:[id: string, station_id: string ... 5 more fields], e:[src: string, dst: string ... 9 more fields])

In [8]:
from pyspark.sql.functions import desc
stationGraph.edges.groupBy("src", "dst").count().orderBy(desc("count")).show(10)

+--------------------+--------------------+-----+
|                 src|                 dst|count|
+--------------------+--------------------+-----+
|San Francisco Cal...|     Townsend at 7th| 3748|
|Harry Bridges Pla...|Embarcadero at Sa...| 3145|
|     2nd at Townsend|Harry Bridges Pla...| 2973|
|     Townsend at 7th|San Francisco Cal...| 2734|
|Harry Bridges Pla...|     2nd at Townsend| 2640|
|Embarcadero at Fo...|San Francisco Cal...| 2439|
|   Steuart at Market|     2nd at Townsend| 2356|
|Embarcadero at Sa...|   Steuart at Market| 2330|
|     Townsend at 7th|San Francisco Cal...| 2192|
|Temporary Transba...|San Francisco Cal...| 2184|
+--------------------+--------------------+-----+
only showing top 10 rows



In [9]:
stationGraph.edges\
  .where("src = 'Townsend at 7th' OR dst = 'Townsend at 7th'")\
  .groupBy("src", "dst").count()\
  .orderBy(desc("count"))\
  .show(10)

+--------------------+--------------------+-----+
|                 src|                 dst|count|
+--------------------+--------------------+-----+
|San Francisco Cal...|     Townsend at 7th| 3748|
|     Townsend at 7th|San Francisco Cal...| 2734|
|     Townsend at 7th|San Francisco Cal...| 2192|
|     Townsend at 7th|Civic Center BART...| 1844|
|Civic Center BART...|     Townsend at 7th| 1765|
|San Francisco Cal...|     Townsend at 7th| 1198|
|Temporary Transba...|     Townsend at 7th|  834|
|     Townsend at 7th|Harry Bridges Pla...|  827|
|   Steuart at Market|     Townsend at 7th|  746|
|     Townsend at 7th|Temporary Transba...|  740|
+--------------------+--------------------+-----+
only showing top 10 rows



In [25]:
# motif finding
motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)")
from pyspark.sql.functions import expr
motifs.selectExpr("*",
    "to_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm') as abStart",
    "to_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm') as bcStart",
    "to_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm') as caStart")\
  .where("ca.`Bike #` = bc.`Bike #`").where("ab.`Bike #` = bc.`Bike #`")\
  .where("a.id != b.id").where("b.id != c.id")\
  .where("abStart < bcStart").where("bcStart < caStart")\
  .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))\
  .selectExpr("a.id", "b.id", "c.id", "ab.`Start Date`", "ca.`End Date`")\
  .limit(1).show(1, False)

+---------------------------------------+---------------+----------------------------------------+---------------+---------------+
|id                                     |id             |id                                      |Start Date     |End Date       |
+---------------------------------------+---------------+----------------------------------------+---------------+---------------+
|San Francisco Caltrain 2 (330 Townsend)|Townsend at 7th|San Francisco Caltrain (Townsend at 4th)|5/19/2015 16:09|5/19/2015 16:33|
+---------------------------------------+---------------+----------------------------------------+---------------+---------------+



In [26]:
# pagerank algo
from pyspark.sql.functions import desc
ranks = stationGraph.pageRank(resetProbability=0.15, maxIter=10)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(10)

+--------------------+------------------+
|                  id|          pagerank|
+--------------------+------------------+
|San Jose Diridon ...|  4.05150483599002|
|San Francisco Cal...| 3.351183296428697|
|Mountain View Cal...| 2.514390771015544|
|Redwood City Calt...|2.3263087713711714|
|San Francisco Cal...|2.2311442913698833|
|Harry Bridges Pla...|1.8251120118882476|
|     2nd at Townsend|1.5821217785038693|
|Santa Clara at Al...|1.5730074084907586|
|     Townsend at 7th| 1.568456580534055|
|Embarcadero at Sa...| 1.541424208774859|
+--------------------+------------------+
only showing top 10 rows



In [27]:
inDeg = stationGraph.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)

+----------------------------------------+--------+
|id                                      |inDegree|
+----------------------------------------+--------+
|San Francisco Caltrain (Townsend at 4th)|34810   |
|San Francisco Caltrain 2 (330 Townsend) |22523   |
|Harry Bridges Plaza (Ferry Building)    |17810   |
|2nd at Townsend                         |15463   |
|Townsend at 7th                         |15422   |
+----------------------------------------+--------+
only showing top 5 rows



In [28]:
outDeg = stationGraph.outDegrees
outDeg.orderBy(desc("outDegree")).show(5, False)

+---------------------------------------------+---------+
|id                                           |outDegree|
+---------------------------------------------+---------+
|San Francisco Caltrain (Townsend at 4th)     |26304    |
|San Francisco Caltrain 2 (330 Townsend)      |21758    |
|Harry Bridges Plaza (Ferry Building)         |17255    |
|Temporary Transbay Terminal (Howard at Beale)|14436    |
|Embarcadero at Sansome                       |14158    |
+---------------------------------------------+---------+
only showing top 5 rows



In [29]:
degreeRatio = inDeg.join(outDeg, "id")\
  .selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")
degreeRatio.orderBy(desc("degreeRatio")).show(10, False)
degreeRatio.orderBy("degreeRatio").show(10, False)

+----------------------------------------+------------------+
|id                                      |degreeRatio       |
+----------------------------------------+------------------+
|Redwood City Medical Center             |1.5333333333333334|
|San Mateo County Center                 |1.4724409448818898|
|SJSU 4th at San Carlos                  |1.3621052631578947|
|San Francisco Caltrain (Townsend at 4th)|1.3233728710462287|
|Washington at Kearny                    |1.3086466165413533|
|Paseo de San Antonio                    |1.2535046728971964|
|California Ave Caltrain Station         |1.24              |
|Franklin at Maple                       |1.2345679012345678|
|Embarcadero at Vallejo                  |1.2201707365495336|
|Market at Sansome                       |1.2173913043478262|
+----------------------------------------+------------------+
only showing top 10 rows

+-------------------------------+------------------+
|id                             |degreeRatio       |


In [30]:
stationGraph.bfs(fromExpr="id = 'Townsend at 7th'",
  toExpr="id = 'Spear at Folsom'", maxPathLength=2).show(10)

+--------------------+--------------------+--------------------+
|                from|                  e0|                  to|
+--------------------+--------------------+--------------------+
|[65, Townsend at ...|[913371, 663, 8/3...|[49, Spear at Fol...|
|[65, Townsend at ...|[913265, 658, 8/3...|[49, Spear at Fol...|
|[65, Townsend at ...|[911919, 722, 8/3...|[49, Spear at Fol...|
|[65, Townsend at ...|[910777, 704, 8/2...|[49, Spear at Fol...|
|[65, Townsend at ...|[908994, 1115, 8/...|[49, Spear at Fol...|
|[65, Townsend at ...|[906912, 892, 8/2...|[49, Spear at Fol...|
|[65, Townsend at ...|[905201, 980, 8/2...|[49, Spear at Fol...|
|[65, Townsend at ...|[904010, 969, 8/2...|[49, Spear at Fol...|
|[65, Townsend at ...|[903375, 850, 8/2...|[49, Spear at Fol...|
|[65, Townsend at ...|[899944, 910, 8/2...|[49, Spear at Fol...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [31]:
# connectedComponents
spark.sparkContext.setCheckpointDir("/tmp/checkpoints")

In [33]:
# in Python
minGraph = gf.GraphFrame(stationVertices, tripEdges.sample(False, 0.1))
cc = minGraph.connectedComponents()

In [34]:
cc.where("component != 0").show()

+----------+--------------------+---------+-----------+---------+-------------+------------+------------+
|station_id|                  id|      lat|       long|dockcount|     landmark|installation|   component|
+----------+--------------------+---------+-----------+---------+-------------+------------+------------+
|        47|     Post at Kearney|37.788975|-122.403452|       19|San Francisco|   8/19/2013|317827579904|
|        46|Washington at Kea...|37.795425|-122.404767|       15|San Francisco|   8/19/2013| 17179869184|
+----------+--------------------+---------+-----------+---------+-------------+------------+------------+



# sys data

In [4]:
sc.addPyFile('graphframes.zip')

In [7]:
import sys
sys.path

['/home/jovyan/work',
 '/tmp/spark-2efe940d-a9c3-4bd2-81df-1d42311cfb25/userFiles-e08b890c-cb9d-4949-ab85-457e8e71bb05/org.slf4j_slf4j-api-1.7.16.jar',
 '/tmp/spark-2efe940d-a9c3-4bd2-81df-1d42311cfb25/userFiles-e08b890c-cb9d-4949-ab85-457e8e71bb05/graphframes_graphframes-0.7.0-spark2.4-s_2.11.jar',
 '/tmp/spark-2efe940d-a9c3-4bd2-81df-1d42311cfb25/userFiles-e08b890c-cb9d-4949-ab85-457e8e71bb05',
 '/usr/local/spark/python',
 '/usr/local/spark/python/lib/py4j-0.10.7-src.zip',
 '/opt/conda/lib/python37.zip',
 '/opt/conda/lib/python3.7',
 '/opt/conda/lib/python3.7/lib-dynload',
 '',
 '/opt/conda/lib/python3.7/site-packages',
 '/opt/conda/lib/python3.7/site-packages/IPython/extensions',
 '/home/jovyan/.ipython']

In [4]:
import os
os.environ

environ{'LC_ALL': 'en_US.UTF-8',
        'LANG': 'en_US.UTF-8',
        'HOSTNAME': 'b96fa138fd0d',
        'NB_UID': '1000',
        'CONDA_DIR': '/opt/conda',
        'CONDA_VERSION': '4.7.5',
        'MESOS_NATIVE_LIBRARY': '/usr/local/lib/libmesos.so',
        'PWD': '/home/jovyan',
        'HOME': '/home/jovyan',
        'DEBIAN_FRONTEND': 'noninteractive',
        'SPARK_HOME': '/usr/local/spark',
        'NB_USER': 'jovyan',
        'HADOOP_VERSION': '2.7',
        'SHELL': '/bin/bash',
        'SPARK_OPTS': '--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info',
        'APACHE_SPARK_VERSION': '2.4.3',
        'SHLVL': '0',
        'LANGUAGE': 'en_US.UTF-8',
        'PYTHONPATH': '/usr/local/spark/python:/usr/local/spark/python/lib/py4j-0.10.7-src.zip',
        'XDG_CACHE_HOME': '/home/jovyan/.cache/',
        'NB_GID': '100',
        'PATH': '/opt/conda/bin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/