In [1]:
from graphframes import *

In [24]:
from pyspark import *
from pyspark.sql import *
spark = SparkSession.builder.appName('fun').getOrCreate()
vertices = spark.createDataFrame([('1', 'Carter', 'Derrick', 50), 
                                  ('2', 'May', 'Derrick', 26),
                                 ('3', 'Mills', 'Jeff', 80),
                                  ('4', 'Hood', 'Robert', 65),
                                  ('5', 'Banks', 'Mike', 93),
                                 ('98', 'Berg', 'Tim', 28),
                                 ('99', 'Page', 'Allan', 16)],
                                 ['id', 'name', 'firstname', 'age'])
edges = spark.createDataFrame([('1', '2', 'friend'), 
                               ('2', '1', 'friend'),
                              ('3', '1', 'friend'),
                              ('1', '3', 'friend'),
                               ('2', '3', 'follows'),
                               ('3', '4', 'friend'),
                               ('4', '3', 'friend'),
                               ('5', '3', 'friend'),
                               ('3', '5', 'friend'),
                               ('4', '5', 'follows'),
                              ('98', '99', 'friend'),
                              ('99', '98', 'friend')],
                              ['src', 'dst', 'type'])
g = GraphFrame(vertices, edges)
## Take a look at the DataFrames
g.vertices.show()
g.edges.show()
## Check the number of edges of each vertex
g.degrees.show()

+---+------+---------+---+
| id|  name|firstname|age|
+---+------+---------+---+
|  1|Carter|  Derrick| 50|
|  2|   May|  Derrick| 26|
|  3| Mills|     Jeff| 80|
|  4|  Hood|   Robert| 65|
|  5| Banks|     Mike| 93|
| 98|  Berg|      Tim| 28|
| 99|  Page|    Allan| 16|
+---+------+---------+---+

+----+---+-------+
| src|dst|   type|
+----+---+-------+
|   1|  2| friend|
|   2|  1| friend|
|   3|  1| friend|
|   1|  3| friend|
|   2|  3|follows|
|   3|  4| friend|
|   4|  3| friend|
|   5|  3| friend|
|   3|  5| friend|
|   4|  5|follows|
|  98| 99| friend|
|1199| 98| friend|
+----+---+-------+

+----+------+
|  id|degree|
+----+------+
|   3|     7|
|  98|     2|
|  99|     1|
|   5|     3|
|1199|     1|
|   1|     4|
|   4|     3|
|   2|     3|
+----+------+



In [3]:
copy = edges
from pyspark.sql.functions import udf
@udf("string")
def to_undir(src, dst):
    if src >= dst:
        return 'Delete'
    else : 
        return 'Keep'
copy.withColumn('undir', to_undir(copy.src, copy.dst))\
.filter('undir == "Keep"').drop('undir').show()
## for efficiency, it's better to avoid udf functions where possible ## and use built-in pyspark.sql.functions instead.

+---+---+-------+
|src|dst|   type|
+---+---+-------+
|  1|  2| friend|
|  1|  3| friend|
|  2|  3|follows|
|  3|  4| friend|
|  3|  5| friend|
|  4|  5|follows|
| 98| 99| friend|
+---+---+-------+



In [4]:
dir(edges)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_collectAsArrow',
 '_jcols',
 '_jdf',
 '_jmap',
 '_jseq',
 '_lazy_rdd',
 '_repr_html_',
 '_sc',
 '_schema',
 '_sort_cols',
 '_support_repr_html',
 'agg',
 'alias',
 'approxQuantile',
 'cache',
 'checkpoint',
 'coalesce',
 'colRegex',
 'collect',
 'columns',
 'corr',
 'count',
 'cov',
 'createGlobalTempView',
 'createOrReplaceGlobalTempView',
 'createOrReplaceTempView',
 'createTempView',
 'crossJoin',
 'crosstab',
 'cube',
 'describe',
 'distinct',
 'drop',
 'dropDuplicates',
 'drop_duplicates',
 'dropna',
 'dtypes',
 'exceptAll',
 'explain',
 'fillna',
 'filter',
 'first',
 'foreach',
 'f

In [5]:
g.vertices.filter("age > 30").show()
g.inDegrees.filter("inDegree >= 2").sort("inDegree", ascending=False).show()
g.edges.filter('type == "friend"')

+---+------+---------+---+
| id|  name|firstname|age|
+---+------+---------+---+
|  1|Carter|  Derrick| 50|
|  3| Mills|     Jeff| 80|
|  4|  Hood|   Robert| 65|
|  5| Banks|     Mike| 93|
+---+------+---------+---+

+---+--------+
| id|inDegree|
+---+--------+
|  3|       4|
|  5|       2|
|  1|       2|
+---+--------+



DataFrame[src: string, dst: string, type: string]

In [7]:
# dir(g)

sc.setCheckpointDir('graphframes_cps')

In [8]:
g.connectedComponents().show()


+---+------+---------+---+------------+
| id|  name|firstname|age|   component|
+---+------+---------+---+------------+
|  1|Carter|  Derrick| 50|154618822656|
|  2|   May|  Derrick| 26|154618822656|
|  3| Mills|     Jeff| 80|154618822656|
|  4|  Hood|   Robert| 65|154618822656|
|  5| Banks|     Mike| 93|154618822656|
| 98|  Berg|      Tim| 28|317827579904|
| 99|  Page|    Allan| 16|317827579904|
+---+------+---------+---+------------+



In [9]:
g.find("(a)-[e]->(b); (b)-[e2]->(a)").show()

+--------------------+----------------+--------------------+----------------+
|                   a|               e|                   b|              e2|
+--------------------+----------------+--------------------+----------------+
| [98, Berg, Tim, 28]|[98, 99, friend]|[99, Page, Allan,...|[99, 98, friend]|
|[2, May, Derrick,...|  [2, 1, friend]|[1, Carter, Derri...|  [1, 2, friend]|
|[99, Page, Allan,...|[99, 98, friend]| [98, Berg, Tim, 28]|[98, 99, friend]|
|[3, Mills, Jeff, 80]|  [3, 5, friend]|[5, Banks, Mike, 93]|  [5, 3, friend]|
|[1, Carter, Derri...|  [1, 3, friend]|[3, Mills, Jeff, 80]|  [3, 1, friend]|
|[3, Mills, Jeff, 80]|  [3, 1, friend]|[1, Carter, Derri...|  [1, 3, friend]|
|[5, Banks, Mike, 93]|  [5, 3, friend]|[3, Mills, Jeff, 80]|  [3, 5, friend]|
|[4, Hood, Robert,...|  [4, 3, friend]|[3, Mills, Jeff, 80]|  [3, 4, friend]|
|[1, Carter, Derri...|  [1, 2, friend]|[2, May, Derrick,...|  [2, 1, friend]|
|[3, Mills, Jeff, 80]|  [3, 4, friend]|[4, Hood, Robert,...|  [4

In [14]:
mutualFriends = g.find("(a)-[]->(b); (b)-[]->(c); (c)-[]->(b); (b)-[]->(a)").dropDuplicates()
mutualFriends.show()

+--------------------+--------------------+--------------------+
|                   a|                   b|                   c|
+--------------------+--------------------+--------------------+
|[4, Hood, Robert,...|[3, Mills, Jeff, 80]|[4, Hood, Robert,...|
|[3, Mills, Jeff, 80]|[1, Carter, Derri...|[2, May, Derrick,...|
|[5, Banks, Mike, 93]|[3, Mills, Jeff, 80]|[1, Carter, Derri...|
|[4, Hood, Robert,...|[3, Mills, Jeff, 80]|[1, Carter, Derri...|
|[3, Mills, Jeff, 80]|[1, Carter, Derri...|[3, Mills, Jeff, 80]|
|[5, Banks, Mike, 93]|[3, Mills, Jeff, 80]|[5, Banks, Mike, 93]|
|[5, Banks, Mike, 93]|[3, Mills, Jeff, 80]|[4, Hood, Robert,...|
|[1, Carter, Derri...|[2, May, Derrick,...|[1, Carter, Derri...|
|[1, Carter, Derri...|[3, Mills, Jeff, 80]|[4, Hood, Robert,...|
|[2, May, Derrick,...|[1, Carter, Derri...|[3, Mills, Jeff, 80]|
|[1, Carter, Derri...|[3, Mills, Jeff, 80]|[5, Banks, Mike, 93]|
|[1, Carter, Derri...|[3, Mills, Jeff, 80]|[1, Carter, Derri...|
|[99, Page, Allan,...| [9

In [15]:
mutualFriends.filter('a.id == 2 and c.id == 3').show()

+--------------------+--------------------+--------------------+
|                   a|                   b|                   c|
+--------------------+--------------------+--------------------+
|[2, May, Derrick,...|[1, Carter, Derri...|[3, Mills, Jeff, 80]|
+--------------------+--------------------+--------------------+



In [16]:
g.triangleCount().show()

+-----+---+------+---------+---+
|count| id|  name|firstname|age|
+-----+---+------+---------+---+
|    2|  3| Mills|     Jeff| 80|
|    0| 98|  Berg|      Tim| 28|
|    0| 99|  Page|    Allan| 16|
|    1|  5| Banks|     Mike| 93|
|    1|  1|Carter|  Derrick| 50|
|    1|  4|  Hood|   Robert| 65|
|    1|  2|   May|  Derrick| 26|
+-----+---+------+---------+---+



In [17]:
pr = g.pageRank(resetProbability=0.15, tol=0.01)
## look at the pagerank score for every vertex
pr.vertices.show()
## look at the weight of every edge
pr.edges.show()

+---+------+---------+---+------------------+
| id|  name|firstname|age|          pagerank|
+---+------+---------+---+------------------+
|  1|Carter|  Derrick| 50|0.9055074972891308|
|  3| Mills|     Jeff| 80| 1.853919642738813|
|  2|   May|  Derrick| 26|0.5377967999474921|
|  4|  Hood|   Robert| 65|0.6873519241384106|
| 98|  Berg|      Tim| 28|1.0225331112091938|
|  5| Banks|     Mike| 93|0.9703579134677663|
| 99|  Page|    Allan| 16|1.0225331112091938|
+---+------+---------+---+------------------+

+---+---+-------+------------------+
|src|dst|   type|            weight|
+---+---+-------+------------------+
|  1|  2| friend|               0.5|
| 99| 98| friend|               1.0|
|  1|  3| friend|               0.5|
|  4|  5|follows|               0.5|
|  5|  3| friend|               1.0|
| 98| 99| friend|               1.0|
|  3|  5| friend|0.3333333333333333|
|  4|  3| friend|               0.5|
|  2|  1| friend|               0.5|
|  3|  4| friend|0.3333333333333333|
|  3|  1| fr

In [18]:
#

In [21]:
from pyspark.sql import SQLContext
from pyspark import SparkContext

# # Create DataFrames from HIVE query
# v=sqlCtx.sql("select  id, name, total_seconds from my_schema.nodes")
# e=sqlCtx.sql("select src, dst, relationship from my_schema.edges")

# Create DataFrames manually for testing purposes
v = sqlContext.createDataFrame([
    ("A", "ARON"  ,350 ),
    ("B", "BILL"  ,360 ),
    ("C", "CLAIR" ,195 ),
    ("D", "DANIEL",90),
    ("E", "ERIC"  ,90),
    ("F", "FRANK" ,215 ),
    ("G", "GRAHAM",30 ),
    ("H", "HENRY" ,25 ),
    ("I", "INNA"  ,25 ),
    ("J", "JEN"   ,20 )
], ["id", "name", "total_seconds"])

e=sqlContext.createDataFrame([
    ("A", "B", 60),
    ("B", "A", 50),
    ("A", "C", 50),
    ("C", "A", 100),
    ("A", "D", 90),
    ("C", "I", 25),
    ("C", "J", 20),
    ("B", "F", 50),
    ("F", "B", 110),
    ("F", "G", 30),
    ("F", "H", 25),
    ("B", "E", 90)
],["src","dst","relationship"])