### Graph Frames

In [1]:
import os
import sys

# Point PySpark at the cluster's Spark 2 client and at the Python interpreter
# it should use.
os.environ.update({
    "PYSPARK_SUBMIT_ARGS": 'pyspark-shell',
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
})

# Defensive check; as written it cannot trigger because SPARK_HOME is assigned
# just above.
spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make Spark's bundled Python packages importable. The py4j archive ends up
# first on sys.path, immediately followed by Spark's own python directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# The application name is supplied both through the SparkConf and through the
# builder, so keep the literal in one place.
app_name = "natasha pritykovskaya Spark Dataframe app"

conf = SparkConf().set("spark.app.name", app_name)

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName(app_name)
    .getOrCreate()
)

In [None]:
from graphframes import *

In [36]:
# Toy social graph: one row per person, identified by the string "id" column.
vertex_columns = ["id", "name", "age", "gender", "university"]
vertex_rows = [
    ("1", "Alex", 28, "M", "MIPT"),
    ("2", "Emeli", 28, "F", "MIPT"),
    ("3", "Natasha", 27, "F", "SPbSU"),
    ("4", "Pavel", 30, "M", "MIPT"),
    ("5", "Oleg", 35, "M", "MIPT"),
    ("6", "Ivan", 30, "M", "MSU"),
    ("7", "Ilya", 29, "M", "MSU"),
]
vertices = spark.createDataFrame(vertex_rows, vertex_columns)

In [37]:
# Every friendship is stored in both directions. List each undirected pair
# once and emit the forward edge immediately followed by its mirror, which
# yields the rows in the order (a, b), (b, a) for each pair.
friend_pairs = [
    ("1", "2"), ("1", "3"), ("1", "4"),
    ("2", "3"), ("2", "5"),
    ("3", "4"), ("3", "5"), ("3", "6"), ("3", "7"),
]
edge_rows = [
    row
    for left, right in friend_pairs
    for row in ((left, right, "friend"), (right, left, "friend"))
]
edges = spark.createDataFrame(edge_rows, ["src", "dst", "relationship"])

<img src="pics/graph_graphframes.png" width=500/>

In [38]:
# Build the graph from the vertex and edge DataFrames created in the cells above.
g = GraphFrame(vertices, edges)

In [39]:
# Print the vertex DataFrame held by the graph.
g.vertices.show()

+---+-------+---+------+----------+
| id|   name|age|gender|university|
+---+-------+---+------+----------+
|  1|   Alex| 28|     M|      MIPT|
|  2|  Emeli| 28|     F|      MIPT|
|  3|Natasha| 27|     F|     SPbSU|
|  4|  Pavel| 30|     M|      MIPT|
|  5|   Oleg| 35|     M|      MIPT|
|  6|   Ivan| 30|     M|       MSU|
|  7|   Ilya| 29|     M|       MSU|
+---+-------+---+------+----------+



In [40]:
# Print the edge DataFrame held by the graph.
g.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  1|  2|      friend|
|  2|  1|      friend|
|  1|  3|      friend|
|  3|  1|      friend|
|  1|  4|      friend|
|  4|  1|      friend|
|  2|  3|      friend|
|  3|  2|      friend|
|  2|  5|      friend|
|  5|  2|      friend|
|  3|  4|      friend|
|  4|  3|      friend|
|  3|  5|      friend|
|  5|  3|      friend|
|  3|  6|      friend|
|  6|  3|      friend|
|  3|  7|      friend|
|  7|  3|      friend|
+---+---+------------+



In [41]:
# Keep only the vertices whose "age" column is greater than 30 and print them.
g.vertices.filter("age > 30").show()

+---+----+---+------+----------+
| id|name|age|gender|university|
+---+----+---+------+----------+
|  5|Oleg| 35|     M|      MIPT|
+---+----+---+------+----------+



In [42]:
# Print up to 10 rows of vertices whose in-degree is at least 2.
g.inDegrees.filter("inDegree >= 2").show(10)

+---+--------+
| id|inDegree|
+---+--------+
|  3|       6|
|  5|       2|
|  1|       3|
|  4|       2|
|  2|       3|
+---+--------+



### Количество треугольников

In [43]:
# Print the triangleCount() result: the vertex rows with an added "count" column.
g.triangleCount().show()

+-----+---+-------+---+------+----------+
|count| id|   name|age|gender|university|
+-----+---+-------+---+------+----------+
|    0|  7|   Ilya| 29|     M|       MSU|
|    3|  3|Natasha| 27|     F|     SPbSU|
|    1|  5|   Oleg| 35|     M|      MIPT|
|    0|  6|   Ivan| 30|     M|       MSU|
|    2|  1|   Alex| 28|     M|      MIPT|
|    1|  4|  Pavel| 30|     M|      MIPT|
|    2|  2|  Emeli| 28|     F|      MIPT|
+-----+---+-------+---+------+----------+



### Компоненты связности

In [44]:
# A much sparser, one-directional edge list used for the connected-components
# example. Note that it replaces the previous `edges` DataFrame.
sparse_edge_rows = [
    (src, dst, "friend")
    for src, dst in (("1", "2"), ("1", "4"), ("3", "5"), ("3", "6"))
]
edges = spark.createDataFrame(sparse_edge_rows, ["src", "dst", "relationship"])

<img src="pics/graph_small_amount_of_links.png" width=500/>

In [45]:
# Rebuild the graph from the same vertices and the reduced edge list defined just above.
g = GraphFrame(vertices, edges)

In [46]:
# GraphFrames' connectedComponents() checkpoints intermediate results, so a
# checkpoint directory has to be configured before calling it.
# The SparkContext is taken from the session created earlier: this notebook
# never defines a standalone `sc`, so referring to it fails on a fresh kernel.
spark.sparkContext.setCheckpointDir("/user/natalya.pritykovskaya/")

# Label every vertex with the id of the component it belongs to and list the
# vertices grouped by that label.
result = g.connectedComponents()
result.select("id", "component").orderBy("component").show()

+---+-------------+
| id|    component|
+---+-------------+
|  7|  25769803776|
|  3| 154618822656|
|  5| 154618822656|
|  6| 154618822656|
|  1|1236950581248|
|  2|1236950581248|
|  4|1236950581248|
+---+-------------+



In [47]:
# Rebind g to the result of dropIsolatedVertices(); in the listing that follows, vertex 7 is no longer present.
g = g.dropIsolatedVertices()

In [48]:
# Print the vertices that remain in the graph.
g.vertices.show()

+---+-------+---+------+----------+
| id|   name|age|gender|university|
+---+-------+---+------+----------+
|  3|Natasha| 27|     F|     SPbSU|
|  5|   Oleg| 35|     M|      MIPT|
|  6|   Ivan| 30|     M|       MSU|
|  1|   Alex| 28|     M|      MIPT|
|  4|  Pavel| 30|     M|      MIPT|
|  2|  Emeli| 28|     F|      MIPT|
+---+-------+---+------+----------+



### Page rank

In [49]:
# Vertex table for the PageRank example (same seven people as in the first
# example): one row per person, identified by the string "id" column.
person_fields = ["id", "name", "age", "gender", "university"]
people = [
    ("1", "Alex", 28, "M", "MIPT"),
    ("2", "Emeli", 28, "F", "MIPT"),
    ("3", "Natasha", 27, "F", "SPbSU"),
    ("4", "Pavel", 30, "M", "MIPT"),
    ("5", "Oleg", 35, "M", "MIPT"),
    ("6", "Ivan", 30, "M", "MSU"),
    ("7", "Ilya", 29, "M", "MSU"),
]
vertices = spark.createDataFrame(people, person_fields)

In [50]:
# Restore the full bidirectional friendship list for the PageRank example
# (`edges` was overwritten with a sparser list in the connected-components
# section). Each undirected pair yields the forward edge followed by its mirror.
undirected_friendships = [
    ("1", "2"), ("1", "3"), ("1", "4"),
    ("2", "3"), ("2", "5"),
    ("3", "4"), ("3", "5"), ("3", "6"), ("3", "7"),
]
friendship_rows = [
    row
    for a, b in undirected_friendships
    for row in ((a, b, "friend"), (b, a, "friend"))
]
edges = spark.createDataFrame(friendship_rows, ["src", "dst", "relationship"])

In [51]:
# Build the graph used for the PageRank example from the DataFrames defined just above.
g = GraphFrame(vertices, edges)

In [52]:
# Run PageRank with the given tolerance and reset probability.
# The result exposes `vertices` and `edges`; the vertex table is printed here
# and carries a "pagerank" column.
results = g.pageRank(
    tol=0.01,
    resetProbability=0.15,
)
ranked_vertices = results.vertices
ranked_vertices.show()

+---+-------+---+------+----------+------------------+
| id|   name|age|gender|university|          pagerank|
+---+-------+---+------+----------+------------------+
|  1|   Alex| 28|     M|      MIPT| 1.122938128138013|
|  3|Natasha| 27|     F|     SPbSU|2.2125072379360793|
|  2|  Emeli| 28|     F|      MIPT| 1.122938128138013|
|  4|  Pavel| 30|     M|      MIPT|0.7933962989298501|
|  7|   Ilya| 29|     M|       MSU|0.4774119539640973|
|  6|   Ivan| 30|     M|       MSU|0.4774119539640973|
|  5|   Oleg| 35|     M|      MIPT|0.7933962989298501|
+---+-------+---+------+----------+------------------+



In [20]:
# Print the edges of the PageRank result; each edge row carries a "weight" column.
results.edges.show()

+---+---+------------+-------------------+
|src|dst|relationship|             weight|
+---+---+------------+-------------------+
|  1|  2|      friend| 0.3333333333333333|
|  2|  5|      friend| 0.3333333333333333|
|  1|  4|      friend| 0.3333333333333333|
|  6|  3|      friend|                1.0|
|  1|  3|      friend| 0.3333333333333333|
|  3|  7|      friend|0.16666666666666666|
|  3|  2|      friend|0.16666666666666666|
|  3|  6|      friend|0.16666666666666666|
|  4|  1|      friend|                0.5|
|  5|  3|      friend|                0.5|
|  3|  5|      friend|0.16666666666666666|
|  5|  2|      friend|                0.5|
|  7|  3|      friend|                1.0|
|  4|  3|      friend|                0.5|
|  2|  1|      friend| 0.3333333333333333|
|  3|  4|      friend|0.16666666666666666|
|  3|  1|      friend|0.16666666666666666|
|  2|  3|      friend| 0.3333333333333333|
+---+---+------------+-------------------+



In [21]:
# Introspection: list the attribute and method names available on the GraphFrame instance.
dir(g)

['DST',
 'ID',
 'SRC',
 '_ATTR',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_edges',
 '_jvm_gf_api',
 '_jvm_graph',
 '_sc',
 '_sqlContext',
 '_vertices',
 'aggregateMessages',
 'bfs',
 'cache',
 'connectedComponents',
 'degrees',
 'dropIsolatedVertices',
 'edges',
 'filterEdges',
 'filterVertices',
 'find',
 'inDegrees',
 'labelPropagation',
 'outDegrees',
 'pageRank',
 'parallelPersonalizedPageRank',
 'persist',
 'pregel',
 'shortestPaths',
 'stronglyConnectedComponents',
 'svdPlusPlus',
 'triangleCount',
 'triplets',
 'unpersist',
 'vertices']

# Положим реальный граф

In [53]:
# Location of the training graph that is read with the CSV reader further down.
graphPath = "/lectures/lecture04/trainGraph"
# NOTE(review): not referenced by any cell visible in this notebook.
usersToPredictPath = "/lectures/lecture04/prediction.csv"

In [54]:
from pyspark.sql.types import *

# Two columns per input line: an integer user id and the raw friends string,
# which is parsed in a later cell.
schema = (
    StructType()
    .add(StructField("user", IntegerType()))
    .add(StructField("friendsString", StringType()))
)

# Read the tab-delimited file(s) at graphPath using the explicit schema.
graph_reader = spark.read.format("csv").schema(schema).option("delimiter", "\t")
data = graph_reader.load(graphPath)

In [55]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, explode, collect_list, sort_array, size, split


def cutStartEndBrackets(s):
    """Return ``s`` without its first two and last two characters.

    Presumably the input looks like ``((a,b),(c,d))`` so that stripping two
    characters from each end removes the outer brackets -- TODO confirm
    against the data file.

    Args:
        s: The string to trim, or ``None``.

    Returns:
        The trimmed string, or ``None`` when ``s`` is ``None``.
    """
    # Spark hands SQL NULLs to Python UDFs as None; slicing None would raise
    # TypeError and fail the whole job, so propagate the null instead.
    if s is None:
        return None
    return s[2:-2]

# Wrap the plain-Python helper so it can be applied to a DataFrame column.
cutStartEndBracketsUDF = udf(cutStartEndBrackets, StringType())

# Directed (user -> friend) pairs parsed from the raw friends string:
#   1. trim the string with the UDF,
#   2. split it on the literal text "),(" (raw string, so the backslashes
#      reach the regex engine without relying on invalid Python escapes),
#   3. explode to one row per entry,
#   4. take the first comma-separated field of each entry as the friend id.
userFriend = (
    data
    .select(
        col("user"),
        split(cutStartEndBracketsUDF(col("friendsString")), r"\),\(").alias("friendsMasks"),
    )
    .withColumn("friendMask", explode("friendsMasks"))
    .withColumn("dst", split(col("friendMask"), ",")[0])
    .withColumn("src", col("user"))
    .select(col("src").cast("integer"), col("dst").cast("integer"))
)

# The same pairs with the endpoints swapped (friend -> user).
userFriendSymmetric = userFriend.select(
    col("dst").alias("src"),
    col("src").alias("dst"),
)

# union() concatenates by column position and keeps duplicates; it is the
# non-deprecated spelling of unionAll() in Spark 2.x.
edges = userFriend.union(userFriendSymmetric)

# Every id appears as a source at least once because each pair is present in
# both directions, so the distinct "src" values form the vertex set.
vertices = edges.select(col("src").alias("id")).distinct()

In [56]:
from graphframes import * 

# Build the graph from the vertices and symmetric edges derived from the input file.
g = GraphFrame(vertices, edges)

In [29]:
# Count the vertices whose in-degree is greater than 20.
g.inDegrees.filter("inDegree > 20").count()

319719

In [33]:
# Stop the Spark session created near the top of the notebook.
spark.stop()