# Integrating GraphFrames package
![footer_logo_new](images/logo_new.png)

One option is to set the `PYSPARK_SUBMIT_ARGS` environment variable. With it, every session will be initialized with the specified package.

In [None]:
import os

# Arguments handed to spark-submit when PySpark starts; set this before the
# first session is created. Adjacent literals are concatenated into a single
# space-separated string.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--driver-memory 2g '
    '--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12 '
    'pyspark-shell'
)

A better option is to set the `spark.jars.packages` configuration option on the session builder.

In [None]:
from pyspark import StorageLevel
from pyspark.sql import functions as F, SQLContext, SparkSession, Window
from pyspark.sql.types import *
from random import randint
import time
import datetime

# Create (or reuse) the Spark session for this notebook. The GraphFrames
# artifact is requested through `spark.jars.packages`, so it is configured
# per session rather than through the process environment.
spark = SparkSession.builder \
    .appName("graphframes") \
    .master("spark://spark-master:7077") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.1-spark3.0-s_2.12") \
    .enableHiveSupport() \
    .getOrCreate()

In [None]:
import sys
# Echo the interpreter's module search path (the notebook displays the value
# of a cell's last expression).
sys.path

In [None]:
import graphframes

# List the names exposed by the graphframes.graphframe module; a quick check
# that the package can be imported in this session.
dir(graphframes.graphframe)

In [None]:
from pyspark.sql.functions import *

# Vertex DataFrame: one row per person, uniquely keyed by the "id" column.
v = spark.createDataFrame(
    data=[
        ("a", "Alice", 34),
        ("b", "Bob", 36),
        ("c", "Charlie", 30),
    ],
    schema=["id", "name", "age"],
)

# Edge DataFrame: one row per directed relationship; "src" and "dst" hold
# vertex ids.
e = spark.createDataFrame(
    data=[
        ("a", "b", "friend"),
        ("b", "c", "follow"),
        ("c", "b", "follow"),
    ],
    schema=["src", "dst", "relationship"],
)

In [None]:
from graphframes import *
# Build the graph from the vertex DataFrame `v` and the edge DataFrame `e`.
g = GraphFrame(v, e)

In [None]:
# Print the number of incoming edges for each vertex id.
g.inDegrees.show()

In [None]:
# Count the edges whose relationship is 'follow'; the notebook echoes the
# resulting integer.
g.edges \
    .where("relationship = 'follow'") \
    .count()

In [None]:
# Run PageRank for a fixed 20 iterations with a reset probability of 0.01,
# then print each vertex id next to its score.
results = g.pageRank(maxIter=20, resetProbability=0.01)
results.vertices \
    .select("id", "pagerank") \
    .show()

## Bike Rides

In [None]:
# Load the station and trip CSV files, treating the first line of each as the
# header, and print the resulting schemas. No schema inference is requested,
# so columns are read as strings.
bikeStations = spark.read.csv("data/graphs/station.csv", header=True)
bikeStations.printSchema()

tripData = spark.read.csv("data/graphs/trip.csv", header=True)
tripData.printSchema()

### Prepare Vertices

In [None]:
# GraphFrames identifies vertices by a column named "id" and matches edge
# "src"/"dst" values against it. The edges in this notebook are built from
# start_station_name / end_station_name, so the vertices must be keyed by the
# station *name*; otherwise edge endpoints do not resolve to any vertex.
if "name" in bikeStations.columns:
    stationVertices = (
        bikeStations
        # Free up "id" if the file already ships one (withColumnRenamed is a
        # no-op when the column does not exist).
        .withColumnRenamed("id", "station_id")
        .withColumnRenamed("name", "id")
        .distinct()
    )
else:
    # NOTE(review): no "name" column in the station file; keep the rows as
    # they are and confirm that "id" already holds the station name.
    stationVertices = bikeStations.distinct()
stationVertices.show(truncate=False)

### Prepare Edges

In [None]:
# Turn each trip into a directed edge: the start station name becomes "src"
# and the end station name becomes "dst". All other trip columns are kept as
# edge attributes.
tripEdges = (
    tripData
    .withColumnRenamed("start_station_name", "src")
    .withColumnRenamed("end_station_name", "dst")
)

tripEdges.show(truncate=False)

### Initialize the GraphFrame

In [None]:
# Build the bike-share graph: stations are the vertices, trips are the edges.
stationGraph = GraphFrame(stationVertices, tripEdges)

### Simple Graph computations

In [None]:
# Report the graph's size and compare the edge count with the raw trip count.
print(f"Total Number of Stations: {stationGraph.vertices.count()}")
print(f"Total Number of Trips in Graph: {stationGraph.edges.count()}")
print(f"Total Number of Trips in Original Data: {tripData.count()}")

### Most popular trips

In [None]:
# Count trips per (start station, end station) pair and keep the ten most
# frequent pairs.
topTrips = (
    stationGraph.edges
    .groupBy("src", "dst")
    .count()
    .orderBy("count", ascending=False)
    .limit(10)
)

topTrips.show(truncate=False)

### In Degree

In [None]:
# Incoming trips per station; print the five stations with the most arrivals.
inDeg = stationGraph.inDegrees
(
    inDeg
    .orderBy("inDegree", ascending=False)
    .limit(5)
    .show(truncate=False)
)

### Out Degree

In [None]:
# Outgoing trips per station; print the five stations with the most departures.
outDeg = stationGraph.outDegrees
(
    outDeg
    .orderBy("outDegree", ascending=False)
    .limit(5)
    .show(truncate=False)
)

### Degree Ratio

In [None]:
# Ratio of arrivals to departures per station. The inner equi-join on "id"
# keeps a single id column and only stations present in both degree tables.
degreeRatio = (
    inDeg
    .join(outDeg, on="id", how="inner")
    .selectExpr("id", "double(inDegree)/double(outDegree) as degreeRatio")
)

# Mark the result for caching; it is sorted twice in the cells that follow.
degreeRatio.cache()

#### Descending

In [None]:
# Ten stations with the highest in/out ratio (more arrivals than departures).
(
    degreeRatio
    .orderBy("degreeRatio", ascending=False)
    .limit(10)
    .show(truncate=False)
)

#### Ascending

In [None]:
# Ten stations with the lowest in/out ratio (more departures than arrivals).
(
    degreeRatio
    .orderBy("degreeRatio", ascending=True)
    .limit(10)
    .show(truncate=False)
)

In [None]:
# Shut down the Spark session at the end of the notebook.
spark.stop()