# Creating the GraphFrame

In [1]:
import pyspark
from delta import configure_spark_with_delta_pip

import pyspark.sql.functions as F
from pyspark.sql.functions import col, array, explode, sum as Fsum
from pyspark.sql.functions import array_intersect, size


# Prepare the Spark builder with the Delta SQL extension and Delta catalog settings.
builder = (
    pyspark.sql.SparkSession.builder.appName("Graphs_Practice")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Let the Delta helper finish configuring the builder, additionally requesting
# the GraphFrames package, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["graphframes:graphframes:0.8.4-spark3.5-s_2.12"],
).getOrCreate()

# Use as many shuffle partitions as the context's default parallelism.
# Read it through the public `sparkContext` accessor rather than the private `_sc` attribute.
spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)  # OK for exploration, not great for performance
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)
import graphframes as gf



In [3]:
# Load the 2009 flights CSV: the first row is the header and column types are inferred.
flights_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("input/2009.csv")
)
# Preview the first ten rows only.
display(flights_df.limit(10))

FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 27
2009-01-01,XE,1204,DCA,EWR,1100,1058.0,-2.0,18.0,1116.0,1158.0,8.0,1202,1206.0,4.0,0.0,,0.0,62.0,68.0,42.0,199.0,,,,,,
2009-01-01,XE,1206,EWR,IAD,1510,1509.0,-1.0,28.0,1537.0,1620.0,4.0,1632,1624.0,-8.0,0.0,,0.0,82.0,75.0,43.0,213.0,,,,,,
2009-01-01,XE,1207,EWR,DCA,1100,1059.0,-1.0,20.0,1119.0,1155.0,6.0,1210,1201.0,-9.0,0.0,,0.0,70.0,62.0,36.0,199.0,,,,,,
2009-01-01,XE,1208,DCA,EWR,1240,1249.0,9.0,10.0,1259.0,1336.0,9.0,1357,1345.0,-12.0,0.0,,0.0,77.0,56.0,37.0,199.0,,,,,,
2009-01-01,XE,1209,IAD,EWR,1715,1705.0,-10.0,24.0,1729.0,1809.0,13.0,1900,1822.0,-38.0,0.0,,0.0,105.0,77.0,40.0,213.0,,,,,,
2009-01-01,XE,1212,ATL,EWR,1915,1913.0,-2.0,19.0,1932.0,2108.0,15.0,2142,2123.0,-19.0,0.0,,0.0,147.0,130.0,96.0,745.0,,,,,,
2009-01-01,XE,1212,CLE,ATL,1645,1637.0,-8.0,12.0,1649.0,1820.0,5.0,1842,1825.0,-17.0,0.0,,0.0,117.0,108.0,91.0,554.0,,,,,,
2009-01-01,XE,1214,DCA,EWR,1915,1908.0,-7.0,9.0,1917.0,1953.0,34.0,2035,2027.0,-8.0,0.0,,0.0,80.0,79.0,36.0,199.0,,,,,,
2009-01-01,XE,1215,EWR,DCA,1715,1710.0,-5.0,28.0,1738.0,1819.0,4.0,1838,1823.0,-15.0,0.0,,0.0,83.0,73.0,41.0,199.0,,,,,,
2009-01-01,XE,1217,EWR,DCA,1300,1255.0,-5.0,15.0,1310.0,1349.0,7.0,1408,1356.0,-12.0,0.0,,0.0,68.0,61.0,39.0,199.0,,,,,,


In [4]:

# Airport codes seen as a departure point and as an arrival point, both exposed as column `id`.
origin_airports = flights_df.selectExpr("ORIGIN as id")
dest_airports = flights_df.selectExpr("DEST as id")

# The vertex set is every airport that occurs in either column, listed once.
vertices = origin_airports.union(dest_airports).distinct()

In [5]:
# One directed edge per (origin, destination) pair. Repeated flights on the same
# route are collapsed so that degrees and triangles are computed over routes,
# not over individual flights.
edges = flights_df.selectExpr("ORIGIN as src", "DEST as dst").dropDuplicates()


In [6]:
# Mark both inputs for caching; they are reused by the later cells.
vertices.cache()
edges.cache()

# Build the graph from the airport vertices and the de-duplicated route edges.
flights_graph = gf.GraphFrame(vertices, edges)
display(flights_graph)

GraphFrame(v:[id: string], e:[src: string, dst: string])

# Query 1

In [6]:
# Degree calculations on the de-duplicated route edges.

# Number of distinct incoming routes per airport.
in_degree = (
    edges.groupBy("dst").count()
    .withColumnRenamed("dst", "id")
    .withColumnRenamed("count", "inDegree")
)

# Number of distinct outgoing routes per airport.
out_degree = (
    edges.groupBy("src").count()
    .withColumnRenamed("src", "id")
    .withColumnRenamed("count", "outDegree")
)

# The outer join keeps airports that have only incoming or only outgoing routes.
# Such an airport is absent from one side, so its degree there comes back as null;
# fill it with 0 so that totalDegree is a number instead of null.
total_degrees = (
    in_degree.join(out_degree, "id", "outer")
    .na.fill(0, subset=["inDegree", "outDegree"])
    .withColumn("totalDegree", F.col("inDegree") + F.col("outDegree"))
)

In [7]:
# Manual triangle count on the undirected version of the route graph.

# A triangle needs three distinct airports, so drop self-loops (src == dst) first.
# Otherwise an airport would appear in its own neighbour set and inflate the
# common-neighbour counts computed below.
simple_edges = edges.filter(F.col("src") != F.col("dst"))

# Add the reverse of every edge and de-duplicate, so that direction is ignored.
undirected_edges = simple_edges.union(
    simple_edges.selectExpr("dst as src", "src as dst")
).distinct()

# For each airport, the set of airports it is directly connected to.
neighbours = undirected_edges.groupBy("src").agg(F.collect_set("dst").alias("neighbors"))
neighbours.cache()

# Attach the neighbour sets of both endpoints to every undirected edge.
dst_neighbours = (
    neighbours
    .withColumnRenamed("src", "dst")
    .withColumnRenamed("neighbors", "dst_neighbors")
)
joined = (
    undirected_edges
    .join(neighbours, on="src")
    .join(dst_neighbours, on="dst")
)

# An airport adjacent to both endpoints of an edge closes a triangle over that edge.
with_common = joined.withColumn(
    "common_neighbors", size(array_intersect("neighbors", "dst_neighbors"))
)

# Every triangle through `src` is seen over two of its edges, hence the division by 2.
# The columns are named (id, count) so the rows can be compared directly with the
# output of GraphFrames' triangleCount in the validation cell.
triangle_counts = (
    with_common.groupBy("src")
    .agg((F.sum("common_neighbors") / 2).cast("int").alias("count"))
    .withColumnRenamed("src", "id")
)


In [8]:
# VALIDATION ONLY: cross-check the manual results against GraphFrames' built-in metrics.
to_test = True  # set to False to skip the checks
if to_test:
    in_degrees = flights_graph.inDegrees
    out_degrees = flights_graph.outDegrees

    # Rows are sorted on both sides because collect() order is not guaranteed.
    assert sorted(in_degree.collect()) == sorted(in_degrees.collect())
    assert sorted(out_degree.collect()) == sorted(out_degrees.collect())

    # Select (id, count) explicitly so the column order matches the manual result.
    triangles = flights_graph.triangleCount().select("id", "count")
    assert sorted(triangle_counts.collect()) == sorted(triangles.collect())

In [9]:
# Manual per-airport triangle counts.
triangle_counts.show()

# `triangles` (the GraphFrames reference result) is only created when the
# validation cell ran with to_test enabled; skip it otherwise instead of
# failing with a NameError.
if to_test:
    triangles.show()

# Query 1 answer: degrees per airport with the triangle count attached.
# The left join keeps every airport that has a degree row.
answer = total_degrees.join(triangle_counts, "id", "left")
answer.show()

+---+-----+
| id|count|
+---+-----+
|DCA|  673|
|EWR| 1165|
|ATL| 1761|
|ORD| 1678|
|JAN|   36|
|IAH| 1338|
|OKC|  165|
|ABQ|  311|
|IND|  612|
|COS|   45|
|GSO|   54|
|ELP|   97|
|MFE|    3|
|OMA|  142|
|DAL|   73|
|GRR|   96|
|JAX|  342|
|CLT| 1049|
|GSP|   54|
|BTR|    6|
+---+-----+
only showing top 20 rows

+---+-----+
| id|count|
+---+-----+
|DCA|  673|
|EWR| 1165|
|ATL| 1761|
|ORD| 1678|
|LFT|    6|
|BWI| 1003|
|GSO|   54|
|JAN|   36|
|IAH| 1338|
|JAX|  342|
|OKC|  165|
|CHS|   90|
|ABQ|  311|
|IND|  612|
|COS|   45|
|CLT| 1049|
|CVG| 1261|
|ELP|   97|
|GRR|   96|
|CMH|  392|
+---+-----+
only showing top 20 rows

+---+--------+---------+-----------+-----+
| id|inDegree|outDegree|totalDegree|count|
+---+--------+---------+-----------+-----+
|ABQ|      32|       31|         63|  311|
|ABY|       1|        1|          2|    0|
|ACK|       2|        1|          3|    1|
|ACV|       5|        4|          9|    6|
|ADK|       1|        1|          2|    0|
|ADQ|       1|        1|    

# Query 2

In [10]:
# Each triangle is counted once at each of its three airports, so the sum of the
# per-airport counts is three times the number of triangles in the graph.
total_triangles_expr = (F.sum("count") / 3).alias("total num. of triangles")
count = triangle_counts.agg(total_triangles_expr)
count.show()

+-----------------------+
|total num. of triangles|
+-----------------------+
|                16015.0|
+-----------------------+



# Query 3

In [8]:
# Run PageRank for a fixed 10 iterations with reset probability 0.15.
flights_pagerank = flights_graph.pageRank(resetProbability=0.15, maxIter=10)

# Show the airports from highest to lowest PageRank score.
ranked_airports = flights_pagerank.vertices.orderBy(F.col("pagerank").desc())
display(ranked_airports)

id,pagerank
ATL,11.335268868250692
DFW,8.987200141119203
ORD,8.330832084011147
DTW,7.722346489398374
MSP,7.59298573802231
DEN,7.026287164606544
SLC,6.7644947814875325
IAH,5.6623216057564845
LAX,5.070949815830091
CVG,4.856791376084717
