### Setup

In [1]:
import sys
import pyspark.sql.functions as F
from graphframes import GraphFrame
from graphframes.examples import Graphs

In [2]:
print(sys.version)

3.6.8 (default, Aug  2 2019, 17:42:44) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-28)]


In [3]:
spark

In [4]:
# Set the checkpoint directory
sc.setCheckpointDir('/tmp')

### Data Import and Cleaning

The data analyzed here contains flight information for all domestic flights from 1987-2012. The raw data contains both airline and airport codes, and so additional lookup tables are imported to make results human-readable

In [5]:
# Read in airline codes mapping
airline_codes = spark.read.csv('s3://jornaya-ds-us-east-1-sandbox/csnyder/acname.csv', header=True) \
    .selectExpr('carrier_name', 'IATA as UNIQUE_CARRIER')
# Read in airport codes mapping
airport_codes = spark.read.csv('s3://jornaya-ds-us-east-1-sandbox/csnyder/airport_codes.csv', header=True)

In [6]:
# Read in the airline linkage data, select only subset of columns, join airline codes
air = spark.read.parquet('s3://jornaya-ds-us-east-1-sandbox/csnyder/graph_analytics_at_scale_parquet') \
    .select('YEAR', 'UNIQUE_CARRIER', 'FL_NUM', 'ORIGIN', 'ORIGIN_STATE_ABR', 'DEST', 'DEST_STATE_ABR',
            'CANCELLED', 'CARRIER_DELAY', 'WEATHER_DELAY') \
    .join(F.broadcast(airline_codes), on='UNIQUE_CARRIER', how='inner')

In [7]:
# delays are in minutes and are null when there is no delay. Fill nulls as 0.
air = air.na.fill({'CARRIER_DELAY': 0, 'WEATHER_DELAY': 0})

In [8]:
air.count()

147370411

In [9]:
air.show(5)

+--------------+----+------+------+----------------+----+--------------+---------+-------------+-------------+--------------------+
|UNIQUE_CARRIER|YEAR|FL_NUM|ORIGIN|ORIGIN_STATE_ABR|DEST|DEST_STATE_ABR|CANCELLED|CARRIER_DELAY|WEATHER_DELAY|        carrier_name|
+--------------+----+------+------+----------------+----+--------------+---------+-------------+-------------+--------------------+
|            CO|1989|  1059|   IAH|              TX| MCI|            MO|     0.00|            0|            0|Continental Airli...|
|            DL|1989|   224|   DFW|              TX| EWR|            NJ|     0.00|            0|            0| Delta Air Lines Inc|
|            NW|1989|   944|   MEM|              TN| MIA|            FL|     0.00|            0|            0|Northwest Airline...|
|            AA|1989|  1255|   IND|              IN| BNA|            TN|     0.00|            0|            0|American Airlines...|
|            DL|1989|   611|   ATL|              GA| TPA|            FL|    

#### Quick Summaries

Carriers with the most flights...

In [10]:
air \
    .groupBy('UNIQUE_CARRIER','carrier_name') \
    .count() \
    .orderBy(F.desc('count')) \
    .show(5, truncate=False)

+--------------+---------------------+--------+
|UNIQUE_CARRIER|carrier_name         |count   |
+--------------+---------------------+--------+
|WN            |Southwest Airlines Co|20529039|
|DL            |Delta Air Lines Inc  |19168060|
|AA            |American Airlines Inc|17140606|
|US            |Piedmont Airlines Inc|15709733|
|US            |US Airways Inc       |15709733|
+--------------+---------------------+--------+
only showing top 5 rows



Carriers with the least amount of flights...

In [11]:
air \
    .groupBy('UNIQUE_CARRIER', 'carrier_name') \
    .count() \
    .orderBy('count') \
    .show(5, truncate=False)

+--------------+----------------------------------+------+
|UNIQUE_CARRIER|carrier_name                      |count |
+--------------+----------------------------------+------+
|VX            |Virgin America Inc                |54742 |
|PS            |Ukraine International Airlines    |83617 |
|KH            |Aeko Kula Inc. DBA Aloha Air Cargo|154381|
|HA            |Hawaiian Airlines Inc             |555683|
|F9            |Frontier Airlines Inc             |670653|
+--------------+----------------------------------+------+
only showing top 5 rows



Carriers who had the highest flight delays (in total minutes) in 2012...

In [12]:
air \
    .filter('year = 2012') \
    .groupBy('carrier_name') \
    .agg(
        F.sum('CARRIER_DELAY').alias('total_minutes_delayed'),
        F.count('*').alias('flight_cnt'),
        F.avg(F.col('CARRIER_DELAY')).alias('avg_delay'),
        F.stddev(F.col('CARRIER_DELAY')).alias('std_delay')
    ) \
    .orderBy(F.desc('std_delay')) \
    .show(10, truncate=False)

+-------------------------------+---------------------+----------+------------------+------------------+
|carrier_name                   |total_minutes_delayed|flight_cnt|avg_delay         |std_delay         |
+-------------------------------+---------------------+----------+------------------+------------------+
|American Airlines Inc          |2140405.0            |525220    |4.075254179201097 |25.38504632379856 |
|Hawaiian Airlines Inc          |158544.0             |74109     |2.1393353034044447|24.449497095797234|
|Atlantic Southeast Airlines Inc|3441656.0            |740855    |4.645519028689825 |23.542207249896517|
|Delta Air Lines Inc            |1965386.0            |726879    |2.7038695573816276|21.81374694498361 |
|United Airlines Inc            |2117628.0            |531245    |3.9861608109252793|20.40891087487564 |
|Frontier Airlines Inc          |217305.0             |79255     |2.7418459403192226|16.569690679160402|
|Mesa Airlines Inc              |314822.0             |

### Creating the GraphFrame Object

In [13]:
# Edges
e = air.selectExpr('ORIGIN as src', 'DEST as dst', 'UNIQUE_CARRIER as carrier', 'YEAR as year') \
    .filter('src is not null') \
    .filter('dst is not null')
# Vertices
v0 = e.selectExpr('src as id').union(e.selectExpr('dst as id')).distinct()
v = v0.join(F.broadcast(airport_codes), v0.id==airport_codes.airport_code, how='inner')

In [14]:
e.explain()

== Physical Plan ==
*(2) Project [ORIGIN#34 AS src#300, DEST#36 AS dst#301, UNIQUE_CARRIER#32 AS carrier#302, YEAR#31 AS year#303]
+- *(2) BroadcastHashJoin [UNIQUE_CARRIER#32], [UNIQUE_CARRIER#14], Inner, BuildRight
   :- *(2) Project [YEAR#31, UNIQUE_CARRIER#32, ORIGIN#34, DEST#36]
   :  +- *(2) Filter ((isnotnull(ORIGIN#34) && isnotnull(DEST#36)) && isnotnull(UNIQUE_CARRIER#32))
   :     +- *(2) FileScan parquet [YEAR#31,UNIQUE_CARRIER#32,ORIGIN#34,DEST#36] Batched: true, Format: Parquet, Location: InMemoryFileIndex[s3://jornaya-ds-us-east-1-sandbox/csnyder/graph_analytics_at_scale_parquet], PartitionFilters: [], PushedFilters: [IsNotNull(ORIGIN), IsNotNull(DEST), IsNotNull(UNIQUE_CARRIER)], ReadSchema: struct<YEAR:string,UNIQUE_CARRIER:string,ORIGIN:string,DEST:string>
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]))
      +- *(1) Project [IATA#11 AS UNIQUE_CARRIER#14]
         +- *(1) Filter isnotnull(IATA#11)
            +- *(1) FileScan csv [IATA

Checkpointing the data speeds up subsequent queries by caching the data to disk and also by truncating the query plan

In [15]:
e = e.checkpoint()
v = v.checkpoint()

In [16]:
e.explain()

== Physical Plan ==
Scan ExistingRDD[src#300,dst#301,carrier#302,year#303]


In [17]:
# Create the Graphframe object
g = GraphFrame(v, e)

In [18]:
g.vertices.show(5)

+---+--------------------+------------+
| id|        airport_name|airport_code|
+---+--------------------+------------+
|BUR|             Burbank|         BUR|
|EUG|              Eugene|         EUG|
|PVD|Providence - T.F....|         PVD|
|DCA|Washington Nation...|         DCA|
|EVV|          Evansville|         EVV|
+---+--------------------+------------+
only showing top 5 rows



In [19]:
g.edges.show(5)

+---+---+-------+----+
|src|dst|carrier|year|
+---+---+-------+----+
|IAH|MCI|     CO|1989|
|DFW|EWR|     DL|1989|
|MEM|MIA|     NW|1989|
|IND|BNA|     AA|1989|
|ATL|TPA|     DL|1989|
+---+---+-------+----+
only showing top 5 rows



### InDegrees/OutDegrees

In [20]:
%%time
g.inDegrees \
    .join(F.broadcast(airport_codes), g.inDegrees.id==airport_codes.airport_code, how='inner') \
    .select('airport_name', 'id', 'inDegree') \
    .orderBy(F.desc('inDegree')) \
    .show(5, truncate=False)

+--------------------------------------------+---+--------+
|airport_name                                |id |inDegree|
+--------------------------------------------+---+--------+
|Atlanta Hartsfield International Airport    |ATL|7728136 |
|Chicago O'Hare International Airport Airport|ORD|6908830 |
|Dallas/Fort Worth International Airport     |DFW|5812248 |
|Charlotte/Douglas International Airport     |CLT|5150264 |
|Los Angeles International Airport           |LAX|4693998 |
+--------------------------------------------+---+--------+
only showing top 5 rows

CPU times: user 5.44 ms, sys: 0 ns, total: 5.44 ms
Wall time: 8.39 s


In [21]:
%%time
g.inDegrees \
    .join(F.broadcast(airport_codes), g.inDegrees.id==airport_codes.airport_code, how='inner') \
    .select('airport_name', 'id', 'inDegree') \
    .orderBy('inDegree') \
    .show(5, truncate=False)

+---------------------------------+---+--------+
|airport_name                     |id |inDegree|
+---------------------------------+---+--------+
|Western Nebraska Regional Airport|BFF|3       |
|Four Corners Regional Airport    |FMN|5       |
|Clarksburg                       |CKB|6       |
|Cheyenne                         |CYS|12      |
|Pueblo                           |PUB|14      |
+---------------------------------+---+--------+
only showing top 5 rows

CPU times: user 5.5 ms, sys: 0 ns, total: 5.5 ms
Wall time: 5.87 s


In [22]:
%%time
g.outDegrees \
    .join(F.broadcast(airport_codes), g.outDegrees.id==airport_codes.airport_code, how='inner') \
    .select('airport_name', 'id', 'outDegree') \
    .orderBy(F.desc('outDegree')) \
    .show(5, truncate=False)

+--------------------------------------------+---+---------+
|airport_name                                |id |outDegree|
+--------------------------------------------+---+---------+
|Atlanta Hartsfield International Airport    |ATL|7735252  |
|Chicago O'Hare International Airport Airport|ORD|6868280  |
|Dallas/Fort Worth International Airport     |DFW|5778054  |
|Charlotte/Douglas International Airport     |CLT|5139550  |
|Los Angeles International Airport           |LAX|4696246  |
+--------------------------------------------+---+---------+
only showing top 5 rows

CPU times: user 5.19 ms, sys: 1.16 ms, total: 6.35 ms
Wall time: 7.51 s


In [23]:
%%time
g.outDegrees \
    .join(F.broadcast(airport_codes), g.outDegrees.id==airport_codes.airport_code, how='inner') \
    .select('airport_name', 'id', 'outDegree') \
    .orderBy('outDegree') \
    .show(5, truncate=False)

+---------------------------------+---+---------+
|airport_name                     |id |outDegree|
+---------------------------------+---+---------+
|Western Nebraska Regional Airport|BFF|1        |
|Cheyenne                         |CYS|2        |
|Four Corners Regional Airport    |FMN|3        |
|Pueblo                           |PUB|5        |
|Clarksburg                       |CKB|8        |
+---------------------------------+---+---------+
only showing top 5 rows

CPU times: user 5.37 ms, sys: 285 µs, total: 5.65 ms
Wall time: 7.17 s


### SubGraphs

Let's take a look at only American Airlines data.

In [24]:
gAA = g.filterEdges("carrier = 'AA'")

What are the most common airports that American Airlines flights fly into?

In [25]:
%%time
gAA_inD_all = gAA.inDegrees \
    .join(F.broadcast(airport_codes), gAA.inDegrees.id==airport_codes.airport_code, how='inner') \
    .select('airport_name', 'id', 'inDegree')

inflightsum = gAA_inD_all.groupBy().sum('InDegree').collect()[0][0]

gAA_inD_all.withColumn('pct', F.col('inDegree') / inflightsum) \
    .orderBy(F.desc('inDegree')) \
    .show(5, truncate=False)

+--------------------------------------------+---+--------+--------------------+
|airport_name                                |id |inDegree|pct                 |
+--------------------------------------------+---+--------+--------------------+
|Dallas/Fort Worth International Airport     |DFW|3964518 |0.23744946473042466 |
|Chicago O'Hare International Airport Airport|ORD|2323124 |0.1391403823371222  |
|Miami International Airport                 |MIA|865459  |0.05183550088462925 |
|Los Angeles International Airport           |LAX|680257  |0.04074307659320111 |
|New York La Guardia Airport                 |LGA|442156  |0.026482337960716952|
+--------------------------------------------+---+--------+--------------------+
only showing top 5 rows

CPU times: user 12.5 ms, sys: 963 µs, total: 13.5 ms
Wall time: 12.6 s


What are the most common airports that American Airlines flights fly into in 2012?

In [26]:
%%time
gAA_2012 = gAA.filterEdges("year = '2012'")

gAA_inD_2012 = gAA_2012.inDegrees \
    .join(F.broadcast(airport_codes), gAA_2012.inDegrees.id==airport_codes.airport_code, how='inner') \
    .select('airport_name', 'id', 'inDegree')

inflightsum_2012 = gAA_inD_2012.groupBy().sum('InDegree').collect()[0][0]

gAA_inD_2012.withColumn('pct', F.col('inDegree') / inflightsum_2012) \
    .orderBy(F.desc('inDegree')) \
    .show(5, truncate=False)

+--------------------------------------------+---+--------+-------------------+
|airport_name                                |id |inDegree|pct                |
+--------------------------------------------+---+--------+-------------------+
|Dallas/Fort Worth International Airport     |DFW|152936  |0.2967654552404903 |
|Chicago O'Hare International Airport Airport|ORD|50360   |0.09772132346805913|
|Miami International Airport                 |MIA|46032   |0.0893230333971743 |
|Los Angeles International Airport           |LAX|29833   |0.05788959974230755|
|New York La Guardia Airport                 |LGA|15672   |0.03041081376869386|
+--------------------------------------------+---+--------+-------------------+
only showing top 5 rows

CPU times: user 14.5 ms, sys: 1.38 ms, total: 15.9 ms
Wall time: 14.9 s


### Algorithm Tour

#### Breadth-First Search (BFS)

In [27]:
%%time
g_bfs = g.filterEdges('year = 2012') \
    .bfs('id="PHL"', 'id="HNL"', maxPathLength=2)
g_bfs = g_bfs.checkpoint()

CPU times: user 13.5 ms, sys: 0 ns, total: 13.5 ms
Wall time: 1min 11s


In [76]:
g_bfsd = g_bfs.distinct()
g_bfsd = g_bfsd.checkpoint()

In [77]:
g_bfsd.drop('v1').show(10,truncate=False)

+------------------------+--------------------+--------------------+------------------------------------------+
|from                    |e0                  |e1                  |to                                        |
+------------------------+--------------------+--------------------+------------------------------------------+
|[PHL, Philadelphia, PHL]|[PHL, ORD, AA, 2012]|[ORD, HNL, UA, 2012]|[HNL, Honolulu International Airport, HNL]|
|[PHL, Philadelphia, PHL]|[PHL, ORD, US, 2012]|[ORD, HNL, AA, 2012]|[HNL, Honolulu International Airport, HNL]|
|[PHL, Philadelphia, PHL]|[PHL, LAX, DL, 2012]|[LAX, HNL, AA, 2012]|[HNL, Honolulu International Airport, HNL]|
|[PHL, Philadelphia, PHL]|[PHL, PHX, WN, 2012]|[PHX, HNL, US, 2012]|[HNL, Honolulu International Airport, HNL]|
|[PHL, Philadelphia, PHL]|[PHL, DEN, WN, 2012]|[DEN, HNL, UA, 2012]|[HNL, Honolulu International Airport, HNL]|
|[PHL, Philadelphia, PHL]|[PHL, DFW, US, 2012]|[DFW, HNL, AA, 2012]|[HNL, Honolulu International Airport

In [86]:
g_bfsd.drop('v1') \
    .filter(g_bfsd.e0['carrier'] == 'AA') \
    .filter(g_bfsd.e1['carrier'] == 'AA') \
    .show(10,truncate=False)

+------------------------+--------------------+--------------------+------------------------------------------+
|from                    |e0                  |e1                  |to                                        |
+------------------------+--------------------+--------------------+------------------------------------------+
|[PHL, Philadelphia, PHL]|[PHL, DFW, AA, 2012]|[DFW, HNL, AA, 2012]|[HNL, Honolulu International Airport, HNL]|
|[PHL, Philadelphia, PHL]|[PHL, ORD, AA, 2012]|[ORD, HNL, AA, 2012]|[HNL, Honolulu International Airport, HNL]|
+------------------------+--------------------+--------------------+------------------------------------------+



In [79]:
g_bfsd.count()

66

#### Connected Components

Let's run connected components on the entire dataset and see what happens.

In [30]:
%%time
g_cc = g.connectedComponents()
g_cc = g_cc.checkpoint()

CPU times: user 10.9 ms, sys: 1.53 ms, total: 12.4 ms
Wall time: 1min 9s


In [31]:
g_cc.printSchema()

root
 |-- id: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- airport_code: string (nullable = true)
 |-- component: long (nullable = true)



In [32]:
g_cc.groupBy('component').count().orderBy(F.desc('component')).show()

+-----------+-----+
|  component|count|
+-----------+-----+
|34359738368|  200|
+-----------+-----+



Not that interesting, but also expected. You would think that you can get to any airport from any airport in the US. Let's create a subgraph using just Virgin America (VX) flight data.

In [33]:
%%time
gVX = g.filterEdges('carrier = "VX"')
gVX_cc = gVX.connectedComponents()
gVX_cc = gVX_cc.checkpoint()

CPU times: user 7.63 ms, sys: 815 µs, total: 8.45 ms
Wall time: 32.9 s


In [34]:
gVX_cc.show(5)

+---+--------------------+------------+------------+
| id|        airport_name|airport_code|   component|
+---+--------------------+------------+------------+
|BUR|             Burbank|         BUR| 34359738368|
|EUG|              Eugene|         EUG| 85899345920|
|PVD|Providence - T.F....|         PVD|103079215104|
|DCA|Washington Nation...|         DCA|171798691840|
|EVV|          Evansville|         EVV|274877906944|
+---+--------------------+------------+------------+
only showing top 5 rows



In [35]:
gVX_cc.groupBy('component').count().orderBy(F.desc('count')).show(5)

+-------------+-----+
|    component|count|
+-------------+-----+
| 171798691840|   16|
|2310692405249|    1|
|6811818131456|    1|
|3504693313536|    1|
|6665789243392|    1|
+-------------+-----+
only showing top 5 rows



In [36]:
gVX_cc.filter('component = 171798691840') \
    .show(truncate=False)

+---+--------------------------------------------------+------------+------------+
|id |airport_name                                      |airport_code|component   |
+---+--------------------------------------------------+------------+------------+
|DCA|Washington National Airport                       |DCA         |171798691840|
|LAS|Las Vegas Las Vegas McCarran International Airport|LAS         |171798691840|
|LAX|Los Angeles International Airport                 |LAX         |171798691840|
|JFK|New York John F Kennedy International Airport     |JFK         |171798691840|
|BOS|Boston Logan International Airport                |BOS         |171798691840|
|SEA|Seattle Tacoma International Airport              |SEA         |171798691840|
|DFW|Dallas/Fort Worth International Airport           |DFW         |171798691840|
|ORD|Chicago O'Hare International Airport Airport      |ORD         |171798691840|
|SAN|San Diego                                         |SAN         |171798691840|
|PHL

#### Label Propagation Algorithm (LPA)

In [37]:
g_lpa = g.labelPropagation(maxIter=5)
g_lpa = g_lpa.checkpoint()

In [38]:
g_lpa.printSchema()

root
 |-- id: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- airport_code: string (nullable = true)
 |-- label: long (nullable = true)



In [39]:
g_lpa.show(5)

+---+--------------------+------------+-------------+
| id|        airport_name|airport_code|        label|
+---+--------------------+------------+-------------+
|MSY|New Orleans Inter...|         MSY|4655744548864|
|ROR|       Koror Airport|         ROR|1408749273088|
|FLG|           Flagstaff|         FLG|1589137899521|
|ABI|Abilene Regional ...|         ABI|4655744548864|
|GRB|           Green Bay|         GRB|4655744548864|
+---+--------------------+------------+-------------+
only showing top 5 rows



In [40]:
g_lpa.groupBy('label').count().orderBy(F.desc('count')).show(5)

+-------------+-----+
|        label|count|
+-------------+-----+
|4655744548864|  147|
|1589137899521|   37|
|5583457484800|    4|
|1408749273088|    3|
|4363686772736|    2|
+-------------+-----+
only showing top 5 rows



In [45]:
g_lpa.filter('label = 4655744548864') \
    .show(10, truncate=False)

+---+---------------------------------------+------------+-------------+
|id |airport_name                           |airport_code|label        |
+---+---------------------------------------+------------+-------------+
|MSY|New Orleans International Airport      |MSY         |4655744548864|
|ABI|Abilene Regional Airport               |ABI         |4655744548864|
|GRB|Green Bay                              |GRB         |4655744548864|
|HRL|Valley International Airport           |HRL         |4655744548864|
|MOB|Mobile                                 |MOB         |4655744548864|
|ISP|Islip                                  |ISP         |4655744548864|
|DFW|Dallas/Fort Worth International Airport|DFW         |4655744548864|
|PIA|Peoria                                 |PIA         |4655744548864|
|DEN|Denver International Airport           |DEN         |4655744548864|
|EVV|Evansville                             |EVV         |4655744548864|
+---+---------------------------------------+------

In [46]:
g_lpa.filter('label = 1589137899521') \
    .show(10, truncate=False)

+---+------------------------------------+------------+-------------+
|id |airport_name                        |airport_code|label        |
+---+------------------------------------+------------+-------------+
|FLG|Flagstaff                           |FLG         |1589137899521|
|LAX|Los Angeles International Airport   |LAX         |1589137899521|
|BUR|Burbank                             |BUR         |1589137899521|
|SLE|Salem                               |SLE         |1589137899521|
|SEA|Seattle Tacoma International Airport|SEA         |1589137899521|
|ABQ|Albuquerque International Airport   |ABQ         |1589137899521|
|FAI|Fairbanks International Airport     |FAI         |1589137899521|
|FAT|Fresno                              |FAT         |1589137899521|
|GST|Gustavus Airport                    |GST         |1589137899521|
|SJC|San Jose                            |SJC         |1589137899521|
+---+------------------------------------+------------+-------------+
only showing top 10 

In [47]:
g_lpa.filter('label = 5583457484800') \
    .show(10, truncate=False)

+---+------------+------------+-------------+
|id |airport_name|airport_code|label        |
+---+------------+------------+-------------+
|ITO|Hilo        |ITO         |5583457484800|
|KOA|Kailua      |KOA         |5583457484800|
|LIH|Lihue       |LIH         |5583457484800|
|OGG|Kahului     |OGG         |5583457484800|
+---+------------+------------+-------------+



In [48]:
g_lpa.filter('label = 1408749273088') \
    .show(10, truncate=False)

+---+----------------------------+------------+-------------+
|id |airport_name                |airport_code|label        |
+---+----------------------------+------------+-------------+
|ROR|Koror Airport               |ROR         |1408749273088|
|ROP|Rota International Airport  |ROP         |1408749273088|
|SPN|Saipan International Airport|SPN         |1408749273088|
+---+----------------------------+------------+-------------+



#### PageRank

In [64]:
%%time
g_pr = g.pageRank(resetProbability=0.15, tol=0.01)
g_pr_v = g_pr.vertices.checkpoint()

CPU times: user 23.9 ms, sys: 0 ns, total: 23.9 ms
Wall time: 2min 4s


In [53]:
g_pr_v.orderBy(F.desc('pagerank')).show(5, truncate=False)

+---+--------------------------------------------+------------+------------------+
|id |airport_name                                |airport_code|pagerank          |
+---+--------------------------------------------+------------+------------------+
|ATL|Atlanta Hartsfield International Airport    |ATL         |9.434314446171609 |
|ORD|Chicago O'Hare International Airport Airport|ORD         |8.05501568394736  |
|DFW|Dallas/Fort Worth International Airport     |DFW         |6.464585821208273 |
|CLT|Charlotte/Douglas International Airport     |CLT         |5.874877360337312 |
|DEN|Denver International Airport                |DEN         |5.4595071702127775|
+---+--------------------------------------------+------------+------------------+
only showing top 5 rows



In [65]:
%%time
g_pr_10 = g.pageRank(resetProbability=0.15, maxIter=5)
g_pr_10_v = g_pr_10.vertices.checkpoint()

CPU times: user 8.08 ms, sys: 141 µs, total: 8.22 ms
Wall time: 30.8 s


In [54]:
g_pr_10_v.orderBy(F.desc('pagerank')).show(5, truncate=False)

+---+--------------------------------------------+------------+-----------------+
|id |airport_name                                |airport_code|pagerank         |
+---+--------------------------------------------+------------+-----------------+
|ATL|Atlanta Hartsfield International Airport    |ATL         |9.92198975167278 |
|ORD|Chicago O'Hare International Airport Airport|ORD         |8.337957724192721|
|DFW|Dallas/Fort Worth International Airport     |DFW         |6.621507021383636|
|CLT|Charlotte/Douglas International Airport     |CLT         |6.133078023243028|
|DEN|Denver International Airport                |DEN         |5.585645053345459|
+---+--------------------------------------------+------------+-----------------+
only showing top 5 rows



In [66]:
%%time
g_pr_ORD = g.pageRank(resetProbability=0.15, maxIter=10, sourceId="ORD")
g_pr_ORD_v = g_pr_ORD.vertices.checkpoint()

CPU times: user 10.1 ms, sys: 752 µs, total: 10.8 ms
Wall time: 48.7 s


In [55]:
g_pr_ORD_v.orderBy(F.desc('pagerank')).show(5, truncate=False)

+---+--------------------------------------------+------------+--------------------+
|id |airport_name                                |airport_code|pagerank            |
+---+--------------------------------------------+------------+--------------------+
|ORD|Chicago O'Hare International Airport Airport|ORD         |0.18818840558163535 |
|ATL|Atlanta Hartsfield International Airport    |ATL         |0.041246827043181576|
|DFW|Dallas/Fort Worth International Airport     |DFW         |0.03318226313700795 |
|CLT|Charlotte/Douglas International Airport     |CLT         |0.028625402842400737|
|LAX|Los Angeles International Airport           |LAX         |0.026280308465150597|
+---+--------------------------------------------+------------+--------------------+
only showing top 5 rows



In [67]:
%%time
g_pr_parellel = g.parallelPersonalizedPageRank(resetProbability=0.15, sourceIds=["ORD", "ROP", "PHL"], maxIter=10)
g_pr_parellel_v = g_pr_parellel.vertices.checkpoint()

CPU times: user 9.43 ms, sys: 4.17 ms, total: 13.6 ms
Wall time: 1min 2s


In [63]:
g_pr_parellel_v.orderBy(F.desc('pageranks')).drop('id').show(5, truncate=False)

+--------------------------------------------+------------+----------------------------------------------------------------+
|airport_name                                |airport_code|pageranks                                                       |
+--------------------------------------------+------------+----------------------------------------------------------------+
|Chicago O'Hare International Airport Airport|ORD         |[0.18818840558163524,0.008767609606109426,0.04383591685089005]  |
|Atlanta Hartsfield International Airport    |ATL         |[0.04124682704318155,0.006536540689875266,0.045808350454769525] |
|Dallas/Fort Worth International Airport     |DFW         |[0.03318226313700794,0.008252830023793687,0.0314962640330986]   |
|Charlotte/Douglas International Airport     |CLT         |[0.028625402842400727,0.0038911847897854937,0.03405932281634526]|
|Los Angeles International Airport           |LAX         |[0.02628030846515059,0.019928008470075413,0.022929362677678013] |


#### Shortest Path

In [57]:
%%time
g_sp = g.shortestPaths(landmarks=["PHL"])
g_sp = g_sp.checkpoint()

CPU times: user 7.29 ms, sys: 579 µs, total: 7.87 ms
Wall time: 36.1 s


In [58]:
g_sp.show(5)

+---+--------------------+------------+----------+
| id|        airport_name|airport_code| distances|
+---+--------------------+------------+----------+
|MSY|New Orleans Inter...|         MSY|[PHL -> 1]|
|ROR|       Koror Airport|         ROR|[PHL -> 4]|
|FLG|           Flagstaff|         FLG|[PHL -> 2]|
|ABI|Abilene Regional ...|         ABI|[PHL -> 2]|
|GRB|           Green Bay|         GRB|[PHL -> 2]|
+---+--------------------+------------+----------+
only showing top 5 rows



In [59]:
g_sp.printSchema()

root
 |-- id: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- airport_code: string (nullable = true)
 |-- distances: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)



In [60]:
g_sp.withColumn('distance', F.map_values(F.col('distances'))) \
    .orderBy(F.desc('distance')).show(20, truncate=False)

+---+-----------------------------------+------------+----------+--------+
|id |airport_name                       |airport_code|distances |distance|
+---+-----------------------------------+------------+----------+--------+
|YAP|Yap International Airport          |YAP         |[PHL -> 4]|[4]     |
|ROR|Koror Airport                      |ROR         |[PHL -> 4]|[4]     |
|ROP|Rota International Airport         |ROP         |[PHL -> 4]|[4]     |
|SPN|Saipan International Airport       |SPN         |[PHL -> 4]|[4]     |
|MKK|Molokai Airport                    |MKK         |[PHL -> 3]|[3]     |
|GST|Gustavus Airport                   |GST         |[PHL -> 3]|[3]     |
|LNY|Lanai Airport                      |LNY         |[PHL -> 3]|[3]     |
|BFF|Western Nebraska Regional Airport  |BFF         |[PHL -> 3]|[3]     |
|GUM|Guam International Airport         |GUM         |[PHL -> 3]|[3]     |
|WRG|Wrangell Airport                   |WRG         |[PHL -> 3]|[3]     |
|PSG|Petersburg James A. 

#### Triangle Count

In [61]:
%%time
g_tr = g.triangleCount()
g_tr = g_tr.checkpoint()

CPU times: user 4.02 ms, sys: 1.91 ms, total: 5.93 ms
Wall time: 30.5 s


In [62]:
g_tr.orderBy(F.desc('count')) \
    .drop('id') \
    .show(5, truncate=False)

+-----+--------------------------------------------+------------+
|count|airport_name                                |airport_code|
+-----+--------------------------------------------+------------+
|3248 |Chicago O'Hare International Airport Airport|ORD         |
|3238 |Atlanta Hartsfield International Airport    |ATL         |
|2965 |Cincinnati                                  |CVG         |
|2765 |Detroit Metropolitan Airport                |DTW         |
|2722 |Minneapolis/St.Paul International Airport   |MSP         |
+-----+--------------------------------------------+------------+
only showing top 5 rows

