### Setup

In [1]:
import sys
import pyspark.sql.functions as F
from graphframes import GraphFrame
from graphframes.examples import Graphs

In [2]:
# Record the Python interpreter version this notebook was executed with.
print(sys.version)

3.6.8 (default, Aug  2 2019, 17:42:44) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-28)]


In [3]:
# Echo the session object to confirm it is available.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably it is
# injected by the kernel; confirm before running in another environment.
spark

In [4]:
# Set the checkpoint directory; the `.checkpoint()` calls later in this notebook
# write their data there.
# NOTE(review): `sc` is not defined in this notebook — presumably the SparkContext
# supplied by the kernel; confirm.
sc.setCheckpointDir('/tmp')

### Data Import and Cleaning

The data analyzed here contains flight information for all domestic flights from 1987-2012. The raw data contains both airline and airport codes, so additional lookup tables are imported to make the results human-readable.

In [5]:
# Carrier lookup: carrier code -> carrier name. The code column is renamed to
# OP_UNIQUE_CARRIER so it can be joined directly against the flight data.
carrier_codes = (
    spark.read
    .csv('s3://jornaya-ds-us-east-1-sandbox/csnyder/L_UNIQUE_CARRIERS.csv', header=True)
    .select(
        F.col('Code').alias('OP_UNIQUE_CARRIER'),
        F.col('Description').alias('carrier_name'),
    )
)

# Airport lookup: the description is split on ': ' and its first two pieces are
# exposed as airport_city and airport_name.
airport_codes = (
    spark.read
    .csv('s3://jornaya-ds-us-east-1-sandbox/csnyder/L_AIRPORT.csv', header=True)
    .select('Code', F.split(F.col('Description'), ': ').alias('darray'))
    .select(
        'Code',
        F.col('darray').getItem(0).alias('airport_city'),
        F.col('darray').getItem(1).alias('airport_name'),
    )
)

In [65]:
# Preview the first five rows of the carrier lookup table.
carrier_codes.limit(5).toPandas()

Unnamed: 0,OP_UNIQUE_CARRIER,carrier_name
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,05Q,"Comlux Aviation, AG"
3,06Q,Master Top Linhas Aereas Ltd.
4,07Q,Flair Airlines Ltd.


In [66]:
# Preview the first five rows of the airport lookup table.
airport_codes.limit(5).toPandas()

Unnamed: 0,Code,airport_city,airport_name
0,01A,"Afognak Lake, AK",Afognak Lake Airport
1,03A,"Granite Mountain, AK",Bear Creek Mining Strip
2,04A,"Lik, AK",Lik Mining Camp
3,05A,"Little Squaw, AK",Little Squaw Airport
4,06A,"Kizhuyak, AK",Kizhuyak Bay


In [8]:
# Load the flight records, keep only the columns used in this analysis, and attach
# the carrier name. The inner join drops flights whose carrier code is missing
# from the (broadcast) lookup table.
air = (
    spark.read
    .csv('s3://jornaya-ds-us-east-1-sandbox/csnyder/dot_carrier_data', header=True)
    .select(
        'FL_DATE',
        'OP_UNIQUE_CARRIER',
        'ORIGIN',
        'ORIGIN_STATE_ABR',
        'DEST',
        'DEST_STATE_ABR',
        'CANCELLED',
        'CARRIER_DELAY',
        'WEATHER_DELAY',
        'year',
    )
    .join(F.broadcast(carrier_codes), on='OP_UNIQUE_CARRIER', how='inner')
)

In [9]:
# Delays are recorded in minutes and left null when a flight had no delay.
# Replace those nulls with 0 so every flight contributes to the delay aggregates.
air = air.fillna({delay_col: 0 for delay_col in ('CARRIER_DELAY', 'WEATHER_DELAY')})

In [10]:
# Total number of flight rows that remain after the carrier join.
air.count()

34426945

In [11]:
# Preview ten rows of the joined flight data.
air.limit(10).toPandas()

Unnamed: 0,OP_UNIQUE_CARRIER,FL_DATE,ORIGIN,ORIGIN_STATE_ABR,DEST,DEST_STATE_ABR,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,year,carrier_name
0,AA,2016-10-01,JFK,NY,LAX,CA,0.0,0.0,0.0,2016,American Airlines Inc.
1,AA,2016-10-01,LAX,CA,JFK,NY,0.0,0.0,0.0,2016,American Airlines Inc.
2,AA,2016-10-01,JFK,NY,LAX,CA,0.0,0.0,0.0,2016,American Airlines Inc.
3,AA,2016-10-01,DFW,TX,HNL,HI,0.0,42.0,0.0,2016,American Airlines Inc.
4,AA,2016-10-01,OKC,OK,DFW,TX,0.0,0.0,0.0,2016,American Airlines Inc.
5,AA,2016-10-01,OGG,HI,DFW,TX,0.0,0.0,0.0,2016,American Airlines Inc.
6,AA,2016-10-01,DFW,TX,OGG,HI,0.0,0.0,0.0,2016,American Airlines Inc.
7,AA,2016-10-01,HNL,HI,DFW,TX,0.0,0.0,0.0,2016,American Airlines Inc.
8,AA,2016-10-01,JFK,NY,SFO,CA,0.0,0.0,0.0,2016,American Airlines Inc.
9,AA,2016-10-01,LAX,CA,JFK,NY,0.0,0.0,0.0,2016,American Airlines Inc.


#### Quick Summaries

Carriers with the most flights...

In [12]:
# Ten carriers with the largest number of flight rows.
(
    air
    .groupBy('OP_UNIQUE_CARRIER', 'carrier_name')
    .count()
    .orderBy(F.col('count').desc())
    .limit(10)
    .toPandas()
)

Unnamed: 0,OP_UNIQUE_CARRIER,carrier_name,count
0,WN,Southwest Airlines Co.,7217558
1,DL,Delta Air Lines Inc.,5039221
2,AA,American Airlines Inc.,4537542
3,OO,SkyWest Airlines Inc.,3768071
4,UA,United Air Lines Inc.,3118434
5,EV,ExpressJet Airlines LLC,2370381
6,B6,JetBlue Airways,1575234
7,MQ,Envoy Air,1172166
8,AS,Alaska Airlines Inc.,1093614
9,NK,Spirit Air Lines,705811


Carriers with the fewest flights...

In [13]:
# Ten carriers with the smallest number of flight rows.
(
    air
    .groupBy('OP_UNIQUE_CARRIER', 'carrier_name')
    .count()
    .orderBy(F.col('count').asc())
    .limit(10)
    .toPandas()
)

Unnamed: 0,OP_UNIQUE_CARRIER,carrier_name,count
0,FL,AirTran Airways Corporation,79495
1,G4,Allegiant Air,161696
2,VX,Virgin America,276598
3,YV,Mesa Airlines Inc.,346650
4,9E,Endeavor Air Inc.,393095
5,HA,Hawaiian Airlines Inc.,439994
6,OH,PSA Airlines Inc.,446234
7,YX,Republic Airline,503932
8,F9,Frontier Airlines Inc.,567839
9,US,US Airways Inc.,613380


Carriers with the most variable carrier-caused delays in 2019 (ranked by the standard deviation of delay minutes; total and average delay minutes are shown alongside)...

In [14]:
# Per-carrier CARRIER_DELAY statistics for 2019: total minutes, flight count,
# mean and standard deviation. The ten rows shown are the carriers with the
# largest standard deviation.
(
    air
    .where('year = 2019')
    .groupBy('carrier_name')
    .agg(
        F.sum(F.col('CARRIER_DELAY')).alias('total_minutes_delayed'),
        F.count('*').alias('flight_cnt'),
        F.avg('CARRIER_DELAY').alias('avg_delay'),
        F.stddev('CARRIER_DELAY').alias('std_delay'),
    )
    .orderBy(F.col('std_delay').desc())
    .limit(10)
    .toPandas()
)

Unnamed: 0,carrier_name,total_minutes_delayed,flight_cnt,avg_delay,std_delay
0,SkyWest Airlines Inc.,2833063.0,481377,5.885331,45.766722
1,ExpressJet Airlines LLC,540128.0,79142,6.824796,45.065704
2,Allegiant Air,323610.0,65475,4.942497,37.037561
3,Mesa Airlines Inc.,702506.0,131512,5.341763,36.72102
4,JetBlue Airways,1225544.0,172933,7.086814,34.051539
5,American Airlines Inc.,2722610.0,548425,4.964416,33.32184
6,Delta Air Lines Inc.,2200978.0,571071,3.854123,30.858868
7,Envoy Air,635394.0,188743,3.366451,29.457514
8,Endeavor Air Inc.,579008.0,147178,3.934066,28.76362
9,United Air Lines Inc.,1335354.0,361121,3.697802,28.520798


### Creating the GraphFrame Object

In [15]:
# Edges: one row per flight, using the src/dst column names GraphFrame expects,
# plus carrier and year as edge attributes. Flights missing either endpoint are
# discarded.
e = (
    air
    .select(
        F.col('ORIGIN').alias('src'),
        F.col('DEST').alias('dst'),
        F.col('OP_UNIQUE_CARRIER').alias('carrier'),
        'year',
    )
    .where(F.col('src').isNotNull() & F.col('dst').isNotNull())
)

# Vertices: every distinct airport code that appears as an origin or destination,
# enriched with city/name from the airport lookup. The inner join keeps only
# codes that are present in the lookup table.
v0 = (
    e.select(F.col('src').alias('id'))
    .union(e.select(F.col('dst').alias('id')))
    .distinct()
)
v = v0.join(F.broadcast(airport_codes), v0['id'] == airport_codes['Code'], how='inner')

In [16]:
# Show the physical plan for the edges before checkpointing: it still contains
# the CSV scans and the broadcast join against the carrier lookup.
e.explain()

== Physical Plan ==
*(2) Project [ORIGIN#79 AS src#337, DEST#88 AS dst#338, OP_UNIQUE_CARRIER#72 AS carrier#339, year#104]
+- *(2) BroadcastHashJoin [OP_UNIQUE_CARRIER#72], [OP_UNIQUE_CARRIER#14], Inner, BuildRight
   :- *(2) Project [OP_UNIQUE_CARRIER#72, ORIGIN#79, DEST#88, year#104]
   :  +- *(2) Filter ((isnotnull(ORIGIN#79) && isnotnull(DEST#88)) && isnotnull(OP_UNIQUE_CARRIER#72))
   :     +- *(2) FileScan csv [OP_UNIQUE_CARRIER#72,ORIGIN#79,DEST#88,year#104] Batched: false, Format: CSV, Location: InMemoryFileIndex[s3://jornaya-ds-us-east-1-sandbox/csnyder/dot_carrier_data], PartitionCount: 6, PartitionFilters: [], PushedFilters: [IsNotNull(ORIGIN), IsNotNull(DEST), IsNotNull(OP_UNIQUE_CARRIER)], ReadSchema: struct<OP_UNIQUE_CARRIER:string,ORIGIN:string,DEST:string>
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]))
      +- *(1) Project [Code#10 AS OP_UNIQUE_CARRIER#14]
         +- *(1) Filter isnotnull(Code#10)
            +- *(1) FileScan csv [Co

Checkpointing speeds up subsequent queries in two ways: it saves the data to disk, and it truncates the query plan (compare the `explain()` output before and after).

In [17]:
# Checkpoint both DataFrames and rebind the names to the checkpointed versions,
# so later queries start from the stored data instead of repeating the CSV scans
# and joins that built them.
e = e.checkpoint()
v = v.checkpoint()

In [18]:
# Show the plan again: after checkpointing it is a single scan of existing data.
e.explain()

== Physical Plan ==
Scan ExistingRDD[src#337,dst#338,carrier#339,year#104]


In [19]:
# Create the GraphFrame object from the checkpointed vertices (`id` column) and
# edges (`src`/`dst` columns); every later graph query uses `g`.
g = GraphFrame(v, e)

In [20]:
# Preview five vertices (airports) of the graph.
g.vertices.limit(5).toPandas()

Unnamed: 0,id,Code,airport_city,airport_name
0,INL,INL,"International Falls, MN",Falls International Einarson Field
1,BUR,BUR,"Burbank, CA",Bob Hope
2,IDA,IDA,"Idaho Falls, ID",Idaho Falls Regional
3,EUG,EUG,"Eugene, OR",Mahlon Sweet Field
4,PVD,PVD,"Providence, RI",Theodore Francis Green State


In [21]:
# Preview five edges (individual flights) of the graph.
g.edges.limit(5).toPandas()

Unnamed: 0,src,dst,carrier,year
0,JFK,LAX,AA,2016
1,LAX,JFK,AA,2016
2,JFK,LAX,AA,2016
3,DFW,HNL,AA,2016
4,OKC,DFW,AA,2016


### InDegrees/OutDegrees

In [22]:
%%time
# Five airports with the most incoming edges (arriving flights).
in_degrees = g.inDegrees
(
    in_degrees
    .join(F.broadcast(airport_codes), in_degrees['id'] == airport_codes['Code'], how='inner')
    .select('airport_name', 'id', 'inDegree')
    .orderBy(F.col('inDegree').desc())
    .limit(5)
    .toPandas()
)

CPU times: user 8.4 ms, sys: 4.1 ms, total: 12.5 ms
Wall time: 4.8 s


Unnamed: 0,airport_name,id,inDegree
0,Hartsfield-Jackson Atlanta International,ATL,2117499
1,Chicago O'Hare International,ORD,1637098
2,Dallas/Fort Worth International,DFW,1369154
3,Denver International,DEN,1265394
4,Los Angeles International,LAX,1210999


In [23]:
%%time
# Five airports with the fewest incoming edges (arriving flights).
in_degrees = g.inDegrees
(
    in_degrees
    .join(F.broadcast(airport_codes), in_degrees['id'] == airport_codes['Code'], how='inner')
    .select('airport_name', 'id', 'inDegree')
    .orderBy(F.col('inDegree').asc())
    .limit(5)
    .toPandas()
)

CPU times: user 12.1 ms, sys: 1.51 ms, total: 13.6 ms
Wall time: 2.76 s


Unnamed: 0,airport_name,id,inDegree
0,Northern Colorado Regional,FNL,1
1,Youngstown-Warren Regional,YNG,2
2,Mobile Downtown,BFM,66
3,Florence Regional,FLO,170
4,Owensboro Daviess County Regional,OWB,182


In [24]:
%%time
# Five airports with the most outgoing edges (departing flights).
out_degrees = g.outDegrees
(
    out_degrees
    .join(F.broadcast(airport_codes), out_degrees['id'] == airport_codes['Code'], how='inner')
    .select('airport_name', 'id', 'outDegree')
    .orderBy(F.col('outDegree').desc())
    .limit(5)
    .toPandas()
)

CPU times: user 7.73 ms, sys: 3.99 ms, total: 11.7 ms
Wall time: 2.21 s


Unnamed: 0,airport_name,id,outDegree
0,Hartsfield-Jackson Atlanta International,ATL,2117550
1,Chicago O'Hare International,ORD,1637032
2,Dallas/Fort Worth International,DFW,1369152
3,Denver International,DEN,1265363
4,Los Angeles International,LAX,1210925


In [25]:
%%time
# Five airports with the fewest outgoing edges (departing flights).
out_degrees = g.outDegrees
(
    out_degrees
    .join(F.broadcast(airport_codes), out_degrees['id'] == airport_codes['Code'], how='inner')
    .select('airport_name', 'id', 'outDegree')
    .orderBy(F.col('outDegree').asc())
    .limit(5)
    .toPandas()
)

CPU times: user 7.47 ms, sys: 2.88 ms, total: 10.3 ms
Wall time: 1.87 s


Unnamed: 0,airport_name,id,outDegree
0,Tokeen Airport,TKI,1
1,Wendover Airport,ENV,1
2,Ellington,EFD,1
3,Middle Georgia Regional,MCN,1
4,Youngstown-Warren Regional,YNG,2


### SubGraphs

Let's take a look at only American Airlines data.

In [26]:
# Subgraph that keeps only the edges (flights) whose carrier code is 'AA'.
gAA = g.filterEdges("carrier = 'AA'")

What are the most common airports that American Airlines flights fly into?

In [27]:
%%time
# Arrivals per airport for American Airlines, with readable airport names.
gAA_in_degrees = gAA.inDegrees
gAA_inD_all = (
    gAA_in_degrees
    .join(F.broadcast(airport_codes), gAA_in_degrees['id'] == airport_codes['Code'], how='inner')
    .select('airport_name', 'id', 'inDegree')
)

# Total AA arrivals across the airports above, used as the denominator for `pct`.
# The column is referenced by its exact name (`inDegree`) so the lookup does not
# depend on Spark's case-insensitive column resolution being enabled.
inflightsum = gAA_inD_all.agg(F.sum('inDegree')).first()[0]

# Top five arrival airports and their share of all AA arrivals.
(
    gAA_inD_all
    .withColumn('pct', F.col('inDegree') / inflightsum)
    .orderBy(F.desc('inDegree'))
    .limit(5)
    .toPandas()
)

CPU times: user 14.9 ms, sys: 1.28 ms, total: 16.2 ms
Wall time: 5.36 s


Unnamed: 0,airport_name,id,inDegree,pct
0,Dallas/Fort Worth International,DFW,815190,0.179655
1,Charlotte Douglas International,CLT,399701,0.088088
2,Chicago O'Hare International,ORD,332335,0.073241
3,Miami International,MIA,286840,0.063215
4,Phoenix Sky Harbor International,PHX,244143,0.053805


What are the most common airports that American Airlines flights fly into in 2019?

In [28]:
%%time
# Restrict the American Airlines subgraph to flights from 2019.
gAA_2019 = gAA.filterEdges("year = '2019'")

# Arrivals per airport for AA in 2019, with readable airport names.
gAA_2019_in_degrees = gAA_2019.inDegrees
gAA_inD_2019 = (
    gAA_2019_in_degrees
    .join(F.broadcast(airport_codes), gAA_2019_in_degrees['id'] == airport_codes['Code'], how='inner')
    .select('airport_name', 'id', 'inDegree')
)

# Total 2019 AA arrivals across the airports above, used as the denominator for
# `pct`. The column is referenced by its exact name (`inDegree`) so the lookup does
# not depend on Spark's case-insensitive column resolution being enabled.
inflightsum_2019 = gAA_inD_2019.agg(F.sum('inDegree')).first()[0]

# Top five arrival airports and their share of all 2019 AA arrivals.
(
    gAA_inD_2019
    .withColumn('pct', F.col('inDegree') / inflightsum_2019)
    .orderBy(F.desc('inDegree'))
    .limit(5)
    .toPandas()
)

CPU times: user 15.2 ms, sys: 3.11 ms, total: 18.3 ms
Wall time: 3.91 s


Unnamed: 0,airport_name,id,inDegree,pct
0,Dallas/Fort Worth International,DFW,88246,0.160908
1,Charlotte Douglas International,CLT,57245,0.104381
2,Chicago O'Hare International,ORD,38027,0.069339
3,Phoenix Sky Harbor International,PHX,33727,0.061498
4,Miami International,MIA,29660,0.054082


### Algorithm Tour

#### Breadth-First Search (BFS)

In [29]:
%%time
# Breadth-first search over 2019 flights for paths from PHL to HNL that use at
# most two edges. The result is checkpointed so the cells below reuse it instead
# of re-running the search.
g_bfs = (
    g.filterEdges('year = 2019')
    .bfs(fromExpr='id="PHL"', toExpr='id="HNL"', maxPathLength=2)
    .checkpoint()
)

CPU times: user 6.7 ms, sys: 4.34 ms, total: 11 ms
Wall time: 31.5 s


In [30]:
# Collapse duplicate paths (many flights share the same route, carrier and year)
# and checkpoint the de-duplicated result.
g_bfsd = g_bfs.distinct().checkpoint()

In [31]:
# Show up to ten distinct paths; the `v1` column is dropped to keep the table narrow.
g_bfsd.drop('v1').limit(10).toPandas()

Unnamed: 0,from,e0,e1,to
0,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, ORD, EV, 2019)","(ORD, HNL, AA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
1,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, SEA, AA, 2019)","(SEA, HNL, DL, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
2,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, LAX, AA, 2019)","(LAX, HNL, AA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
3,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, LAX, NK, 2019)","(LAX, HNL, AS, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
4,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, DTW, OH, 2019)","(DTW, HNL, DL, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
5,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, DEN, F9, 2019)","(DEN, HNL, UA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
6,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, DEN, AA, 2019)","(DEN, HNL, UA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
7,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, ATL, AA, 2019)","(ATL, HNL, DL, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
8,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, ATL, F9, 2019)","(ATL, HNL, DL, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
9,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, IAH, YV, 2019)","(IAH, HNL, UA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."


In [32]:
# Paths on which both legs (e0 and e1) are flown by American Airlines.
(
    g_bfsd
    .drop('v1')
    .where(
        (F.col('e0').getField('carrier') == 'AA')
        & (F.col('e1').getField('carrier') == 'AA')
    )
    .limit(10)
    .toPandas()
)

Unnamed: 0,from,e0,e1,to
0,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, LAX, AA, 2019)","(LAX, HNL, AA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
1,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, DFW, AA, 2019)","(DFW, HNL, AA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
2,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, ORD, AA, 2019)","(ORD, HNL, AA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."
3,"(PHL, PHL, Philadelphia, PA, Philadelphia Inte...","(PHL, PHX, AA, 2019)","(PHX, HNL, AA, 2019)","(HNL, HNL, Honolulu, HI, Daniel K Inouye Inter..."


In [33]:
# Number of distinct PHL -> HNL paths found.
g_bfsd.count()

93

#### Connected Components

Let's run connected components on the entire dataset and see what happens.

In [34]:
%%time
# Label every vertex with its connected component (adds a `component` column)
# and checkpoint the result for the cells below.
g_cc = g.connectedComponents().checkpoint()

CPU times: user 9.73 ms, sys: 184 µs, total: 9.91 ms
Wall time: 49.6 s


In [35]:
# Inspect the result schema: the vertex columns plus a `component` id.
g_cc.printSchema()

root
 |-- id: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- airport_city: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- component: long (nullable = true)



In [67]:
# Number of airports per connected component, largest component first.
# Ordering by size (not by the arbitrary component id) matches how the Virgin
# America components are ranked later in this notebook.
g_cc.groupBy('component').count().orderBy(F.desc('count')).limit(10).toPandas()

Unnamed: 0,component,count
0,8589934592,376


Not that interesting, but also expected. You would think that you can get to any airport from any airport in the US. Let's create a subgraph using just Virgin America (VX) flight data.

In [37]:
%%time
# Subgraph containing only Virgin America (VX) flights.
gVX = g.filterEdges('carrier = "VX"')

# Connected components of the VX subgraph, checkpointed for the cells below.
gVX_cc = gVX.connectedComponents().checkpoint()

CPU times: user 7.53 ms, sys: 775 µs, total: 8.3 ms
Wall time: 25.3 s


In [38]:
# Preview five vertices together with their assigned component id.
gVX_cc.limit(5).toPandas()

Unnamed: 0,id,Code,airport_city,airport_name,component
0,INL,INL,"International Falls, MN",Falls International Einarson Field,8589934592
1,BUR,BUR,"Burbank, CA",Bob Hope,34359738368
2,IDA,IDA,"Idaho Falls, ID",Idaho Falls Regional,60129542144
3,EUG,EUG,"Eugene, OR",Mahlon Sweet Field,85899345920
4,PVD,PVD,"Providence, RI",Theodore Francis Green State,103079215104


In [39]:
# Five largest components of the VX subgraph by number of airports.
(
    gVX_cc
    .groupBy('component')
    .count()
    .orderBy(F.col('count').desc())
    .limit(5)
    .toPandas()
)

Unnamed: 0,component,count
0,171798691840,33
1,3324304687105,1
2,6141803233280,1
3,3212635537408,1
4,1821066133504,1


In [40]:
# Look up the id of the largest VX component instead of pasting it from an earlier
# cell's output, so this cell keeps working if the ids change on a re-run.
# Ties on size are broken by the component id to keep the choice deterministic.
largest_vx_component = (
    gVX_cc
    .groupBy('component')
    .count()
    .orderBy(F.desc('count'), F.asc('component'))
    .first()['component']
)

# All airports that belong to that component.
gVX_cc.filter(F.col('component') == largest_vx_component) \
    .toPandas()

Unnamed: 0,id,Code,airport_city,airport_name,component
0,DCA,DCA,"Washington, DC",Ronald Reagan Washington National,171798691840
1,AUS,AUS,"Austin, TX",Austin - Bergstrom International,171798691840
2,LAS,LAS,"Las Vegas, NV",McCarran International,171798691840
3,DEN,DEN,"Denver, CO",Denver International,171798691840
4,ONT,ONT,"Ontario, CA",Ontario International,171798691840
5,LAX,LAX,"Los Angeles, CA",Los Angeles International,171798691840
6,JFK,JFK,"New York, NY",John F. Kennedy International,171798691840
7,SLC,SLC,"Salt Lake City, UT",Salt Lake City International,171798691840
8,MSY,MSY,"New Orleans, LA",Louis Armstrong New Orleans International,171798691840
9,LGA,LGA,"New York, NY",LaGuardia,171798691840


#### Label Propagation Algorithm (LPA)

In [41]:
# Run five iterations of label propagation (adds a `label` column to the
# vertices) and checkpoint the result for the cells below.
g_lpa = g.labelPropagation(maxIter=5).checkpoint()

In [42]:
# Inspect the result schema: the vertex columns plus a `label` id.
g_lpa.printSchema()

root
 |-- id: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- airport_city: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- label: long (nullable = true)



In [43]:
# Preview five vertices together with their assigned label.
g_lpa.limit(5).toPandas()

Unnamed: 0,id,Code,airport_city,airport_name,label
0,MSY,MSY,"New Orleans, LA",Louis Armstrong New Orleans International,4655744548864
1,OTZ,OTZ,"Kotzebue, AK",Ralph Wien Memorial,1589137899521
2,FLG,FLG,"Flagstaff, AZ",Flagstaff Pulliam,4655744548864
3,HHH,HHH,"Hilton Head, SC",Hilton Head Airport,4655744548864
4,ABI,ABI,"Abilene, TX",Abilene Regional,4655744548864


In [44]:
# Number of airports per label, largest group first.
(
    g_lpa
    .groupBy('label')
    .count()
    .orderBy(F.col('count').desc())
    .limit(5)
    .toPandas()
)

Unnamed: 0,label,count
0,4655744548864,352
1,1589137899521,17
2,5583457484800,5
3,5102421147648,2


In [45]:
# Sample of airports in the largest label group.
# NOTE(review): the label id is copied from the label-size table above; confirm it
# is still valid after re-running label propagation.
(
    g_lpa
    .where('label = 4655744548864')
    .limit(10)
    .toPandas()
)

Unnamed: 0,id,Code,airport_city,airport_name,label
0,MSY,MSY,"New Orleans, LA",Louis Armstrong New Orleans International,4655744548864
1,FLG,FLG,"Flagstaff, AZ",Flagstaff Pulliam,4655744548864
2,HHH,HHH,"Hilton Head, SC",Hilton Head Airport,4655744548864
3,ABI,ABI,"Abilene, TX",Abilene Regional,4655744548864
4,GRB,GRB,"Green Bay, WI",Green Bay Austin Straubel International,4655744548864
5,TXK,TXK,"Texarkana, AR",Texarkana Regional-Webb Field,4655744548864
6,PVU,PVU,"Provo, UT",Provo Municipal,4655744548864
7,LRD,LRD,"Laredo, TX",Laredo International,4655744548864
8,HRL,HRL,"Harlingen/San Benito, TX",Valley International,4655744548864
9,ILG,ILG,"Wilmington, DE",New Castle,4655744548864


In [46]:
# All airports in the second-largest label group.
# NOTE(review): the label id is copied from the label-size table above; confirm it
# is still valid after re-running label propagation.
(
    g_lpa
    .where('label = 1589137899521')
    .toPandas()
)

Unnamed: 0,id,Code,airport_city,airport_name,label
0,OTZ,OTZ,"Kotzebue, AK",Ralph Wien Memorial,1589137899521
1,SCC,SCC,"Deadhorse, AK",Deadhorse Airport,1589137899521
2,CDV,CDV,"Cordova, AK",Merle K Mudhole Smith,1589137899521
3,WRG,WRG,"Wrangell, AK",Wrangell Airport,1589137899521
4,DLG,DLG,"Dillingham, AK",Dillingham Airport,1589137899521
5,GST,GST,"Gustavus, AK",Gustavus Airport,1589137899521
6,OME,OME,"Nome, AK",Nome Airport,1589137899521
7,BRW,BRW,"Barrow, AK",Wiley Post/Will Rogers Memorial,1589137899521
8,PSG,PSG,"Petersburg, AK",Petersburg James A Johnson,1589137899521
9,JNU,JNU,"Juneau, AK",Juneau International,1589137899521


In [47]:
# Airports in the third-largest label group.
# NOTE(review): the label id is copied from the label-size table above; confirm it
# is still valid after re-running label propagation.
(
    g_lpa
    .where('label = 5583457484800')
    .limit(10)
    .toPandas()
)

Unnamed: 0,id,Code,airport_city,airport_name,label
0,ITO,ITO,"Hilo, HI",Hilo International,5583457484800
1,GUM,GUM,"Guam, TT",Guam International,5583457484800
2,KOA,KOA,"Kona, HI",Ellison Onizuka Kona International at Keahole,5583457484800
3,PPG,PPG,"Pago Pago, TT",Pago Pago International,5583457484800
4,LIH,LIH,"Lihue, HI",Lihue Airport,5583457484800


In [48]:
# Airports in the smallest label group.
# NOTE(review): the label id is copied from the label-size table above; confirm it
# is still valid after re-running label propagation.
(
    g_lpa
    .where('label = 5102421147648')
    .limit(10)
    .toPandas()
)

Unnamed: 0,id,Code,airport_city,airport_name,label
0,HNL,HNL,"Honolulu, HI",Daniel K Inouye International,5102421147648
1,SPN,SPN,"Saipan, TT",Francisco C. Ada Saipan International,5102421147648


#### PageRank

In [49]:
%%time
# PageRank driven by a tolerance (`tol`) rather than a fixed iteration count.
g_pr = g.pageRank(
    resetProbability=0.15,
    tol=0.01,
)

# Only the vertex scores are used below; checkpoint them for reuse.
g_pr_v = g_pr.vertices.checkpoint()

CPU times: user 21.9 ms, sys: 795 µs, total: 22.7 ms
Wall time: 1min 57s


In [50]:
# Five airports with the highest PageRank score.
(
    g_pr_v
    .orderBy(F.col('pagerank').desc())
    .limit(5)
    .toPandas()
)

Unnamed: 0,id,Code,airport_city,airport_name,pagerank
0,ATL,ATL,"Atlanta, GA",Hartsfield-Jackson Atlanta International,19.673616
1,ORD,ORD,"Chicago, IL",Chicago O'Hare International,16.814122
2,DFW,DFW,"Dallas/Fort Worth, TX",Dallas/Fort Worth International,14.472285
3,DEN,DEN,"Denver, CO",Denver International,13.391834
4,LAX,LAX,"Los Angeles, CA",Los Angeles International,10.287024


In [51]:
%%time
# PageRank with a fixed number of iterations instead of a tolerance.
# NOTE(review): the `_10` suffix in these variable names does not match
# `maxIter=5` — confirm which iteration count was intended. The names are left
# unchanged because the following cell reads `g_pr_10_v`.
g_pr_10 = g.pageRank(resetProbability=0.15, maxIter=5)
g_pr_10_v = g_pr_10.vertices.checkpoint()

CPU times: user 1.49 ms, sys: 6.1 ms, total: 7.59 ms
Wall time: 26.3 s


In [52]:
# Five airports with the highest score from the fixed-iteration PageRank run.
(
    g_pr_10_v
    .orderBy(F.col('pagerank').desc())
    .limit(5)
    .toPandas()
)

Unnamed: 0,id,Code,airport_city,airport_name,pagerank
0,ATL,ATL,"Atlanta, GA",Hartsfield-Jackson Atlanta International,21.259381
1,ORD,ORD,"Chicago, IL",Chicago O'Hare International,17.967079
2,DFW,DFW,"Dallas/Fort Worth, TX",Dallas/Fort Worth International,15.459657
3,DEN,DEN,"Denver, CO",Denver International,13.937019
4,LAX,LAX,"Los Angeles, CA",Los Angeles International,10.450192


In [53]:
%%time
# Personalized PageRank with ORD as the single source vertex, ten iterations.
g_pr_ORD = g.pageRank(
    resetProbability=0.15,
    sourceId="ORD",
    maxIter=10,
)

# Only the vertex scores are used below; checkpoint them for reuse.
g_pr_ORD_v = g_pr_ORD.vertices.checkpoint()

CPU times: user 11.2 ms, sys: 521 µs, total: 11.8 ms
Wall time: 44.9 s


In [54]:
# Five airports with the highest ORD-personalized PageRank score.
(
    g_pr_ORD_v
    .orderBy(F.col('pagerank').desc())
    .limit(5)
    .toPandas()
)

Unnamed: 0,id,Code,airport_city,airport_name,pagerank
0,ORD,ORD,"Chicago, IL",Chicago O'Hare International,0.194374
1,ATL,ATL,"Atlanta, GA",Hartsfield-Jackson Atlanta International,0.050519
2,DFW,DFW,"Dallas/Fort Worth, TX",Dallas/Fort Worth International,0.032653
3,DEN,DEN,"Denver, CO",Denver International,0.029074
4,LAX,LAX,"Los Angeles, CA",Los Angeles International,0.026876


In [55]:
%%time
# Personalized PageRank for several source airports (ORD, JNU, PHL) in one run;
# the vertex result carries a `pageranks` column with one score per source.
# NOTE(review): "parellel" in the variable names is a misspelling of "parallel";
# it is left unchanged because the following cell reads `g_pr_parellel_v`.
g_pr_parellel = g.parallelPersonalizedPageRank(resetProbability=0.15, sourceIds=["ORD", "JNU", "PHL"], maxIter=10)
g_pr_parellel_v = g_pr_parellel.vertices.checkpoint()

CPU times: user 11.3 ms, sys: 756 µs, total: 12 ms
Wall time: 47 s


In [56]:
# Top five rows when ordering by the `pageranks` column (the `id` column is
# dropped because `Code` holds the same value).
# NOTE(review): `pageranks` holds several scores per airport; judging by the
# output, ordering by the whole column ranks by its first entry (the ORD score) —
# confirm that this is the intended ranking.
g_pr_parellel_v.orderBy(F.desc('pageranks')).drop('id').limit(5).toPandas()

Unnamed: 0,Code,airport_city,airport_name,pageranks
0,ORD,"Chicago, IL",Chicago O'Hare International,"[0.19437410564856167, 0.024266941071439452, 0...."
1,ATL,"Atlanta, GA",Hartsfield-Jackson Atlanta International,"[0.05051850313651287, 0.02480123371324171, 0.0..."
2,DFW,"Dallas/Fort Worth, TX",Dallas/Fort Worth International,"[0.03265316809538042, 0.01956270291753715, 0.0..."
3,DEN,"Denver, CO",Denver International,"[0.029074069503041387, 0.023178439293135362, 0..."
4,LAX,"Los Angeles, CA",Los Angeles International,"[0.02687642330587586, 0.02562032911211432, 0.0..."


#### Shortest Path

In [57]:
%%time
# Shortest-path distance (in number of edges) between every airport and the PHL
# landmark, stored in a `distances` map column; checkpointed for the cells below.
g_sp = g.shortestPaths(landmarks=["PHL"]).checkpoint()

CPU times: user 6.87 ms, sys: 0 ns, total: 6.87 ms
Wall time: 31.3 s


In [63]:
# Preview five airports with their `distances` map.
g_sp.limit(5).toPandas()

Unnamed: 0,id,Code,airport_city,airport_name,distances
0,MSY,MSY,"New Orleans, LA",Louis Armstrong New Orleans International,{'PHL': 1}
1,OTZ,OTZ,"Kotzebue, AK",Ralph Wien Memorial,{'PHL': 3}
2,FLG,FLG,"Flagstaff, AZ",Flagstaff Pulliam,{'PHL': 2}
3,HHH,HHH,"Hilton Head, SC",Hilton Head Airport,{'PHL': 2}
4,ABI,ABI,"Abilene, TX",Abilene Regional,{'PHL': 2}


In [59]:
# Inspect the result schema: `distances` is a map from landmark id to an integer.
g_sp.printSchema()

root
 |-- id: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- airport_city: string (nullable = true)
 |-- airport_name: string (nullable = true)
 |-- distances: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)



In [64]:
# Airports farthest from PHL by number of hops.
# The distance is read from the map by its landmark key ('PHL') rather than by
# position in the map's value list, so the column stays correct if more landmarks
# are added to the shortestPaths call.
(
    g_sp
    .withColumn('distance', F.col('distances')['PHL'])
    .orderBy(F.desc('distance'))
    .limit(20)
    .toPandas()
)

Unnamed: 0,id,Code,airport_city,airport_name,distances,distance
0,SPN,SPN,"Saipan, TT",Francisco C. Ada Saipan International,{'PHL': 4},4
1,BRW,BRW,"Barrow, AK",Wiley Post/Will Rogers Memorial,{'PHL': 3},3
2,SCC,SCC,"Deadhorse, AK",Deadhorse Airport,{'PHL': 3},3
3,YAK,YAK,"Yakutat, AK",Yakutat Airport,{'PHL': 3},3
4,OTZ,OTZ,"Kotzebue, AK",Ralph Wien Memorial,{'PHL': 3},3
5,WRG,WRG,"Wrangell, AK",Wrangell Airport,{'PHL': 3},3
6,YNG,YNG,"Youngstown/Warren, OH",Youngstown-Warren Regional,{'PHL': 3},3
7,ENV,ENV,"Wendover, UT",Wendover Airport,{'PHL': 3},3
8,CDV,CDV,"Cordova, AK",Merle K Mudhole Smith,{'PHL': 3},3
9,DLG,DLG,"Dillingham, AK",Dillingham Airport,{'PHL': 3},3


#### Triangle Count

In [61]:
%%time
# Count the triangles each airport participates in (adds a `count` column) and
# checkpoint the result for the cell below.
g_tr = g.triangleCount().checkpoint()

CPU times: user 4.44 ms, sys: 264 µs, total: 4.7 ms
Wall time: 19.1 s


In [62]:
# Five airports that take part in the most triangles; `id` is dropped because
# `Code` holds the same value.
(
    g_tr
    .orderBy(F.col('count').desc())
    .drop('id')
    .limit(5)
    .toPandas()
)

Unnamed: 0,count,Code,airport_city,airport_name
0,2691,ATL,"Atlanta, GA",Hartsfield-Jackson Atlanta International
1,2620,ORD,"Chicago, IL",Chicago O'Hare International
2,2620,DEN,"Denver, CO",Denver International
3,2390,DFW,"Dallas/Fort Worth, TX",Dallas/Fort Worth International
4,2255,CLT,"Charlotte, NC",Charlotte Douglas International
