In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace

In [2]:
# Obtain a Spark session named "Wrangling Data"; getOrCreate() hands back an
# existing session when one is already active instead of building a new one.
spark = (
    SparkSession.builder
    .appName("Wrangling Data")
    .getOrCreate()
)

In [3]:
# Load the airport codes CSV: the first row supplies the column names and the
# column types are inferred from the data rather than declared up front.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("airport-codes_csv.csv")
)

In [4]:
# Print the column names, inferred types and nullability of the loaded frame.
df.printSchema()

root
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- elevation_ft: integer (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- coordinates: string (nullable = true)



In [5]:
# Preview the first 10 rows of the raw frame (long cell values are truncated).
df.show(n=10)

+-----+-------------+--------------------+------------+---------+-----------+----------+------------+--------+---------+----------+--------------------+
|ident|         type|                name|elevation_ft|continent|iso_country|iso_region|municipality|gps_code|iata_code|local_code|         coordinates|
+-----+-------------+--------------------+------------+---------+-----------+----------+------------+--------+---------+----------+--------------------+
|  00A|     heliport|   Total Rf Heliport|          11|       NA|         US|     US-PA|    Bensalem|     00A|     null|       00A|-74.9336013793945...|
| 00AA|small_airport|Aero B Ranch Airport|        3435|       NA|         US|     US-KS|       Leoti|    00AA|     null|      00AA|-101.473911, 38.7...|
| 00AK|small_airport|        Lowell Field|         450|       NA|         US|     US-AK|Anchor Point|    00AK|     null|      00AK|-151.695999146, 5...|
| 00AL|small_airport|        Epps Airpark|         820|       NA|         US|     

In [6]:
# Row count per iata_code value, 10 most frequent first. Null is counted as its
# own group, so this also shows how many rows have no IATA code.
(
    df.select("iata_code")
    .groupBy("iata_code")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

+---------+-----+
|iata_code|count|
+---------+-----+
|     null|45886|
|        0|   80|
|      OHE|    3|
|      PRI|    3|
|      CMN|    2|
|      CLG|    2|
|      TFY|    2|
|      MUP|    2|
|      IST|    2|
|      IZA|    2|
+---------+-----+
only showing top 10 rows



In [7]:
# Row count per ident value, 10 most frequent first. A highest count of 1 means
# no ident is repeated in the file.
(
    df.select("ident")
    .groupBy("ident")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

+-----+-----+
|ident|count|
+-----+-----+
| 06IN|    1|
| 06VA|    1|
| 0LA0|    1|
| 0MD6|    1|
| 0OH7|    1|
| 0OK9|    1|
| 11KS|    1|
| 12PR|    1|
| 16KY|    1|
| 19OI|    1|
+-----+-----+
only showing top 10 rows



In [8]:
# Row count per gps_code value, 10 most frequent first (null forms its own group).
(
    df.select("gps_code")
    .groupBy("gps_code")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

+--------+-----+
|gps_code|count|
+--------+-----+
|    null|14045|
|    MBAC|    3|
|    ZYMH|    3|
|    SGGR|    2|
|    SDAZ|    2|
|    SSRA|    2|
|    WA98|    2|
|    1TS9|    2|
|    EGZJ|    2|
|    RK3D|    2|
+--------+-----+
only showing top 10 rows



In [9]:
# Row count per local_code value, 10 most frequent first (null forms its own group).
(
    df.select("local_code")
    .groupBy("local_code")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

+----------+-----+
|local_code|count|
+----------+-----+
|      null|26389|
|       AMA|    5|
|       LAN|    5|
|       ROS|    4|
|       CAR|    4|
|       HRR|    4|
|       PAL|    4|
|       AGI|    4|
|       SLR|    4|
|       TOL|    4|
+----------+-----+
only showing top 10 rows



In [10]:
# Number of rows per iso_country value, showing the 10 countries with the most rows.
(
    df.select("iso_country")
    .groupBy("iso_country")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

+-----------+-----+
|iso_country|count|
+-----------+-----+
|         US|22757|
|         BR| 4334|
|         CA| 2784|
|         AU| 1963|
|         KR| 1376|
|         MX| 1181|
|         RU| 1040|
|         DE|  947|
|         GB|  911|
|         FR|  850|
+-----------+-----+
only showing top 10 rows



In [11]:
# Total number of rows in the raw frame; a baseline for later row counts.
df.count()

55075

In [12]:
# Number of rows per facility type, most frequent first (at most 10 types shown).
(
    df.select("type")
    .groupBy("type")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

+--------------+-----+
|          type|count|
+--------------+-----+
| small_airport|33965|
|      heliport|11287|
|medium_airport| 4550|
|        closed| 3606|
| seaplane_base| 1016|
| large_airport|  627|
|   balloonport|   24|
+--------------+-----+



In [13]:
# Number of rows per continent code, most frequent first (at most 10 codes shown).
(
    df.select("continent")
    .groupBy("continent")
    .count()
    .sort("count", ascending=False)
    .show(n=10)
)

+---------+-----+
|continent|count|
+---------+-----+
|       NA|27719|
|       EU| 7840|
|       SA| 7709|
|       AS| 5350|
|       AF| 3362|
|       OC| 3067|
|       AN|   28|
+---------+-----+



In [14]:
# Build the airport table: keep only the identifying and location columns,
# then discard rows that are exact duplicates across all six of them.
df_airport = df.select(
    'ident', 'type', 'name',
    'continent', 'iso_country', 'iso_region',
).drop_duplicates()
df_airport.show(n=10)

+-----+-------------+--------------------+---------+-----------+----------+
|ident|         type|                name|continent|iso_country|iso_region|
+-----+-------------+--------------------+---------+-----------+----------+
| 00AS|small_airport|      Fulton Airport|       NA|         US|     US-OK|
| 02FL|small_airport|     Cuchens Airport|       NA|         US|     US-FL|
| 03NH|     heliport|     Lorden Heliport|       NA|         US|     US-NH|
| 05PS|small_airport|Mills Brothers Ai...|       NA|         US|     US-PA|
| 09LL|small_airport|   Pine Hill Airport|       NA|         US|     US-IL|
| 0CD7|small_airport|    Fox Hole Airport|       NA|         US|     US-CO|
| 0IA5|small_airport|Moore Private Air...|       NA|         US|     US-IA|
|  0S2|small_airport|Stockton Municipa...|       NA|         US|     US-KS|
| 12IL|small_airport|      Hawker Airport|       NA|         US|     US-IL|
| 13FA|small_airport|       Earle Airpark|       NA|         US|     US-FL|
+-----+-----

In [19]:
# Row count of the airport table; comparing it with df.count() shows how many
# rows the dropDuplicates() step removed.
df_airport.count()

55075

In [11]:
from pyspark.sql.functions import split

In [15]:
# Strip the leading "<country>-" prefix from iso_region (e.g. "US-OK" -> "OK").
#
# Only the text up to and including the FIRST hyphen is removed, so a region
# code that itself contains a hyphen is kept whole; splitting on every hyphen
# and taking element [1] would cut such a code at its second hyphen.
#
# A value with no hyphen is left unchanged rather than turned into null. This
# also keeps already-stripped values such as "OK" intact if the cell is run a
# second time (the split-based version nulled the whole column on re-run).
df_airport = df_airport.withColumn(
    'iso_region',
    regexp_replace(df_airport['iso_region'], r'^[^-]*-', ''),
)
df_airport.show(10)

+-----+-------------+--------------------+---------+-----------+----------+
|ident|         type|                name|continent|iso_country|iso_region|
+-----+-------------+--------------------+---------+-----------+----------+
| 00AS|small_airport|      Fulton Airport|       NA|         US|        OK|
| 02FL|small_airport|     Cuchens Airport|       NA|         US|        FL|
| 03NH|     heliport|     Lorden Heliport|       NA|         US|        NH|
| 05PS|small_airport|Mills Brothers Ai...|       NA|         US|        PA|
| 09LL|small_airport|   Pine Hill Airport|       NA|         US|        IL|
| 0CD7|small_airport|    Fox Hole Airport|       NA|         US|        CO|
| 0IA5|small_airport|Moore Private Air...|       NA|         US|        IA|
|  0S2|small_airport|Stockton Municipa...|       NA|         US|        KS|
| 12IL|small_airport|      Hawker Airport|       NA|         US|        IL|
| 13FA|small_airport|       Earle Airpark|       NA|         US|        FL|
+-----+-----

In [16]:
# Persist the airport table as Parquet at "airport", partitioned by the
# continent column; "overwrite" replaces whatever a previous run wrote there.
df_airport.write.mode("overwrite").partitionBy("continent").parquet("airport")