In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as sf

In [2]:
# Connector jars (GCS filesystem + BigQuery) passed to Spark as a single
# comma-separated `spark.jars` value.
connector_jars = ",".join([
    "/opt/bitnami/spark/jars/gcs-connector-hadoop3-latest.jar",
    "/opt/bitnami/spark/jars/spark-bigquery-with-dependencies_2.12-0.42.4.jar",
])

# Create (or reuse) the session against the standalone Spark master.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .config("spark.jars", connector_jars)
    .appName('gcs-bq-pyspark')
    .getOrCreate()
)

25/08/12 09:23:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Keep Spark's log output at WARN level.
spark.sparkContext.setLogLevel("WARN")

# Configure the Hadoop layer for gs:// access: filesystem implementation class,
# service-account authentication, and the location of the JSON key file.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
hadoop_conf.set('fs.gs.auth.service.account.enable', 'true')
hadoop_conf.set('google.cloud.auth.service.account.json.keyfile', "/opt/keys/credentials.json")

In [4]:
# Location of the source CSV in Google Cloud Storage
bucket_name = "data_expo_bucket"
file_name = "airports_data.csv"
# gs://<bucket>/<object> URI read by Spark further down
file_path = "gs://" + "/".join((bucket_name, file_name))

In [5]:
# Load the airports CSV from GCS: the first line supplies the column names and
# the column types are inferred from the data.
airports_data = spark.read.csv(file_path, header=True, inferSchema=True)

                                                                                

In [6]:
# Preview the first rows of the freshly loaded DataFrame
airports_data.show()

+----+--------------------+------------------+-----+-------+-----------+------------+
|iata|             airport|              city|state|country|        lat|        long|
+----+--------------------+------------------+-----+-------+-----------+------------+
| 00M|            Thigpen |       Bay Springs|   MS|    USA|31.95376472|-89.23450472|
| 00R|Livingston Municipal|        Livingston|   TX|    USA|30.68586111|-95.01792778|
| 00V|         Meadow Lake|  Colorado Springs|   CO|    USA|38.94574889|-104.5698933|
| 01G|        Perry-Warsaw|             Perry|   NY|    USA|42.74134667|-78.05208056|
| 01J|    Hilliard Airpark|          Hilliard|   FL|    USA| 30.6880125|-81.90594389|
| 01M|   Tishomingo County|           Belmont|   MS|    USA|34.49166667|-88.20111111|
| 02A|         Gragg-Wade |           Clanton|   AL|    USA|32.85048667|-86.61145333|
| 02C|             Capitol|        Brookfield|   WI|    USA|   43.08751|-88.17786917|
| 02G|   Columbiana County|    East Liverpool|   OH|  

In [7]:
# Inspect the inferred schema (column names, data types and nullability)
airports_data.schema

StructType([StructField('iata', StringType(), True), StructField('airport', StringType(), True), StructField('city', StringType(), True), StructField('state', StringType(), True), StructField('country', StringType(), True), StructField('lat', DoubleType(), True), StructField('long', DoubleType(), True)])

In [8]:
# Rename columns: upper-case the IATA code and spell out the coordinate names
airports_data = (
    airports_data
    .withColumnRenamed("iata", "IATA")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("long", "longitude")
)

In [9]:
# Preview the DataFrame again to confirm the renamed columns
# (IATA, latitude, longitude)
airports_data.show()

+----+--------------------+------------------+-----+-------+-----------+------------+
|IATA|             airport|              city|state|country|   latitude|   longitude|
+----+--------------------+------------------+-----+-------+-----------+------------+
| 00M|            Thigpen |       Bay Springs|   MS|    USA|31.95376472|-89.23450472|
| 00R|Livingston Municipal|        Livingston|   TX|    USA|30.68586111|-95.01792778|
| 00V|         Meadow Lake|  Colorado Springs|   CO|    USA|38.94574889|-104.5698933|
| 01G|        Perry-Warsaw|             Perry|   NY|    USA|42.74134667|-78.05208056|
| 01J|    Hilliard Airpark|          Hilliard|   FL|    USA| 30.6880125|-81.90594389|
| 01M|   Tishomingo County|           Belmont|   MS|    USA|34.49166667|-88.20111111|
| 02A|         Gragg-Wade |           Clanton|   AL|    USA|32.85048667|-86.61145333|
| 02C|             Capitol|        Brookfield|   WI|    USA|   43.08751|-88.17786917|
| 02G|   Columbiana County|    East Liverpool|   OH|  

In [10]:
# Count the number of records (rows) in the DataFrame
airports_data.count()

3376

In [11]:
# Missing-value check, numeric columns: isolate the two coordinate columns
# and preview them.
double_cols = airports_data.select('latitude', 'longitude')
double_cols.show()

+-----------+------------+
|   latitude|   longitude|
+-----------+------------+
|31.95376472|-89.23450472|
|30.68586111|-95.01792778|
|38.94574889|-104.5698933|
|42.74134667|-78.05208056|
| 30.6880125|-81.90594389|
|34.49166667|-88.20111111|
|32.85048667|-86.61145333|
|   43.08751|-88.17786917|
|40.67331278|-80.64140639|
|40.44725889|-92.22696056|
|33.93011222|-89.34285194|
|46.88384889|-96.35089861|
|41.51961917|-87.40109333|
|31.42127556|-97.79696778|
|39.60416667|-116.0050597|
|32.46047167|-85.68003611|
|41.98934083|-88.10124278|
|48.88434111|-99.62087694|
|33.53456583|-89.31256917|
|41.43156583|-74.39191722|
+-----------+------------+
only showing top 20 rows



In [12]:
# One aggregate per numeric column: the number of rows that are NaN or null.
# when() without otherwise() yields null for non-matching rows and count()
# ignores nulls, so only the flagged rows are counted.
nan_or_null_counts = [
    sf.count(sf.when(sf.col(name).isNull() | sf.isnan(name), name)).alias(name)
    for name in double_cols.columns
]
double_cols_test_null = double_cols.select(*nan_or_null_counts)

In [13]:
# Display the NaN/null count for each numeric column
double_cols_test_null.show()

+--------+---------+
|latitude|longitude|
+--------+---------+
|       0|        0|
+--------+---------+



In [14]:
# Missing-value check, string columns: isolate the text columns and preview them.
string_cols = airports_data.select('IATA', 'airport', 'city', 'state', 'country')
string_cols.show()

+----+--------------------+------------------+-----+-------+
|IATA|             airport|              city|state|country|
+----+--------------------+------------------+-----+-------+
| 00M|            Thigpen |       Bay Springs|   MS|    USA|
| 00R|Livingston Municipal|        Livingston|   TX|    USA|
| 00V|         Meadow Lake|  Colorado Springs|   CO|    USA|
| 01G|        Perry-Warsaw|             Perry|   NY|    USA|
| 01J|    Hilliard Airpark|          Hilliard|   FL|    USA|
| 01M|   Tishomingo County|           Belmont|   MS|    USA|
| 02A|         Gragg-Wade |           Clanton|   AL|    USA|
| 02C|             Capitol|        Brookfield|   WI|    USA|
| 02G|   Columbiana County|    East Liverpool|   OH|    USA|
| 03D|    Memphis Memorial|           Memphis|   MO|    USA|
| 04M|      Calhoun County|         Pittsboro|   MS|    USA|
| 04Y|    Hawley Municipal|            Hawley|   MN|    USA|
| 05C|Griffith-Merrillv...|          Griffith|   IN|    USA|
| 05F|Gatesville - City.

In [15]:
# One aggregate per string column: the number of rows that are null or hold a
# placeholder string standing in for a missing value.
null_tokens = ['null', 'NULL', 'NA', 'NaN']
placeholder_or_null_counts = [
    sf.count(sf.when(sf.col(name).isNull() | sf.col(name).isin(null_tokens), name)).alias(name)
    for name in string_cols.columns
]
string_cols_test_null = string_cols.select(*placeholder_or_null_counts)

In [16]:
# Display the placeholder/null count for each string column
string_cols_test_null.show()

+----+-------+----+-----+-------+
|IATA|airport|city|state|country|
+----+-------+----+-----+-------+
|   0|      0|  12|   12|      0|
+----+-------+----+-----+-------+



In [17]:
# Show the rows whose city is null or one of the placeholder strings
city_is_missing = sf.col('city').isNull() | sf.col('city').isin('null', 'NULL', 'NA', 'NaN')
airports_data.filter(city_is_missing).show()

+----+--------------------+----+-----+--------------------+---------+-----------+
|IATA|             airport|city|state|             country| latitude|  longitude|
+----+--------------------+----+-----+--------------------+---------+-----------+
| CLD|MC Clellan-Paloma...|  NA|   NA|                 USA|33.127231|-117.278727|
| HHH|         Hilton Head|  NA|   NA|                 USA|32.224384| -80.697629|
| MIB|           Minot AFB|  NA|   NA|                 USA|48.415769|-101.358039|
| MQT|Marquette County ...|  NA|   NA|                 USA|46.353639| -87.395361|
| RCA|       Ellsworth AFB|  NA|   NA|                 USA|44.145094|-103.103567|
| RDR|     Grand Forks AFB|  NA|   NA|                 USA|47.961167| -97.401167|
| ROP|         Prachinburi|  NA|   NA|            Thailand|14.078333| 101.378334|
| ROR|    Babelthoup/Koror|  NA|   NA|               Palau| 7.367222| 134.544167|
| SCE|     University Park|  NA|   NA|                 USA|40.851206| -77.846302|
| SKA|       Fai

In [18]:
# Manual fill-in values for the airports whose city/state are 'NA' in the
# source file. Layout: IATA code -> [city, state].
MAPPING = {
    "CLD": ["San Diego", "CA"],
    "HHH": ["Hilton Head Island", "SC"],
    "MIB": ["Minot", "ND"],
    "MQT": ["Marquette", "MI"],
    "RCA": ["Rapid City", "SD"],
    "RDR": ["Grand Forks", "ND"],
    "ROP": ["Mueang Prachinburi", "PC"],
    "ROR": ["Airai", "PW350"],
    "SCE": ["State College", "PA"],
    "SKA": ["Spokane", "WA"],
    "SPN": ["San Jose", "TI"],
    "YAP": ["Colonia", "FMYAP"],
}

In [19]:
# Fill in city/state for the airports listed in MAPPING.
#
# The replacement expressions are built up first and applied with a single
# withColumns() call. Calling withColumn(s) on the DataFrame inside the loop
# adds one projection per mapping entry, which the PySpark docs warn against
# because it inflates the query plan. The resulting values are the same:
# rows whose IATA code is a MAPPING key get the mapped city/state, and every
# other row keeps its current values.
city_filled = sf.col('city')
state_filled = sf.col('state')
for key, values in MAPPING.items():
    is_key = sf.col('IATA') == key
    # values[0] is the city, values[1] the state for this IATA code
    city_filled = sf.when(is_key, values[0]).otherwise(city_filled)
    state_filled = sf.when(is_key, values[1]).otherwise(state_filled)

airports_data = airports_data.withColumns({'city': city_filled, 'state': state_filled})

In [20]:
# Re-run the missing-city check after the fill; an empty result means every
# placeholder city has been replaced.
city_still_missing = sf.col('city').isNull() | sf.col('city').isin('null', 'NULL', 'NA', 'NaN')
airports_data.filter(city_still_missing).show()

+----+-------+----+-----+-------+--------+---------+
|IATA|airport|city|state|country|latitude|longitude|
+----+-------+----+-----+-------+--------+---------+
+----+-------+----+-----+-------+--------+---------+



                                                                                

In [21]:
# Spot-check two of the filled airports
airports_data.filter(sf.col('IATA').isin('CLD', 'ROR')).show()

+----+--------------------+---------+-----+-------+---------+-----------+
|IATA|             airport|     city|state|country| latitude|  longitude|
+----+--------------------+---------+-----+-------+---------+-----------+
| CLD|MC Clellan-Paloma...|San Diego|   CA|    USA|33.127231|-117.278727|
| ROR|    Babelthoup/Koror|    Airai|PW350|  Palau| 7.367222| 134.544167|
+----+--------------------+---------+-----+-------+---------+-----------+



In [26]:
# GCS bucket handed to the BigQuery connector via its `temporaryGcsBucket`
# setting (the connector stages data there when writing to BigQuery).
temp_gcs_bucket = 'data_expo_temp_bucket'
spark.conf.set('temporaryGcsBucket', temp_gcs_bucket)

In [27]:
# Destination BigQuery table identifier, assembled from its dot-separated
# parts (assumed to be project.dataset.table — confirm against the GCP project).
bq_project = "data-expo-pipeline"
bq_dataset = "data_expo_dataset"
bq_table = "airport_table"
output_dataset = ".".join((bq_project, bq_dataset, bq_table))

In [28]:
# Write the cleaned airports data to the BigQuery table, replacing any
# existing contents (overwrite mode).
bigquery_writer = airports_data.write.format('bigquery').mode("overwrite")
bigquery_writer.option('table', output_dataset).save()

                                                                                

In [30]:
# Stop the Spark session now that the data has been written
spark.stop()