In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as sf

In [2]:
# Comma-separated list of connector jars passed to Spark via "spark.jars":
# the GCS Hadoop connector and the Spark BigQuery connector.
connector_jars = (
    "/opt/bitnami/spark/jars/gcs-connector-hadoop3-latest.jar,"
    "/opt/bitnami/spark/jars/spark-bigquery-with-dependencies_2.12-0.42.4.jar"
)

# Configure the session step by step, then create it (or reuse an existing one).
session_builder = SparkSession.builder.master("spark://spark-master:7077")
session_builder = session_builder.config("spark.jars", connector_jars)
session_builder = session_builder.appName('gcs-bq-pyspark')
spark = session_builder.getOrCreate()

25/08/12 09:39:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Keep Spark's logging at WARN level.
spark.sparkContext.setLogLevel("WARN")

# Hadoop settings for reading gs:// paths: the filesystem implementation class,
# service-account authentication, and the JSON key file to authenticate with.
gcs_hadoop_settings = {
    'fs.gs.impl': 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem',
    'fs.gs.auth.service.account.enable': 'true',
    'google.cloud.auth.service.account.json.keyfile': "/opt/keys/credentials.json",
}

# Fetch the Hadoop configuration once and apply every setting to it in order.
hadoop_conf = spark._jsc.hadoopConfiguration()
for setting_name, setting_value in gcs_hadoop_settings.items():
    hadoop_conf.set(setting_name, setting_value)

In [4]:
# Location of the carriers CSV: bucket, object name, and the gs:// URI built from them
bucket_name = "data_expo_bucket"
file_name = "carriers_data.csv"
file_path = "gs://{}/{}".format(bucket_name, file_name)

In [5]:
# Load the carriers CSV from file_path, treating the first row as the header
# and letting Spark infer the column types.
csv_reader = spark.read.options(header="true", inferSchema="true")
carriers_data = csv_reader.csv(file_path)

                                                                                

In [13]:
# Preview the loaded carriers DataFrame; truncate=False prints full cell values
# rather than cutting off long strings.
carriers_data.show(truncate=False)

+----+--------------------------------------------+
|Code|Description                                 |
+----+--------------------------------------------+
|02Q |Titan Airways                               |
|04Q |Tradewind Aviation                          |
|05Q |Comlux Aviation, AG                         |
|06Q |Master Top Linhas Aereas Ltd.               |
|07Q |Flair Airlines Ltd.                         |
|09Q |Swift Air, LLC                              |
|0BQ |DCA                                         |
|0CQ |ACM AIR CHARTER GmbH                        |
|0FQ |Maine Aviation Aircraft Charter, LLC        |
|0GQ |Inter Island Airways, d/b/a Inter Island Air|
|0HQ |Polar Airlines de Mexico d/b/a Nova Air     |
|0J  |JetClub AG                                  |
|0JQ |Vision Airlines                             |
|0KQ |Mokulele Flight Services, Inc.              |
|0LQ |Metropix UK, LLP.                           |
|0MQ |Multi-Aero, Inc. d/b/a Air Choice One       |
|0Q  |Flying

In [25]:
# Count the total number of rows in the carriers DataFrame
carriers_data.count()

1491

In [26]:
# Count the number of unique carrier codes. countDistinct counts each distinct
# Code value once (a plain count would only count non-null rows), so comparing
# this figure with the total row count reveals whether any code is duplicated.
carriers_data.select(sf.countDistinct(sf.col("Code"))).show()

+-----------+
|count(Code)|
+-----------+
|       1491|
+-----------+



In [23]:
# Inspect the schema of the carriers DataFrame (column names, types, nullability)
carriers_data.schema

StructType([StructField('Code', StringType(), True), StructField('Description', StringType(), True)])

In [24]:
# Test null values: for every column, count the rows whose value is either a
# real null or one of the textual null-like placeholders below.
null_like_tokens = ['null', 'NULL', 'NA', 'NaN']


def _null_like_count(column_name):
    """Return an aggregate column counting null-like values in `column_name`.

    A row is counted when its value is one of `null_like_tokens` or is null.
    The resulting column is aliased to the source column's name.
    """
    is_null_like = sf.col(column_name).isin(null_like_tokens) | sf.col(column_name).isNull()
    # `when` yields a non-null value only for matching rows, and `count`
    # ignores nulls, so this counts exactly the matching rows.
    return sf.count(sf.when(is_null_like, column_name)).alias(column_name)


carriers_data_test_null = carriers_data.select(
    [_null_like_count(column_name) for column_name in carriers_data.columns]
)

In [19]:
# Display the per-column tally of null-like values held in carriers_data_test_null
carriers_data_test_null.show()

+----+-----------+
|Code|Description|
+----+-----------+
|   1|          0|
+----+-----------+



In [20]:
# 'NA' may be a genuine carrier code rather than a missing value:
# display the row(s) whose Code is exactly 'NA' to check.
is_na_code = sf.col("Code") == 'NA'
carriers_data.where(is_na_code).show(truncate=False)

+----+-----------------------+
|Code|Description            |
+----+-----------------------+
|NA  |North American Airlines|
+----+-----------------------+



                                                                                

In [22]:
# The data is clean, so it is pushed to BigQuery unmodified.
# Fully-qualified name of the destination BigQuery table.
output_dataset = "data-expo-pipeline.data_expo_dataset.carrier_table"
# GCS bucket registered under the 'temporaryGcsBucket' setting for the write.
spark.conf.set('temporaryGcsBucket', 'data_expo_temp_bucket')

In [27]:
# Write the carriers DataFrame to the BigQuery table named by output_dataset,
# using "overwrite" save mode.
bigquery_writer = carriers_data.write.format('bigquery').option('table', output_dataset)
bigquery_writer.mode("overwrite").save()

                                                                                

In [28]:
# Stop the Spark session now that the notebook's work is finished
spark.stop()