In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as sf

In [2]:
spark = SparkSession \
        .builder \
        .master("spark://spark-master:7077") \
        .config(
        "spark.jars",
        "/opt/bitnami/spark/jars/gcs-connector-hadoop3-latest.jar,"
        "/opt/bitnami/spark/jars/spark-bigquery-with-dependencies_2.12-0.42.4.jar"
        ) \
        .appName('gcs-bq-pyspark') \
        .getOrCreate()

25/08/12 13:34:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
spark.sparkContext.setLogLevel("WARN")

spark._jsc.hadoopConfiguration().set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
spark._jsc.hadoopConfiguration().set('fs.gs.auth.service.account.enable', 'true')
spark._jsc.hadoopConfiguration().set('google.cloud.auth.service.account.json.keyfile', "/opt/keys/credentials.json")

In [4]:
# File information
bucket_name = "data_expo_bucket"
file_name = "plane_data.csv"
file_path = f"gs://{bucket_name}/{file_name}"

In [113]:
# Read CSV file
planes_data = spark.read.option("inferSchema", "true").option("header", "true").csv(file_path)

                                                                                

In [114]:
# Show the data
planes_data.show()

+-------+----+------------+----------+-----+------+-------------+-----------+----+
|tailnum|type|manufacturer|issue_date|model|status|aircraft_type|engine_type|year|
+-------+----+------------+----------+-----+------+-------------+-----------+----+
| N050AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N051AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N052AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N054AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N055AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N056AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N057AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N058AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N059AA|NULL|        NULL|      NULL| NULL|  NULL|         NULL|       NULL|NULL|
| N0

In [115]:
# Read file schema
planes_data.schema

StructType([StructField('tailnum', StringType(), True), StructField('type', StringType(), True), StructField('manufacturer', StringType(), True), StructField('issue_date', StringType(), True), StructField('model', StringType(), True), StructField('status', StringType(), True), StructField('aircraft_type', StringType(), True), StructField('engine_type', StringType(), True), StructField('year', StringType(), True)])

In [116]:
# Count number of records
planes_data.count()

5029

In [117]:
# Check the number of null in each column
planes_data_test_null = planes_data.select([sf.count(sf.when(sf.col(c).isin(['null', 'NULL', 'NA', 'NaN']) | sf.col(c).isNull(), c)).alias(c) for c in planes_data.columns])

In [118]:
planes_data_test_null.show()

+-------+----+------------+----------+-----+------+-------------+-----------+----+
|tailnum|type|manufacturer|issue_date|model|status|aircraft_type|engine_type|year|
+-------+----+------------+----------+-----+------+-------------+-----------+----+
|      0| 549|         549|       549|  549|   549|          549|        549| 549|
+-------+----+------------+----------+-----+------+-------------+-----------+----+



In [119]:
# Filter NULL values and show result
planes_data = planes_data.filter(planes_data.type.isNotNull())

In [120]:
planes_data.show(truncate=False)

+-------+-----------+----------------+----------+---------+------+-----------------------+-----------+----+
|tailnum|type       |manufacturer    |issue_date|model    |status|aircraft_type          |engine_type|year|
+-------+-----------+----------------+----------+---------+------+-----------------------+-----------+----+
|N10156 |Corporation|EMBRAER         |02/13/2004|EMB-145XR|Valid |Fixed Wing Multi-Engine|Turbo-Fan  |2004|
|N102UW |Corporation|AIRBUS INDUSTRIE|05/26/1999|A320-214 |Valid |Fixed Wing Multi-Engine|Turbo-Fan  |1998|
|N10323 |Corporation|BOEING          |07/01/1997|737-3TO  |Valid |Fixed Wing Multi-Engine|Turbo-Jet  |1986|
|N103US |Corporation|AIRBUS INDUSTRIE|06/18/1999|A320-214 |Valid |Fixed Wing Multi-Engine|Turbo-Fan  |1999|
|N104UA |Corporation|BOEING          |01/26/1998|747-422  |Valid |Fixed Wing Multi-Engine|Turbo-Fan  |1998|
|N104UW |Corporation|AIRBUS INDUSTRIE|07/02/1999|A320-214 |Valid |Fixed Wing Multi-Engine|Turbo-Fan  |1999|
|N10575 |Corporation|EMBRAER

In [121]:
# Check number of NULL values after removing them
planes_data.select([sf.count(sf.when(sf.col(c).isin(['null', 'NULL', 'NA', 'NaN']) | sf.col(c).isNull(), c)).alias(c) for c in planes_data.columns]).show()

+-------+----+------------+----------+-----+------+-------------+-----------+----+
|tailnum|type|manufacturer|issue_date|model|status|aircraft_type|engine_type|year|
+-------+----+------------+----------+-----+------+-------------+-----------+----+
|      0|   0|           0|         0|    0|     0|            0|          0|   0|
+-------+----+------------+----------+-----+------+-------------+-----------+----+



In [122]:
# Check number of records after removing NULL values
planes_data.count()

4480

In [123]:
# Check unique tailnum values
planes_data.select(sf.count(sf.col('tailnum'))).show()

+--------------+
|count(tailnum)|
+--------------+
|          4480|
+--------------+



In [125]:
# Convert columns to suitable data type
planes_data = planes_data.withColumns({'issue_date': sf.to_date(planes_data.issue_date, "MM/dd/yyyy"), 'year': sf.col('year').cast('int')})

In [126]:
planes_data.show(truncate=False)

+-------+-----------+----------------+----------+---------+------+-----------------------+-----------+----+
|tailnum|type       |manufacturer    |issue_date|model    |status|aircraft_type          |engine_type|year|
+-------+-----------+----------------+----------+---------+------+-----------------------+-----------+----+
|N10156 |Corporation|EMBRAER         |2004-02-13|EMB-145XR|Valid |Fixed Wing Multi-Engine|Turbo-Fan  |2004|
|N102UW |Corporation|AIRBUS INDUSTRIE|1999-05-26|A320-214 |Valid |Fixed Wing Multi-Engine|Turbo-Fan  |1998|
|N10323 |Corporation|BOEING          |1997-07-01|737-3TO  |Valid |Fixed Wing Multi-Engine|Turbo-Jet  |1986|
|N103US |Corporation|AIRBUS INDUSTRIE|1999-06-18|A320-214 |Valid |Fixed Wing Multi-Engine|Turbo-Fan  |1999|
|N104UA |Corporation|BOEING          |1998-01-26|747-422  |Valid |Fixed Wing Multi-Engine|Turbo-Fan  |1998|
|N104UW |Corporation|AIRBUS INDUSTRIE|1999-07-02|A320-214 |Valid |Fixed Wing Multi-Engine|Turbo-Fan  |1999|
|N10575 |Corporation|EMBRAER

In [128]:
# Check schema after type casting
planes_data.schema

StructType([StructField('tailnum', StringType(), True), StructField('type', StringType(), True), StructField('manufacturer', StringType(), True), StructField('issue_date', DateType(), True), StructField('model', StringType(), True), StructField('status', StringType(), True), StructField('aircraft_type', StringType(), True), StructField('engine_type', StringType(), True), StructField('year', IntegerType(), True)])

In [130]:
# Clean data, can be uploaded to BigQuery as Table
spark.conf.set('temporaryGcsBucket', 'data_expo_temp_bucket')
output_dataset = "data-expo-pipeline.data_expo_dataset.plane_table"

In [131]:
planes_data.write.format('bigquery').option('table', output_dataset).mode("overwrite").save()

                                                                                

In [132]:
spark.stop()