In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import functions as sf

In [2]:
# Connector jars passed to Spark via "spark.jars" as one comma-separated string:
# the GCS Hadoop connector and the spark-bigquery connector (per the jar names).
connector_jars = (
    "/opt/bitnami/spark/jars/gcs-connector-hadoop3-latest.jar,"
    "/opt/bitnami/spark/jars/spark-bigquery-with-dependencies_2.12-0.42.4.jar"
)

# Create the session (or reuse an existing one) against the cluster master.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .config("spark.jars", connector_jars)
    .appName('gcs-bq-pyspark')
    .getOrCreate()
)

25/08/12 12:42:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Set the Spark log level to WARN.
spark.sparkContext.setLogLevel("WARN")

# Hadoop settings for gs:// access: the filesystem implementation class for the
# scheme, and service-account authentication using a JSON key file.
gcs_hadoop_settings = {
    'fs.gs.impl': 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem',
    'fs.gs.auth.service.account.enable': 'true',
    'google.cloud.auth.service.account.json.keyfile': "/opt/keys/credentials.json",
}

# Apply every setting to the session's Hadoop configuration, in the order listed.
hadoop_conf = spark._jsc.hadoopConfiguration()
for setting_name, setting_value in gcs_hadoop_settings.items():
    hadoop_conf.set(setting_name, setting_value)

In [4]:
# Location of the source CSV: bucket name, object name, and the gs:// URI
# assembled from the two.
bucket_name = "data_expo_bucket"
file_name = "variable_descriptions_data.csv"
file_path = "gs://{}/{}".format(bucket_name, file_name)

In [5]:
# Load the CSV from GCS, taking column names from the first line and letting
# Spark infer the column types.
# NOTE(review): the displayed output shows the file's first line is a title
# ("Variable descriptions"), so the other two columns get the default names
# _c1/_c2 and the real header row (Name / Description) is loaded as data;
# the rename and NULL-filter cells further down compensate for this.
vardes_data = spark.read.csv(file_path, header=True, inferSchema=True)

                                                                                

In [7]:
# Show the data as loaded, with full (untruncated) cell contents
vardes_data.show(truncate=False)

+---------------------+-----------------+--------------------------------------+
|Variable descriptions|_c1              |_c2                                   |
+---------------------+-----------------+--------------------------------------+
|NULL                 |NULL             |NULL                                  |
|NULL                 |Name             |Description                           |
|1                    |Year             |1987-2008                             |
|2                    |Month            |12-Jan                                |
|3                    |DayofMonth       |31-Jan                                |
|4                    |DayOfWeek        |1 (Monday) - 7 (Sunday)               |
|5                    |DepTime          |actual departure time (local, hhmm)   |
|6                    |CRSDepTime       |scheduled departure time (local, hhmm)|
|7                    |ArrTime          |actual arrival time (local, hhmm)     |
|8                    |CRSAr

                                                                                

In [9]:
# Display the DataFrame schema (column names and the types inferred on read)
vardes_data.schema

StructType([StructField('Variable descriptions', IntegerType(), True), StructField('_c1', StringType(), True), StructField('_c2', StringType(), True)])

In [10]:
# Count the rows as loaded, before any cleaning is applied
vardes_data.count()

31

In [13]:
# Rename the auto-named columns _c1/_c2 to "Name" and "Description"
# (the first column, "Variable descriptions", keeps its name).
vardes_data = (
    vardes_data
    .withColumnRenamed("_c1", "Name")
    .withColumnRenamed("_c2", "Description")
)

In [15]:
# Show the data again to confirm the new column names
vardes_data.show(truncate=False)

+---------------------+-----------------+--------------------------------------+
|Variable descriptions|Name             |Description                           |
+---------------------+-----------------+--------------------------------------+
|NULL                 |NULL             |NULL                                  |
|NULL                 |Name             |Description                           |
|1                    |Year             |1987-2008                             |
|2                    |Month            |12-Jan                                |
|3                    |DayofMonth       |31-Jan                                |
|4                    |DayOfWeek        |1 (Monday) - 7 (Sunday)               |
|5                    |DepTime          |actual departure time (local, hhmm)   |
|6                    |CRSDepTime       |scheduled departure time (local, hhmm)|
|7                    |ArrTime          |actual arrival time (local, hhmm)     |
|8                    |CRSAr

                                                                                

In [16]:
# Keep only rows whose "Variable descriptions" value is not NULL. In the output
# shown above, the NULL rows are the empty line and the embedded header row.
has_row_number = sf.col("Variable descriptions").isNotNull()
vardes_data = vardes_data.where(has_row_number)

In [18]:
# Show the data after rows with a NULL "Variable descriptions" value were removed
vardes_data.show(truncate=False)

+---------------------+-----------------+--------------------------------------+
|Variable descriptions|Name             |Description                           |
+---------------------+-----------------+--------------------------------------+
|1                    |Year             |1987-2008                             |
|2                    |Month            |12-Jan                                |
|3                    |DayofMonth       |31-Jan                                |
|4                    |DayOfWeek        |1 (Monday) - 7 (Sunday)               |
|5                    |DepTime          |actual departure time (local, hhmm)   |
|6                    |CRSDepTime       |scheduled departure time (local, hhmm)|
|7                    |ArrTime          |actual arrival time (local, hhmm)     |
|8                    |CRSArrTime       |scheduled arrival time (local, hhmm)  |
|9                    |UniqueCarrier    |unique carrier code                   |
|10                   |Fligh

                                                                                

In [20]:
# Count the rows remaining after the NULL filter on "Variable descriptions"
vardes_data.count()

                                                                                

29

In [22]:
# Target for the BigQuery write.
# NOTE(review): the three dot-separated parts look like project.dataset.table,
# so this names a table rather than a dataset — confirm.
output_dataset = "data-expo-pipeline.data_expo_dataset.variable_description_table"

# Bucket name registered under the 'temporaryGcsBucket' session setting
# (presumably the connector's staging bucket for writes — confirm).
temp_bucket = 'data_expo_temp_bucket'
spark.conf.set('temporaryGcsBucket', temp_bucket)

In [23]:
# Write the cleaned DataFrame to the table named by `output_dataset` using the
# 'bigquery' data source; "overwrite" mode replaces any existing contents.
(
    vardes_data.write
    .format('bigquery')
    .option('table', output_dataset)
    .mode("overwrite")
    .save()
)

                                                                                

In [24]:
# Stop the Spark session now that the notebook's work is done
spark.stop()