# PySpark → DataHub (OpenLineage)

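Read the staging Hacker News RSS tables (`stg_hot_articles`, `stg_newest_articles`) from BigQuery,
combine them, and write the result to the `articles_spark` table in the `hackernews_rss` dataset.
The Spark session is configured with the OpenLineage listener so that lineage events for these reads and writes are sent over HTTP.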

### SparkSession Setup

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Comma-separated Maven coordinates (group:artifact:version) for the
# BigQuery connector and the OpenLineage Spark agent.
spark_jars = ",".join(
    f"{group}:{artifact}:{version}"
    for group, artifact, version in (
        ("com.google.cloud.spark", "spark-4.0-bigquery", "0.44.0"),
        ("io.openlineage", "openlineage-spark_2.13", "1.43.0"),
    )
)

In [3]:
# JVM flags for the driver: one `--add-opens` entry per java.base package,
# each opened to ALL-UNNAMED, joined with single spaces.
java_add_opens = " ".join(
    f"--add-opens=java.base/{package}=ALL-UNNAMED"
    for package in (
        "java.security",
        "java.lang",
        "java.lang.invoke",
        "java.lang.reflect",
        "java.nio",
        "java.util",
    )
)

# Session configuration: package resolution, driver JVM flags, and the
# OpenLineage listener with its HTTP transport (URL + endpoint path).
spark_conf = {
    "spark.jars.packages": spark_jars,
    "spark.driver.extraJavaOptions": java_add_opens,
    "spark.extraListeners": "io.openlineage.spark.agent.OpenLineageSparkListener",
    "spark.openlineage.transport.type": "http",
    "spark.openlineage.transport.url": "http://localhost:9090",
    "spark.openlineage.transport.endpoint": "/openapi/openlineage/api/v1/lineage",
}

# Local-mode session using all available cores; every config entry is
# applied in declaration order before the session is created (or reused).
builder = SparkSession.builder.appName("jupyter-openlineage").master("local[*]")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

spark = builder.getOrCreate()

:: loading settings :: url = jar:file:/Users/iobruno/Vault/data-catalog-labs/spark/.venv/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/iobruno/.ivy2.5.2/cache
The jars for the packages stored in: /Users/iobruno/.ivy2.5.2/jars
com.google.cloud.spark#spark-4.0-bigquery added as a dependency
io.openlineage#openlineage-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8edce058-d8ec-421e-a61a-096e0521dabc;1.0
	confs: [default]
	found com.google.cloud.spark#spark-4.0-bigquery;0.44.0 in central
	found com.google.cloud.spark#spark-bigquery-dsv2-common;0.44.0 in central
	found com.google.cloud.spark#spark-bigquery-connector-common;0.44.0 in central
	found com.google.cloud.spark#bigquery-connector-common;0.44.0 in central
	found com.google.api.grpc#grpc-google-cloud-bigquerystorage-v1;3.16.1 in central
	found io.grpc#grpc-api;1.74.0 in central
	found com.googl

### Read Staging Tables

In [4]:
GCP_PROJECT = "iobruno-gcp-labs"
STG_DATASET = "stg_hackernews_rss"

# BigQuery connector options for the hot-articles staging source.
# `viewsEnabled` / `materializationDataset` are set so the source can be
# read even when it is a view (see the connector docs for exact semantics).
hot_read_options = {
    "table": f"{GCP_PROJECT}.{STG_DATASET}.stg_hot_articles",
    "viewsEnabled": "true",
    "materializationDataset": STG_DATASET,
}

df_hot = spark.read.format("bigquery").options(**hot_read_options).load()

# Inspect the schema and the first five rows, without truncating cell values.
df_hot.printSchema()
df_hot.show(5, truncate=False)

root
 |-- uid: string (nullable = true)
 |-- title: string (nullable = true)
 |-- username: string (nullable = true)
 |-- url: string (nullable = true)
 |-- redirect_url: string (nullable = true)
 |-- published_at: timestamp (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+--------------------------------+--------------------------------------------------------------------------------+-----------+---------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+-------------------+
|uid                             |title                                                                           |username   |url                                          |redirect_url                                                                                                           |published_at       |
+--------------------------------+--------------------------------------------------------------------------------+-----------+---------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+-------------------+
|7b54e42263906dd156b7743931038f1e|Byte magazine artist Rob

                                                                                

In [5]:
# BigQuery connector options for the newest-articles staging source; uses
# the same project/dataset constants as the hot-articles read.
newest_read_options = {
    "table": f"{GCP_PROJECT}.{STG_DATASET}.stg_newest_articles",
    "viewsEnabled": "true",
    "materializationDataset": STG_DATASET,
}

df_newest = spark.read.format("bigquery").options(**newest_read_options).load()

# Inspect the schema and the first five rows, without truncating cell values.
df_newest.printSchema()
df_newest.show(5, truncate=False)

root
 |-- uid: string (nullable = true)
 |-- title: string (nullable = true)
 |-- username: string (nullable = true)
 |-- url: string (nullable = true)
 |-- redirect_url: string (nullable = true)
 |-- published_at: timestamp (nullable = true)

+--------------------------------+-------------------------------------------------------------------------------+------------+---------------------------------------------+---------------------------------------------------------------------------------------------------------+-------------------+
|uid                             |title                                                                          |username    |url                                          |redirect_url                                                                                             |published_at       |
+--------------------------------+-------------------------------------------------------------------------------+------------+-----------------------------

                                                                                

### Combine Articles

In [6]:
from pyspark.sql.functions import lit

# Tag every row with the feed it came from ("hot" or "newest") so the
# origin is still known after the two frames are combined.
df_hot_tagged, df_newest_tagged = (
    frame.withColumn("source_feed", lit(feed_name))
    for frame, feed_name in ((df_hot, "hot"), (df_newest, "newest"))
)

# Union by column name rather than by position.
df_articles = df_hot_tagged.unionByName(df_newest_tagged)

total_articles = df_articles.count()
print(f"Total articles: {total_articles}")
df_articles.show(10, truncate=False)

Total articles: 40
+--------------------------------+--------------------------------------------------------------------------------+-----------+---------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+-------------------+-----------+
|uid                             |title                                                                           |username   |url                                          |redirect_url                                                                                                           |published_at       |source_feed|
+--------------------------------+--------------------------------------------------------------------------------+-----------+---------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+-------------------+-----------+
|7b

### Write to BigQuery

In [7]:
TARGET_DATASET = "hackernews_rss"
TARGET_TABLE = f"{GCP_PROJECT}.{TARGET_DATASET}.articles_spark"

# Configure the BigQuery writer: target table, the connector's "direct"
# write method, and overwrite mode for the save.
articles_writer = (
    df_articles.write
    .format("bigquery")
    .options(table=TARGET_TABLE, writeMethod="direct")
    .mode("overwrite")
)
articles_writer.save()

print(f"Written to {TARGET_TABLE}")

                                                                                

Written to iobruno-gcp-labs.hackernews_rss.articles_spark


### Cleanup

In [8]:
# Stop the SparkSession created in the "SparkSession Setup" section now that
# the write has finished; cells that use `spark` cannot run after this.
spark.stop()