In [1]:
import sys
import os

# Absolute path of the directory two levels above the current working
# directory; it is put on sys.path so later cells can import from it.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))

# Append only once so re-running this cell does not pile up duplicate entries.
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.append(project_root)

In [2]:
from datapipeline.utils.spark_session import get_spark_session

# Obtain the Spark session for this notebook via the project helper.
spark = get_spark_session("Bronze_NewsAPI")

# Session-level settings applied before any read/write in this notebook.
# NOTE(review): presumably autoMerge lets the Delta MERGE below evolve the
# table schema, and the non-vectorized parquet reader works around type
# mismatches between source files - confirm against the Spark/Delta docs.
session_settings = {
    "spark.databricks.delta.schema.autoMerge.enabled": "true",
    "spark.sql.parquet.enableVectorizedReader": "false",
}
for setting_key, setting_value in session_settings.items():
    spark.conf.set(setting_key, setting_value)

In [3]:
from pyspark.sql.functions import col, xxhash64
from pyspark.sql.types import StringType, IntegerType, DoubleType, LongType, FloatType, BooleanType

newsapi_data_path = "../../sanewsstorage/main/newsapi"

# Load every parquet file found under the source directory, including files
# in nested sub-directories.
parquet_reader = spark.read.option("recursiveFileLookup", "true")
newsapi_df = parquet_reader.parquet(newsapi_data_path)

# Replace dots in column names with underscores so the columns can be
# referenced by plain name in the steps below.
dotted_columns = [name for name in newsapi_df.columns if "." in name]
for dotted_name in dotted_columns:
    newsapi_df = newsapi_df.withColumnRenamed(
        dotted_name, dotted_name.replace(".", "_")
    )

primitive_types = (IntegerType, DoubleType, LongType, FloatType, BooleanType)

# Cast numeric/boolean columns to string; every other column passes through
# unchanged. Column order and names are preserved.
bronze_columns = []
for field in newsapi_df.schema.fields:
    bronze_column = col(field.name)
    if isinstance(field.dataType, primitive_types):
        bronze_column = bronze_column.cast("string").alias(field.name)
    bronze_columns.append(bronze_column)

newsapi_df = newsapi_df.select(bronze_columns)

# Row fingerprint computed over all columns, in column order; the later Delta
# merge matches on it.
# NOTE(review): Spark's hash functions appear to skip NULL inputs, so rows
# that differ only in which column is NULL may share a hash - confirm before
# relying on this as a unique key.
hash_inputs = [col(name) for name in newsapi_df.columns]
newsapi_df = newsapi_df.withColumn("hash", xxhash64(*hash_inputs))

newsapi_df.printSchema()
newsapi_df.show(20)

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- url: string (nullable = true)
 |-- urlToImage: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- content: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- source_name: string (nullable = true)
 |-- hash: long (nullable = false)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+
|              author|               title|         description|                 url|          urlToImage|         publishedAt|             content|      source_id|         source_name|                hash|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------------

In [4]:
from delta.tables import DeltaTable
import os

bronze_path = "../../sanewsstorage/bronze/deltatables/newsapi_delta"

# Keep a single row per hash before writing. The merge below only compares
# source rows against the *target*, so duplicate rows inside the same batch
# would each be inserted; the initial load would likewise persist them as-is.
newsapi_unique_df = newsapi_df.dropDuplicates(["hash"])

if DeltaTable.isDeltaTable(spark, bronze_path):
    # Table already exists: insert only rows whose hash is not present yet.
    # Matched rows are left untouched (no update/delete clause).
    delta_table = DeltaTable.forPath(spark, bronze_path)

    (
        delta_table.alias("t")
        .merge(
            newsapi_unique_df.alias("s"),
            "t.hash = s.hash"
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

else:
    # First run: no Delta table at this path yet, so create it from the batch.
    (
        newsapi_unique_df.write
          .format("delta")
          .mode("overwrite")
          .save(bronze_path)
    )

In [5]:
from delta.tables import DeltaTable

# Re-open the bronze table from disk to verify what was actually written.
delta_table = DeltaTable.forPath(spark, bronze_path)
df = delta_table.toDF()
df.show(5)

# inputFiles is a method: it must be called to get the list of data files
# backing the DataFrame (without the parentheses only the bound-method repr
# would be printed).
files = df.inputFiles()
print(files)


+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+
|              author|               title|         description|                 url|          urlToImage|         publishedAt|             content|         source_id|       source_name|                hash|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+
|Tara Prindiville,...|Trump administrat...|A new rule issued...|https://www.nbcne...|https://media-cld...|2026-02-06T00:59:19Z|WASHINGTON The Tr...|          nbc-news|          NBC News|-5159804459787423082|
|   Ellen Chamberlain|Bain Barbecue ope...|The Cooper-Young ...|https://dailymemp...|https://thememphi...|2026-02-06T00:58:39Z|Ellen Chamberlain...|              NULL|T

In [6]:
# Stop the Spark session created at the top of the notebook now that all
# reads and writes are done.
spark.stop()