In [1]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# make sure it is on the import path (presumably so project packages such as
# `datapipeline` can be imported — confirm against the repository layout).
project_root = os.path.abspath("../..")

already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)

In [2]:
from datapipeline.utils.spark_session import get_spark_session

# Session used by every later cell in this notebook.
spark = get_spark_session("Bronze_NewsData")

# Session-level options, applied in the order listed.
# NOTE(review): the reason for disabling the vectorized parquet reader is not
# visible here — confirm it is still required.
spark_settings = {
    "spark.databricks.delta.schema.autoMerge.enabled": "true",
    "spark.sql.parquet.enableVectorizedReader": "false",
}
for setting_name, setting_value in spark_settings.items():
    spark.conf.set(setting_name, setting_value)

In [3]:
from pyspark.sql.functions import col, xxhash64
from pyspark.sql.types import StringType, IntegerType, DoubleType, LongType, FloatType, BooleanType

newsdata_data_path = "../../sanewsstorage/main/newsdata"

# Load every parquet file found under the source directory, including files
# in nested sub-directories.
newsdata_reader = spark.read.option("recursiveFileLookup", "true")
newsdata_df = newsdata_reader.parquet(newsdata_data_path)

# Replace dots in column names with underscores before the columns are
# referenced through col() below.
dotted_names = [name for name in newsdata_df.columns if "." in name]
for dotted_name in dotted_names:
    newsdata_df = newsdata_df.withColumnRenamed(
        dotted_name, dotted_name.replace(".", "_")
    )

primitive_types = (IntegerType, DoubleType, LongType, FloatType, BooleanType)

# Cast the listed primitive-typed columns to string; every other column
# (already-string columns, arrays, ...) is passed through unchanged.
projected_columns = []
for field in newsdata_df.schema.fields:
    projected = col(field.name)
    if isinstance(field.dataType, primitive_types):
        projected = projected.cast("string").alias(field.name)
    projected_columns.append(projected)
newsdata_df = newsdata_df.select(projected_columns)

# Row fingerprint computed over all columns present at this point; the merge
# into the bronze table keys on this value.
hash_inputs = [col(name) for name in newsdata_df.columns]
newsdata_df = newsdata_df.withColumn("hash", xxhash64(*hash_inputs))

newsdata_df.printSchema()
newsdata_df.show(20)

root
 |-- article_id: string (nullable = true)
 |-- link: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- content: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- creator: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- language: string (nullable = true)
 |-- country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- datatype: string (nullable = true)
 |-- pubDate: string (nullable = true)
 |-- pubDateTZ: string (nullable = true)
 |-- fetched_at: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- video_url: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- source_name: string (nullable = true)
 |-- source_priority: string (nullable = true)
 |-- source_url: string (nullable = t

In [4]:
from delta.tables import DeltaTable
import os

bronze_path = "../../sanewsstorage/bronze/deltatables/newsdata_delta"

# An insert-only MERGE compares source rows against the target table, not
# against each other, so rows sharing a hash inside this batch would all be
# inserted. Collapse the batch to one row per hash before writing.
newsdata_unique_df = newsdata_df.dropDuplicates(["hash"])

if DeltaTable.isDeltaTable(spark, bronze_path):
    # Incremental load: insert only rows whose hash is not already stored.
    delta_table = DeltaTable.forPath(spark, bronze_path)

    (
        delta_table.alias("t")
        .merge(
            newsdata_unique_df.alias("s"),
            "t.hash = s.hash"
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

else:
    # First load: there is no Delta table at the path yet, so create it from
    # the de-duplicated batch.
    (
        newsdata_unique_df.write
          .format("delta")
          .mode("overwrite")
          .save(bronze_path)
    )

In [5]:
from delta.tables import DeltaTable

# Re-open the bronze table from storage and preview a few rows to check the
# result of the write above.
delta_table = DeltaTable.forPath(spark, bronze_path)
df = delta_table.toDF()
df.show(5)

# inputFiles is a method: it has to be called to obtain the list of data files
# backing the DataFrame (without the call, the bound method itself is printed).
files = df.inputFiles()
print(files)


+--------------------+--------------------+-------------------------------------+-------------------------------------+--------------------+--------------------------------+--------------------+----------+----------+--------------------+--------+-------------------+---------+-------------------+--------------------+---------+------------+--------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+--------------------+
|          article_id|                link|                                title|                          description|             content|                        keywords|             creator|  language|   country|            category|datatype|            pubDate|pubDateTZ|         fetched_at|           image_url|video_url|   source_id|   source_name|source_priority|          source_url|         source_icon|           senti

In [6]:
# Stop the Spark session now that the load and verification are done; any
# cell run after this needs a new session.
spark.stop()