In [1]:
import sys
import os

# Absolute path of the directory two levels above the notebook's working
# directory; project packages are imported relative to it.
project_root = os.path.abspath("../..")

# Register the root only once so re-running this cell does not pile up
# duplicate entries on the import path.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [30]:
from datapipeline.utils.spark_session import get_spark_session

# Spark session for the bronze-layer NewsDataHub load, built by the project helper.
spark = get_spark_session("Bronze_NewsDataHub")

# Session-level overrides applied in declaration order.
# - autoMerge: presumably lets Delta evolve the table schema during MERGE — confirm.
# - vectorized parquet reader is switched off; the reason is not recorded here.
spark_conf_overrides = {
    "spark.databricks.delta.schema.autoMerge.enabled": "true",
    "spark.sql.parquet.enableVectorizedReader": "false",
}
for conf_key, conf_value in spark_conf_overrides.items():
    spark.conf.set(conf_key, conf_value)

In [24]:
from pyspark.sql.functions import col, xxhash64
from pyspark.sql.types import StringType, IntegerType, DoubleType, LongType, FloatType, BooleanType

newsdatahub_data_path = "../../sanewsstorage/main/newsdatahub"

# Load every parquet file below the landing directory, descending into sub-folders.
parquet_reader = spark.read.option("recursiveFileLookup", "true")
newsdatahub_df = parquet_reader.parquet(newsdatahub_data_path)

# Replace dots in column names with underscores so later column references
# do not have to deal with dotted names.
dotted_names = [name for name in newsdatahub_df.columns if "." in name]
for dotted in dotted_names:
    newsdatahub_df = newsdatahub_df.withColumnRenamed(dotted, dotted.replace(".", "_"))

primitive_types = (IntegerType, DoubleType, LongType, FloatType, BooleanType)

# Cast numeric and boolean columns to string; every other column passes
# through unchanged (arrays stay arrays).
projected_columns = []
for field in newsdatahub_df.schema.fields:
    projected = col(field.name)
    if isinstance(field.dataType, primitive_types):
        projected = projected.cast("string").alias(field.name)
    projected_columns.append(projected)
newsdatahub_df = newsdatahub_df.select(projected_columns)

# Row fingerprint over all columns; the bronze MERGE uses it as the match key.
# NOTE(review): Spark hash functions are believed to skip NULL inputs, so rows
# that differ only in *which* column is NULL may collide — confirm before
# treating `hash` as a strictly unique key.
hash_inputs = [col(name) for name in newsdatahub_df.columns]
newsdatahub_df = newsdatahub_df.withColumn("hash", xxhash64(*hash_inputs))

newsdatahub_df.printSchema()
newsdatahub_df.show(20)

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- source_title: string (nullable = true)
 |-- source_link: string (nullable = true)
 |-- article_link: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- topics: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- pub_date: string (nullable = true)
 |-- creator: string (nullable = true)
 |-- content: string (nullable = true)
 |-- media_url: string (nullable = true)
 |-- media_type: string (nullable = true)
 |-- media_description: string (nullable = true)
 |-- media_credit: string (nullable = true)
 |-- media_thumbnail: string (nullable = true)
 |-- language: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- source_country: string (nullable = true)
 |-- source_political_leaning: string (nullable = true)
 |-- source_reliability_score: string (nullable = true

In [27]:
from delta.tables import DeltaTable
import os

bronze_path = "../../sanewsstorage/bronze/deltatables/newsdatahub_delta"

# An insert-only MERGE de-duplicates source rows against the *target* only.
# Identical rows inside the source batch (e.g. the same article present in
# several parquet files) would all be inserted, and the first-load branch
# would write them verbatim. Collapse the batch on the row hash first so
# both paths store each distinct row once.
bronze_source_df = newsdatahub_df.dropDuplicates(["hash"])

if DeltaTable.isDeltaTable(spark, bronze_path):
    # Incremental load: append only rows whose hash is not yet in the table.
    delta_table = DeltaTable.forPath(spark, bronze_path)

    (
        delta_table.alias("t")
        .merge(
            bronze_source_df.alias("s"),
            "t.hash = s.hash"
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

else:
    # First load: no Delta table at this path yet, so create it from the batch.
    (
        bronze_source_df.write
          .format("delta")
          .mode("overwrite")
          .save(bronze_path)
    )

In [28]:
from delta.tables import DeltaTable

# Re-open the bronze table and preview a few rows as a sanity check on the write.
delta_table = DeltaTable.forPath(spark, bronze_path)
df = delta_table.toDF()
df.show(5)

# `inputFiles` is a method; it must be called to obtain the list of data files
# backing the DataFrame (referencing it bare only prints a bound-method repr).
files = df.inputFiles()
print(files)


+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------+---------+--------------+------------------------+------------------------+---------------+--------------------+
|                  id|               title|     source_title|         source_link|        article_link|            keywords|              topics|         description|           pub_date|          creator|             content|           media_url|media_type|   media_description|        media_credit|     media_thumbnail|language|source_id|source_country|source_political_leaning|source_reliability_score|    source_type|                hash|
+--------------------+--------------------+-----------------+--------------------+--------------------+-------------

In [31]:
# Shut down the Spark session created earlier in the notebook; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()