In [1]:
import sys
import os

# Absolute path of the directory two levels above the current working
# directory; used as the import root for the project's own packages.
project_root = os.path.abspath("../..")

# Register the root exactly once so re-running this cell does not keep
# growing sys.path.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [2]:
from datapipeline.utils.spark_session import get_spark_session

# Obtain the Spark session for this notebook from the project helper,
# passing "Silver_CombineDeltas" as its argument (presumably the app name;
# verify against get_spark_session).
spark = get_spark_session("Silver_CombineDeltas")

# Session-level settings applied before any table is read or written:
#   * Delta schema auto-merge is switched on.
#   * The vectorized Parquet reader is switched off.
for conf_key, conf_value in (
    ("spark.databricks.delta.schema.autoMerge.enabled", "true"),
    ("spark.sql.parquet.enableVectorizedReader", "false"),
):
    spark.conf.set(conf_key, conf_value)

In [3]:
# Locations of the three bronze Delta tables (relative to the notebook's
# working directory).
newsapi_path = "../../sanewsstorage/bronze/deltatables/newsapi_delta"
newsdata_path = "../../sanewsstorage/bronze/deltatables/newsdata_delta"
newsdatahub_path = "../../sanewsstorage/bronze/deltatables/newsdatahub_delta"

# Open each bronze table as a DataFrame; the unpacking order matches the
# order of the paths in the tuple.
newsapi_df, newsdata_df, newsdatahub_df = (
    spark.read.load(bronze_path, format="delta")
    for bronze_path in (newsapi_path, newsdata_path, newsdatahub_path)
)

In [4]:
from pyspark.sql.functions import col, to_timestamp, to_utc_timestamp, lit, array, when

# Project the NewsAPI bronze table onto the common silver article schema
# (the same 14 columns, in the same order, as the other two sources).
newsapi_silver = (
    newsapi_df
    # This source's article_id is taken from its url column.
    .withColumn("article_id", col("url"))
    .withColumn(
        "published_at",
        to_utc_timestamp(
            to_timestamp("publishedAt"),
            "UTC"
        )
    )
    # Fields this block does not read from bronze are emitted as typed NULLs
    # so the union with the other sources lines up on type.
    .withColumn("language", lit(None).cast("string"))
    .withColumn("keywords", lit(None).cast("array<string>"))
    .withColumn("categories", lit(None).cast("array<string>"))
    # Wrap the single author in an array to match the array-typed creator
    # column. A missing author stays NULL instead of becoming [NULL], which
    # is consistent with how the other absent fields above are represented.
    .withColumn(
        "creator",
        when(col("author").isNotNull(), array(col("author")))
    )
    .withColumn("ingestion_source", lit("newsapi"))
    .withColumnRenamed("hash", "bronze_hash")
    .select(
        "article_id","title","description","content",
        "published_at","language","url",
        "keywords","categories","creator",
        "source_id","source_name",
        "ingestion_source","bronze_hash"
    )
)

In [5]:
# Project the NewsData bronze table onto the common silver article schema.
# Everything is expressed as one projection: columns that already carry the
# silver name pass through, the rest are derived and aliased in place.
newsdata_silver = (
    newsdata_df
    .withColumnRenamed("hash", "bronze_hash")
    .select(
        "article_id",
        "title",
        "description",
        "content",
        to_utc_timestamp(to_timestamp("pubDate"), "UTC").alias("published_at"),
        "language",
        col("link").alias("url"),
        "keywords",
        col("category").alias("categories"),
        "creator",
        "source_id",
        "source_name",
        lit("newsdata").alias("ingestion_source"),
        "bronze_hash",
    )
)


In [6]:
from pyspark.sql.functions import when

# Project the NewsDataHub bronze table onto the common silver article schema.
newsdatahub_silver = (
    newsdatahub_df
    .withColumn("article_id", col("id"))
    .withColumn(
        "published_at",
        to_utc_timestamp(
            to_timestamp("pub_date"),
            "UTC"
        )
    )
    .withColumn("url", col("article_link"))
    .withColumn("categories", col("topics"))
    # Wrap the single creator in an array to match the array-typed creator
    # column of the other sources. A missing creator stays NULL instead of
    # becoming the one-element array [NULL].
    .withColumn(
        "creator",
        when(col("creator").isNotNull(), array(col("creator")))
    )
    # keywords is passed through unchanged by the select below. No "country"
    # column is derived because the silver projection does not include one.
    .withColumn("ingestion_source", lit("newsdatahub"))
    .withColumnRenamed("hash", "bronze_hash")
    .select(
        "article_id","title","description","content",
        "published_at","language","url",
        "keywords","categories","creator",
        "source_id","source_title",
        "ingestion_source","bronze_hash"
    )
    # Renamed after the projection so the result exposes source_name like
    # the other two sources.
    .withColumnRenamed("source_title","source_name")
)

In [7]:
# Stack the three per-source frames into one, matching columns by name.
# Starts from the NewsAPI frame and appends the remaining sources in order.
silver_union = newsapi_silver
for source_frame in (newsdata_silver, newsdatahub_silver):
    silver_union = silver_union.unionByName(source_frame, allowMissingColumns=True)

In [8]:
# Keep a single row per url across all sources. Which of several rows with
# the same url survives is not specified by this call.
silver_dedup = silver_union.drop_duplicates(["url"])

In [9]:
# Row count of the de-duplicated frame; count() is a Spark action, so this
# runs the read/union/dedup plan built in the cells above.
silver_dedup.count()

27663

In [43]:
from delta.tables import DeltaTable
import os

# Target location of the unified silver articles table.
silver_path = "../../sanewsstorage/silver/unified_news/deltatables/articles_silver"

if DeltaTable.isDeltaTable(spark, silver_path):
    # Incremental load: insert only the articles not yet present in silver.
    silver_delta = DeltaTable.forPath(spark, silver_path)

    # Match on url, the same key the batch was de-duplicated on. Matching on
    # bronze_hash would disagree with that key: dropDuplicates(["url"]) may
    # keep a different source row (with a different hash) for the same url on
    # a later run, and that row would then be inserted as a second copy.
    # `<=>` is null-safe equality, so a row with a NULL url is also matched
    # rather than re-inserted on every run.
    (
        silver_delta.alias("t")
        .merge(
            silver_dedup.alias("s"),
            "t.url <=> s.url"
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

else:
    # First load: no Delta table exists at the path yet, so create it from
    # the full de-duplicated batch.
    (
        silver_dedup.write
        .format("delta")
        .mode("overwrite")
        .save(silver_path)
    )


In [44]:
# Read the silver table back from storage so the cells below inspect what
# was actually persisted.
silver_df = spark.read.load(silver_path, format="delta")

In [47]:
# Print the first five rows of the persisted silver table.
silver_df.show(n=5)

+--------------------+-------------------------------------+----------------------------+--------------------+-------------------+--------+--------------------+--------+----------+---------------+---------+--------------+----------------+--------------------+
|          article_id|                                title|                 description|             content|       published_at|language|                 url|keywords|categories|        creator|source_id|   source_name|ingestion_source|         bronze_hash|
+--------------------+-------------------------------------+----------------------------+--------------------+-------------------+--------+--------------------+--------+----------+---------------+---------+--------------+----------------+--------------------+
|http://9to5google...|                 Samsung is confid...|        Insider reports i...|Insider reports i...|2026-02-03 20:44:18|    NULL|http://9to5google...|    NULL|      NULL|[Andrew Romero]|     NULL|9to5google.com

In [46]:
from pyspark.sql.functions import col, concat_ws

# The CSV writer cannot store array columns, so each array-typed column is
# collapsed into one comma-separated string first.
df_fixed = silver_df
for array_column in ("keywords", "categories", "creator"):
    df_fixed = df_fixed.withColumn(array_column, concat_ws(",", col(array_column)))

# Write a 20-row sample as a single CSV part file with a header row.
# NOTE(review): limit(20) is applied without an ordering, so these are not
# necessarily the "top" 20 by any criterion. Confirm that is acceptable.
(
    df_fixed
    .limit(20)
    .coalesce(1)
    .write
    .option("header", "true")
    .mode("overwrite")
    .csv("top20_csv")
)

In [10]:
# Stop the Spark session created at the top of the notebook.
spark.stop()