In [1]:
import sys
import os

# Absolute form of the directory two levels above the current working directory.
project_root = os.path.abspath("../..")

# Register it on the import path once, so re-running this cell adds no duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

# Point both PySpark interpreter variables at the interpreter running this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [2]:
from datapipeline.utils.spark_session import get_spark_session

# Obtain the Spark session from the project helper; the string is presumably the
# application name — TODO confirm against get_spark_session.
spark = get_spark_session("Gold_Entity_Topic")
# Enable Delta schema auto-merge for this session. NOTE(review): presumably needed
# so the MERGE into the final table can accept the columns added in this notebook.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

In [3]:
# Path of the enriched gold Delta table (relative, so it depends on the working directory).
gold_enriched_path = "../../sanewsstorage/gold/articles_enriched"
# Load that Delta table as the DataFrame every later cell builds on.
gold_df = spark.read.format("delta").load(gold_enriched_path)

In [4]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

In [5]:
# Alias -> canonical-name lookup, built from one (canonical, aliases) pair per
# entity so that adding a new spelling is a one-token change.
ENTITY_MAP = {
    alias: canonical
    for canonical, aliases in (
        ("United States", ("U.S.", "US", "USA")),
        ("United Kingdom", ("U.K.", "UK")),
    )
    for alias in aliases
}

In [6]:
def normalize_entities(entities, entity_map=None):
    """Replace known entity aliases with their canonical names.

    Args:
        entities: Iterable of entity records that support ``["entity"]`` and
            ``["label"]`` lookups (e.g. dicts), or ``None``.
        entity_map: Optional alias -> canonical-name mapping. When omitted,
            the notebook-level ``ENTITY_MAP`` is used.

    Returns:
        A list of ``{"entity": ..., "label": ...}`` dicts in input order.
        Entity text without an entry in the mapping is passed through
        unchanged. ``None`` input yields an empty list, and ``None``
        elements inside the input are skipped.
    """
    if entities is None:
        return []

    if entity_map is None:
        entity_map = ENTITY_MAP

    normalized = []

    for ent in entities:
        # A null element would raise TypeError on subscripting and fail the
        # whole UDF task; drop it instead.
        if ent is None:
            continue

        text = ent["entity"]

        normalized.append({
            "entity": entity_map.get(text, text),
            "label": ent["label"]
        })

    return normalized

In [7]:
# UDF return type: an array of structs with two nullable string fields,
# "entity" and "label".
norm_schema = ArrayType(
    StructType()
    .add("entity", StringType(), True)
    .add("label", StringType(), True)
)

# Spark-callable wrapper around normalize_entities.
normalize_udf = udf(normalize_entities, returnType=norm_schema)

In [8]:
# Add "entities_normalized" by applying the normalization UDF to the "entities" column.
gold_df = gold_df.withColumn(
    "entities_normalized",
    normalize_udf("entities")
)

In [9]:
# Topic -> trigger keywords. Insertion order matters: the classifier returns
# the first topic that has a matching keyword.
TOPIC_KEYWORDS = dict(
    Politics=["election", "government", "minister", "parliament"],
    Finance=["stock", "market", "investment", "bank"],
    Technology=["ai", "software", "tech", "startup"],
    Sports=["match", "tournament", "cricket", "football"],
    Health=["covid", "vaccine", "hospital", "health"],
)

In [10]:
def classify_topic(text, topic_keywords=None):
    """Assign a coarse topic label to a piece of text by keyword lookup.

    Topics are tried in the mapping's iteration order and the first topic with
    a matching keyword wins. Matching is case-insensitive and anchored at a
    word boundary, so a keyword only matches at the start of a word
    ("bank" matches "banking" but "stock" does not match "livestock").
    Keywords of three characters or fewer must match a whole word, so "ai"
    no longer fires on "said", "again" or "air".

    Args:
        text: Text to classify, or ``None``.
        topic_keywords: Optional mapping of topic name -> list of keywords.
            When omitted, the notebook-level ``TOPIC_KEYWORDS`` is used.

    Returns:
        The matching topic name, or ``"Other"`` when ``text`` is ``None`` or
        no keyword matches.
    """
    import re  # local import keeps the function self-contained for the UDF

    if text is None:
        return "Other"

    if topic_keywords is None:
        topic_keywords = TOPIC_KEYWORDS

    text_lower = text.lower()

    for topic, keywords in topic_keywords.items():
        for kw in keywords:
            # Plain substring search produced false positives inside
            # unrelated words; require the keyword to start a word.
            pattern = r"\b" + re.escape(kw.lower())
            if len(kw) <= 3:
                # Very short keywords are too ambiguous as prefixes.
                pattern += r"\b"
            if re.search(pattern, text_lower):
                return topic

    return "Other"

In [11]:
from pyspark.sql.types import StringType

# Expose the keyword classifier to Spark as a string-returning UDF.
topic_udf = udf(classify_topic, returnType=StringType())

# Label each row from its "clean_text" column.
gold_df = gold_df.withColumn("topic", topic_udf("clean_text"))

In [12]:
# TODO(review): "subtopic" was computed with the same UDF on the same input as
# "topic", so the two columns are always identical. Until a real subtopic
# classifier exists, copy the existing column instead of declaring a second
# Python UDF evaluation; the stored values are unchanged.
gold_df = gold_df.withColumn(
    "subtopic",
    gold_df["topic"]
)

In [13]:
# Preview the first 10 rows of the two derived columns, without truncating cell contents.
gold_df.select(
    "topic",
    "entities_normalized"
).show(10, truncate=False)

+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|topic     |entities_normalized                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+----------+----------------------------------------------------------------------

In [14]:
from delta.tables import DeltaTable

# Destination of the final gold Delta table.
gold_final_path = "../../sanewsstorage/gold/articles_final"

if not DeltaTable.isDeltaTable(spark, gold_final_path):
    # No Delta table at the destination yet: write the DataFrame out as one.
    gold_df.write.format("delta").mode("overwrite").save(gold_final_path)
else:
    # Table already exists: upsert keyed on bronze_hash. Matching rows are
    # fully updated, unmatched source rows are inserted.
    delta_table = DeltaTable.forPath(spark, gold_final_path)
    delta_table.alias("t").merge(
        gold_df.alias("s"),
        "t.bronze_hash = s.bronze_hash",
    ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [15]:
# Print the first 10 rows of the full DataFrame, using show()'s default truncation.
gold_df.show(10)

+--------------------+--------------------+--------------------+--------------------+-------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+-----------------+--------------+--------------------+--------------------+--------------------+----------+----------+
|          article_id|               title|         description|             content|       published_at|language|                 url|            keywords|          categories|             creator|           source_id|         source_name|ingestion_source|         bronze_hash|          clean_text|language_detected|language_final|            entities|           embedding| entities_normalized|     topic|  subtopic|
+--------------------+--------------------+--------------------+--------------------+-------------------+--------+--------------------+--------------------+--------