In [1]:
import sys
import os

# Absolute path two directory levels above the current working directory;
# appended to sys.path (once) so project packages can be imported.
project_root = os.path.abspath("../..")
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Point both PySpark interpreter variables at the interpreter running this kernel.
for pyspark_env_var in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[pyspark_env_var] = sys.executable

In [2]:
from datapipeline.utils.spark_session import get_spark_session

# Obtain the Spark session from the project helper.
# NOTE(review): the string is presumably used as the Spark application name — confirm in get_spark_session.
session_name = "ML_Clustering"
spark = get_spark_session(session_name)

In [3]:
# Locations of the two Delta tables, both anchored on project_root.
ml_cluster_path = os.path.join(project_root, "sanewsstorage/ml/clusters")
articles_path = os.path.join(project_root, "sanewsstorage/gold/articles_final")

# Load each table with a fresh Delta reader.
cluster_df, articles_df = (
    spark.read.format("delta").load(table_path)
    for table_path in (ml_cluster_path, articles_path)
)

# Left join keeps every cluster row, even when no article shares its bronze_hash.
joined_df = cluster_df.join(articles_df, "bronze_hash", "left")

# Preview a few rows without truncating the text column.
preview_columns = ["bronze_hash", "cluster_id", "clean_text"]
joined_df.select(*preview_columns).show(n=5, truncate=False)

+--------------------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|bronze_hash         |cluster_id|clean_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+-

In [6]:
from pyspark.sql.functions import lower, regexp_replace, col

# Remove every character that is not an ASCII letter, digit or whitespace,
# then lowercase the result into a new "text_processed" column.
stripped_text = regexp_replace(col("clean_text"), "[^a-zA-Z0-9\\s]", "")
df_clean = joined_df.withColumn("text_processed", lower(stripped_text))

In [7]:
# Collect only the two columns needed for keyword extraction into pandas.
keyword_input_columns = ["cluster_id", "text_processed"]
pdf = df_clean.select(*keyword_input_columns).toPandas()

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Maps each cluster_id to up to 10 keywords, ordered by descending mean
# TF-IDF score over the cluster's documents. Clusters that cannot be
# vectorised map to an empty list.
cluster_keywords = {}

for cid in pdf["cluster_id"].unique():

    # The upstream left join can leave rows without text (NULL becomes
    # None/NaN in pandas); TfidfVectorizer rejects such documents, so drop
    # them before fitting. `.loc` avoids chained indexing.
    texts = pdf.loc[pdf["cluster_id"] == cid, "text_processed"].dropna()

    # Fewer than 3 usable documents: skip TF-IDF and record no keywords.
    if len(texts) < 3:
        cluster_keywords[cid] = []
        continue

    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=50
    )

    try:
        X = vectorizer.fit_transform(texts)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # every document is blank or contains only stop words after
        # cleaning. One such cluster must not abort the whole loop.
        cluster_keywords[cid] = []
        continue

    # Mean TF-IDF weight of each term across the cluster's documents.
    scores = X.mean(axis=0).A1
    terms = vectorizer.get_feature_names_out()

    # Indices of the (up to) 10 highest-scoring terms, best first.
    top_idx = scores.argsort()[-10:][::-1]

    cluster_keywords[cid] = [terms[i] for i in top_idx]

In [9]:
# Short label per cluster: its first three keywords joined with ", ".
# A cluster with no keywords gets an empty string.
cluster_labels = {}
for cluster_key, keyword_list in cluster_keywords.items():
    cluster_labels[cluster_key] = ", ".join(keyword_list[:3])

In [10]:
import pandas as pd

# One row per cluster: id, short label, and the full comma-separated keyword list.
labelled_ids = list(cluster_labels)
label_columns = {
    "cluster_id": labelled_ids,
    "cluster_label": [cluster_labels[cluster_key] for cluster_key in labelled_ids],
    "top_keywords": [", ".join(cluster_keywords[cluster_key]) for cluster_key in labelled_ids],
}
label_pdf = pd.DataFrame(label_columns)

# Hand the label table back to Spark so it can be joined with the article rows.
label_spark_df = spark.createDataFrame(label_pdf)

In [12]:
# Attach cluster_label / top_keywords to every joined row by cluster_id;
# the left join keeps rows whose cluster has no label entry.
final_df = joined_df.join(label_spark_df, "cluster_id", "left")

In [13]:
from delta.tables import DeltaTable

# Persist the labelled rows as a Delta table.
# The output path is anchored on project_root, the same way the input table
# paths are built, so reads and writes resolve against one absolute root
# instead of depending on the working directory at write time.
ml_cluster_labeled_path = os.path.join(
    project_root,
    "sanewsstorage/ml/clusters_labeled"
)

if DeltaTable.isDeltaTable(spark, ml_cluster_labeled_path):

    # Table already exists: insert only rows whose bronze_hash is not yet present.
    # NOTE(review): rows already in the table are left untouched, so their
    # cluster_id / cluster_label / top_keywords are not refreshed when labels
    # are recomputed — confirm that insert-only behaviour is intended.
    delta_table = DeltaTable.forPath(
        spark,
        ml_cluster_labeled_path
    )

    (
        delta_table.alias("t")
        .merge(
            final_df.alias("s"),
            "t.bronze_hash = s.bronze_hash"
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

else:

    # No Delta table at the path yet: write final_df as the initial contents.
    (
        final_df.write
        .format("delta")
        .mode("overwrite")
        .save(ml_cluster_labeled_path)
    )


In [14]:
# Preview the first five labelled rows (default column truncation applies).
final_df.show(n=5)

+----------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------+--------------------+-----------+--------------------+-------------------+---------+-------------------+----------------+--------------------+-----------------+--------------+--------------------+--------------------+--------------------+----------+----------+--------------------+--------------------+
|cluster_id|         bronze_hash|          article_id|               title|         description|             content|       published_at|language|                 url|   keywords|          categories|            creator|source_id|        source_name|ingestion_source|          clean_text|language_detected|language_final|            entities|           embedding| entities_normalized|     topic|  subtopic|       cluster_label|        top_keywords|
+----------+--------------------+--------------------+--------------------+--------------------+------