In [1]:
import sys
import os

# Resolve the repository root relative to the current working directory
# (two levels up) and make it importable, without adding it twice on re-run.
project_root = os.path.abspath("../..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Point both PySpark interpreter variables at the interpreter running this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [2]:
from datapipeline.utils.spark_session import get_spark_session
# Application name handed to the project's session helper.
spark_app_name = "ML_Clustering"
# NOTE(review): get_spark_session is project-local; it presumably returns a
# SparkSession (the result is used via `spark.read` later) — confirm in the helper.
spark = get_spark_session(spark_app_name)

In [3]:
from pyspark.sql.functions import lower, regexp_replace, col
from delta.tables import DeltaTable
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Storage locations, all anchored at project_root:
#   ml_cluster_path         - cluster assignments (input)
#   articles_path           - gold article table (input)
#   ml_cluster_labeled_path - labeled clusters (output)
ml_cluster_path, articles_path, ml_cluster_labeled_path = (
    os.path.join(project_root, relative_path)
    for relative_path in (
        "sanewsstorage/ml/clusters",
        "sanewsstorage/gold/articles_final",
        "sanewsstorage/ml/clusters_labeled",
    )
)

In [5]:
# Read the cluster assignments and the article table from their Delta paths.
cluster_df = spark.read.format("delta").load(ml_cluster_path)
articles_df = spark.read.format("delta").load(articles_path)

# Only the join key and the cleaned text are needed from the articles.
article_text_df = articles_df.select("bronze_hash", "clean_text")

# Left join: every cluster row is kept; clean_text is null wherever no
# article shares the row's bronze_hash.
joined_df = cluster_df.join(article_text_df, on="bronze_hash", how="left")

In [6]:
# Strip every character that is not an ASCII letter, digit, or whitespace,
# then lowercase the remainder.
non_alnum_pattern = "[^a-zA-Z0-9\\s]"
normalized_text = lower(regexp_replace(col("clean_text"), non_alnum_pattern, ""))

df_clean = joined_df.withColumn("text_processed", normalized_text)

# Collect just the two columns needed for keyword extraction to the driver.
pdf = df_clean.select("cluster_id", "text_processed").toPandas()

In [7]:
# Map each cluster_id to its top keywords (at most 10), ranked by mean TF-IDF
# score over the cluster's documents. Clusters with fewer than 3 usable texts,
# or whose texts yield no vocabulary, map to an empty list.
#
# NOTE(review): the vectorizer is fitted per cluster, so IDF is computed only
# within that cluster. The sample output in this notebook shows generic tokens
# ("chars", "plans", "paid", "available") dominating several labels — consider
# removing such boilerplate upstream.
cluster_keywords = {}

for cid in pdf["cluster_id"].unique():
    cluster_texts = pdf.loc[pdf["cluster_id"] == cid, "text_processed"]

    # Rows without a matching article carry a null text after the left join;
    # TfidfVectorizer cannot process None/NaN documents, so drop them, along
    # with texts that are empty after normalisation.
    cluster_texts = cluster_texts.dropna()
    cluster_texts = cluster_texts[cluster_texts.str.strip() != ""]

    # Too few real documents to produce meaningful keywords.
    if len(cluster_texts) < 3:
        cluster_keywords[cid] = []
        continue

    vectorizer = TfidfVectorizer(
        stop_words="english",
        max_features=50
    )

    try:
        X = vectorizer.fit_transform(cluster_texts)
    except ValueError:
        # Raised by scikit-learn when the vocabulary is empty, e.g. every
        # document consists solely of stop words. Treat as "no keywords"
        # instead of aborting the remaining clusters.
        cluster_keywords[cid] = []
        continue

    # Mean score per term; np.asarray(...).ravel() flattens the 1 x n_terms
    # result whether it is an np.matrix or a plain ndarray.
    scores = np.asarray(X.mean(axis=0)).ravel()
    terms = vectorizer.get_feature_names_out()

    # Indices of the 10 highest-scoring terms, best first.
    top_idx = scores.argsort()[-10:][::-1]
    cluster_keywords[cid] = [terms[i] for i in top_idx]

In [8]:
# One row per cluster: the label is the first three keywords, top_keywords
# is the full keyword list, both joined with ", ".
cluster_ids = list(cluster_keywords)
keyword_lists = list(cluster_keywords.values())

label_pdf = pd.DataFrame({
    "cluster_id": cluster_ids,
    "cluster_label": [", ".join(kws[:3]) for kws in keyword_lists],
    "top_keywords": [", ".join(kws) for kws in keyword_lists],
})

# Convert to a Spark DataFrame so it can be written to Delta.
label_spark_df = spark.createDataFrame(label_pdf)

In [9]:
# Upsert the labels into the labeled-clusters Delta table, creating the
# table on the first run.
labeled_table_exists = DeltaTable.isDeltaTable(spark, ml_cluster_labeled_path)

if not labeled_table_exists:
    # No Delta table at the target path yet: write the labels as the initial table.
    label_writer = label_spark_df.write.format("delta").mode("overwrite")
    label_writer.save(ml_cluster_labeled_path)
else:
    delta_table = DeltaTable.forPath(spark, ml_cluster_labeled_path)

    # Match on cluster_id: update rows that already exist, insert the rest.
    label_upsert = (
        delta_table.alias("t")
        .merge(label_spark_df.alias("s"), "t.cluster_id = s.cluster_id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    label_upsert.execute()

In [10]:
# Preview the first five label rows.
label_spark_df.show(n=5)

+----------+--------------------+--------------------+
|cluster_id|       cluster_label|        top_keywords|
+----------+--------------------+--------------------+
|        16|   chars, bowl, news|chars, bowl, news...|
|         9|plans, paid, avai...|plans, paid, avai...|
|         8|chars, available,...|chars, available,...|
|        14|available, plans,...|available, plans,...|
|        11|plans, paid, avai...|plans, paid, avai...|
+----------+--------------------+--------------------+
only showing top 5 rows



In [11]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be re-run after this without creating a new session.
spark.stop()