In [1]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# make it importable so project packages can be found from this notebook.
project_root = os.path.abspath("../..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Point both the PySpark driver and worker interpreter settings at the
# Python executable running this kernel.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)


In [2]:
from datapipeline.utils.spark_session import get_spark_session

spark = get_spark_session("Gold_ML_Aggregate")

# Session-level settings, applied in order: turn on Delta schema auto-merge
# and turn off the vectorized Parquet reader.
for _conf_key, _conf_value in (
    ("spark.databricks.delta.schema.autoMerge.enabled", "true"),
    ("spark.sql.parquet.enableVectorizedReader", "false"),
):
    spark.conf.set(_conf_key, _conf_value)

In [4]:
# Location of the silver-layer articles Delta table (relative to the working directory).
silver_path = "../../sanewsstorage/silver/unified_news/deltatables/articles_silver"

# Read the silver table; every later cell derives its columns from this frame.
gold_df = spark.read.load(silver_path, format="delta")

In [5]:
from pyspark.sql.functions import col, concat_ws

# Join title, description and content with single spaces into one text column
# that is later passed to the language detector.
gold_df = gold_df.withColumn(
    "clean_text",
    concat_ws(" ", *map(col, ("title", "description", "content"))),
)

In [11]:
import fasttext

model_path = "../../models/lid.176.bin"


def detect_language(text):
    """Predict the language of ``text`` with the fastText model at ``model_path``.

    The model is loaded on first use and cached as the ``model`` attribute of
    this function, so a given Python process loads it at most once.

    Parameters
    ----------
    text : str or None
        Text to classify. Leading/trailing whitespace is ignored and line
        breaks are replaced by spaces before prediction.

    Returns
    -------
    str or None
        The top-1 predicted label with its ``__label__`` prefix removed, or
        ``None`` when ``text`` is ``None``, blank, or the model returns no
        label for it.
    """
    if text is None:
        return None

    text = text.strip()
    if not text:
        return None

    # Flatten line breaks so the model receives a single line of text.
    text = text.replace("\n", " ").replace("\r", " ")

    if not hasattr(detect_language, "model"):
        detect_language.model = fasttext.load_model(model_path)

    # predict() returns (labels, probabilities); only the labels are needed.
    labels = detect_language.model.predict(text, k=1)[0]

    # Guard against an empty prediction: indexing [0] would raise IndexError
    # and fail the whole Spark task instead of yielding a null for this row.
    if len(labels) == 0:
        return None

    return labels[0].replace("__label__", "")

In [13]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the Python detector as a Spark UDF that yields a string column.
detect_lang_udf = udf(f=detect_language, returnType=StringType())

In [14]:
# Apply the language-detection UDF to the concatenated article text.
gold_df = gold_df.withColumn("language_detected", detect_lang_udf(col("clean_text")))

In [15]:
from pyspark.sql.functions import when

# Keep the source-provided `language` when it is present and non-empty;
# otherwise fall back to the detected language.
# NOTE(review): the sample output in this notebook shows `language` holding
# full names (e.g. "albanian") while `language_detected` holds short codes
# (e.g. "sq"), so `language_final` mixes both formats - confirm whether a
# normalization step is wanted.
gold_df = gold_df.withColumn(
    "language_final",
    when(
        col("language").isNotNull() & (col("language") != ""),
        col("language"),
    ).otherwise(col("language_detected")),
)

In [3]:
import sys
# Debug check: print the SparkContext's `pythonExec` value (the interpreter
# path shown in the output below).
print(spark.sparkContext.pythonExec)

c:\Users\Echelon\Desktop\re\sa-news\venv\Scripts\python.exe


In [16]:
# Preview the provided, detected and resolved language columns side by side
# (first 20 rows, values not truncated).
gold_df.select("language", "language_detected", "language_final").show(
    n=20, truncate=False
)

+--------+-----------------+--------------+
|language|language_detected|language_final|
+--------+-----------------+--------------+
|NULL    |en               |en            |
|NULL    |ja               |ja            |
|NULL    |ja               |ja            |
|NULL    |ja               |ja            |
|NULL    |ja               |ja            |
|NULL    |ja               |ja            |
|NULL    |ja               |ja            |
|NULL    |ja               |ja            |
|albanian|sq               |albanian      |
|albanian|sq               |albanian      |
|albanian|sq               |albanian      |
|NULL    |en               |en            |
|NULL    |en               |en            |
|NULL    |en               |en            |
|NULL    |en               |en            |
|NULL    |en               |en            |
|NULL    |en               |en            |
|NULL    |en               |en            |
|bengali |bn               |bengali       |
|bengali |bn               |bengali       |
+--------+-----------------+--------------+

In [17]:
from delta.tables import DeltaTable

# Destination of the gold-layer Delta table (relative to the working directory).
gold_lang_path = "../../sanewsstorage/gold/articles_lang"

if not DeltaTable.isDeltaTable(spark, gold_lang_path):
    # No Delta table at the destination yet: write the full frame.
    gold_df.write.save(gold_lang_path, format="delta", mode="overwrite")
else:
    # Table already exists: upsert keyed on `bronze_hash` - matched rows are
    # updated with all source columns, unmatched source rows are inserted.
    delta_table = DeltaTable.forPath(spark, gold_lang_path)
    (
        delta_table.alias("t")
        .merge(gold_df.alias("s"), "t.bronze_hash = s.bronze_hash")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )


In [18]:
# Stop the Spark session now that the gold table has been written.
spark.stop()