## 1. Tạo Class

In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
class BaseTransformer:
    """Shared Spark I/O helpers and column derivations used by the transformers.

    The instance only keeps the Spark session; every method takes the
    DataFrame it works on as an argument and returns a new DataFrame (or a
    plain Python value) without mutating its input.
    """

    def __init__(self, sparkSession):
        """Keep the session whose ``read`` attribute ``readData`` uses."""
        self.spark = sparkSession

    def readData(self, path, format="json", schema=None):
        """Load ``path`` with the batch reader and return the resulting DataFrame.

        Args:
            path: Location handed to ``reader.load``.
            format: Source format name passed to ``reader.format``.
            schema: Optional schema; applied only when truthy, otherwise the
                reader is left to determine the schema itself.
        """
        reader = self.spark.read.format(format)
        if schema:
            # Rebind instead of discarding the return value: the reader API is
            # chainable and the configured reader is whatever schema() returns.
            reader = reader.schema(schema)
        return reader.load(path)

    def writeData(self, df, path, format="json", dbPath=None, checkpointPath=None, mode="append", streaming=False):
        """Write ``df`` either as a batch save or as a streaming table sink.

        Args:
            df: DataFrame to write.
            path: Target passed to ``save`` (batch mode only).
            format: Sink format name.
            dbPath: Target table name passed to ``toTable`` (streaming only).
            checkpointPath: Value for the ``checkpointLocation`` option
                (streaming only). When omitted the option is not set at all.
            mode: Batch save mode, or streaming output mode when
                ``streaming`` is true.
            streaming: Selects ``df.writeStream`` instead of ``df.write``.

        Raises:
            ValueError: If ``streaming`` is true and ``dbPath`` is missing.

        Note:
            In streaming mode this call blocks in ``awaitTermination()`` until
            the query stops.
        """
        if streaming:
            if not dbPath:
                # Fail before touching Spark; toTable(None) only produces an
                # opaque error further down.
                raise ValueError("dbPath (target table) is required when streaming=True")
            writer = df.writeStream.format(format).outputMode(mode)
            if checkpointPath:
                # Never forward a None option value to the writer.
                writer = writer.option("checkpointLocation", checkpointPath)
            query = writer.toTable(dbPath)
            query.awaitTermination()
        else:
            df.write.format(format).mode(mode).save(path)

    def showShape(self, df):
        """Return ``(number of columns, number of rows)``.

        Note the order: columns first, rows second. ``df.count()`` is called,
        so this evaluates the DataFrame.
        """
        return (len(df.columns), df.count())

    def convertTimestamp(self, df, colName="created_utc", newCol="createdDate"):
        """Add ``newCol``: ``colName`` passed through ``from_unixtime`` and cast to timestamp."""
        return df.withColumn(
            newCol,
            F.from_unixtime(F.col(colName)).cast("timestamp")
        )

    def markAuthorDeleted(self, df, authorCol="author", nameAuthor="author_fullname", newCol="accDeleted"):
        """Add boolean ``newCol``: author equals "[deleted]" AND ``nameAuthor`` is null."""
        return df.withColumn(
            newCol,
            (F.col(authorCol) == "[deleted]") & F.col(nameAuthor).isNull()
        )

    def markBodyRemoved(self, df, body="selftext", newCol="postDeleted"):
        """Add boolean ``newCol``: the ``body`` column equals "[removed]"."""
        return df.withColumn(
            newCol,
            F.col(body) == "[removed]"
        )

In [3]:
class SubmissionTransformer(BaseTransformer):
    """Transformer for submission rows: timestamp, deletion flags and spam flag."""

    def __init__(self, sparkSession):
        super().__init__(sparkSession)

    def markSpamPost(self, df, titleCol="title", urlCol="url", domainCol="domain", subredditCol="subreddit", newCol="isPostSpam", spamThreshold=55):
        """Flag posts whose title words mostly reappear inside their URL.

        Adds two columns:
          * ``matchPercent`` - percentage of non-empty, lower-cased,
            space-separated title tokens that occur as a substring of the
            lower-cased URL (0.0 when the title has no tokens or is null).
          * ``newCol`` - true when ``matchPercent >= spamThreshold`` and the
            domain differs from ``"self." + subreddit``.

        Args:
            df: Input DataFrame containing the four referenced columns.
            titleCol, urlCol, domainCol, subredditCol: Column names to read.
            newCol: Name of the boolean output column.
            spamThreshold: Minimum match percentage that counts as spam.
        """
        loweredUrl = F.lower(F.col(urlCol))

        # Drop empty tokens (empty title, repeated spaces): an empty string is
        # a substring of every URL and would otherwise count as a match.
        words = F.filter(
            F.split(F.lower(F.col(titleCol)), " "),
            lambda word: word != "",
        )
        numMatched = F.size(F.filter(words, lambda word: loweredUrl.contains(word)))
        totalWord = F.size(words)

        # Guard the division: zero tokens would divide by zero, and size() of
        # a NULL array is -1 under the legacy (non-ANSI) setting, which would
        # turn a NULL title into a 100% match.
        matchPercent = F.when(
            totalWord > 0, (numMatched * 100) / totalWord
        ).otherwise(F.lit(0.0))

        isNotSelfPost = F.col(domainCol) != F.concat(F.lit("self."), F.col(subredditCol))
        isPostSpam = (matchPercent >= spamThreshold) & isNotSelfPost

        return df.withColumn("matchPercent", matchPercent).withColumn(newCol, isPostSpam)

    def transform(self, df):
        """Apply, in order: convertTimestamp, markAuthorDeleted, markBodyRemoved, markSpamPost."""
        df = self.convertTimestamp(df)
        df = self.markAuthorDeleted(df)
        df = self.markBodyRemoved(df)
        df = self.markSpamPost(df)
        return df

In [4]:
class GoldTransformer(BaseTransformer):
    """Builds dimension tables from the submission and comment DataFrames."""

    def __init__(self, sparkSession, dfSubmission, dfComment):
        """Keep both source DataFrames; each ``createDim*`` method reads from both."""
        super().__init__(sparkSession)
        self.dfSubmission = dfSubmission
        self.dfComment = dfComment

    def createDimTime(self, timestampCol="createdDate"):
        """Return one row per distinct timestamp found in either source.

        Output columns: ``time_key`` (the renamed ``timestampCol``), ``year``,
        ``month``, ``day``, ``hour`` and ``day_of_week`` (Spark's ``dayofweek``:
        1 = Sunday ... 7 = Saturday).
        """
        tsCol = F.col(timestampCol)
        # De-duplicate AFTER the union: a timestamp present in both sources
        # must still yield a single time_key row.
        dfTimestamp = (
            self.dfSubmission.select(timestampCol)
            .union(self.dfComment.select(timestampCol))
            .distinct()
        )
        dimTime = (
            dfTimestamp
            .withColumn("year", F.year(tsCol))
            .withColumn("month", F.month(tsCol))
            .withColumn("day", F.dayofmonth(tsCol))
            .withColumn("hour", F.hour(tsCol))
            .withColumn("day_of_week", F.dayofweek(tsCol))
            .withColumnRenamed(timestampCol, "time_key")
        )
        return dimTime

    def createDimAuthor(self, authorName="author", authorKey="author_fullname"):
        """Return the distinct (``author_name``, ``author_key``) pairs across both sources."""
        dimAuthor = (
            self.dfSubmission.select(authorName, authorKey)
            .union(self.dfComment.select(authorName, authorKey))
            .distinct()
            .withColumnRenamed(authorName, "author_name")
            .withColumnRenamed(authorKey, "author_key")
        )
        return dimAuthor



## 2. Spark Session

In [15]:
# Session handed to every transformer below; getOrCreate() reuses an
# already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("Transformer")
    .getOrCreate()
)

## 3. Transform Submission Silver

In [7]:
# Raw submission file read by the submission transformer.
pathSub = "../data/RS_reddit.jsonl"

# Explicit schema for the submission file; every field is declared nullable.
# NOTE(review): "edited" is declared boolean - confirm the raw data never
# carries a non-boolean value in that field.
rSubmissionBronzeSchema = StructType([
    StructField(fieldName, fieldType, True)
    for fieldName, fieldType in (
        ("id", StringType()),
        ("title", StringType()),
        ("selftext", StringType()),
        ("url", StringType()),
        ("permalink", StringType()),
        ("domain", StringType()),
        ("post_hint", StringType()),
        ("author", StringType()),
        ("author_fullname", StringType()),
        ("created_utc", LongType()),
        ("subreddit", StringType()),
        ("subreddit_id", StringType()),
        ("subreddit_name_prefixed", StringType()),
        ("subreddit_type", StringType()),
        ("subreddit_subscribers", IntegerType()),
        ("score", IntegerType()),
        ("num_comments", IntegerType()),
        ("total_awards_received", IntegerType()),
        ("edited", BooleanType()),
        ("locked", BooleanType()),
        ("spoiler", BooleanType()),
        ("over_18", BooleanType()),
        ("stickied", BooleanType()),
        ("retrieved_on", LongType()),
        ("is_original_content", BooleanType()),
        ("link_flair_text", StringType()),
    )
])


In [8]:
# Transformer instance used for every submission step below.
submission = SubmissionTransformer(spark)

In [9]:
# Load the raw submissions with the explicit schema (format defaults to "json").
dfSub = submission.readData(
    path=pathSub,
    schema=rSubmissionBronzeSchema,
)

In [13]:
# showShape returns (number of columns, number of rows).
# NOTE: the execution counts show the transform cell (In [12]) ran before this
# one (In [13]), so the recorded output includes the derived columns.
submission.showShape(dfSub)

(31, 7306)

In [14]:
# Preview two rows. The recorded output already contains the derived columns
# because the transform cell (In [12]) was executed before this cell (In [14]).
dfSub.show(2)

+-------+--------------------+---------+--------------------+--------------------+-------------------+---------+---------+---------------+-----------+----------+------------+-----------------------+--------------+---------------------+-----+------------+---------------------+------+------+-------+-------+--------+------------+-------------------+---------------+-------------------+----------+-----------+------------------+----------+
|     id|               title| selftext|                 url|           permalink|             domain|post_hint|   author|author_fullname|created_utc| subreddit|subreddit_id|subreddit_name_prefixed|subreddit_type|subreddit_subscribers|score|num_comments|total_awards_received|edited|locked|spoiler|over_18|stickied|retrieved_on|is_original_content|link_flair_text|        createdDate|accDeleted|postDeleted|      matchPercent|isPostSpam|
+-------+--------------------+---------+--------------------+--------------------+-------------------+---------+---------+--

In [12]:
# Apply the submission transform pipeline. This rebinds dfSub, so the
# untransformed frame is no longer reachable under this name afterwards.
dfSub = submission.transform(dfSub)

## 3. Transform Comment Silver

In [17]:
# Comments only need the shared helpers (readData, showShape, convertTimestamp).
# The submission-specific transform() reads title/url/selftext columns that the
# comment schema does not have, so use the base transformer here.
comment = BaseTransformer(spark)

In [18]:
# Raw comment file read by the comment transformer.
pathCmt = "../data/RC_reddit.jsonl"

# Explicit schema for the comment file; every field is declared nullable.
# NOTE(review): "edited" is declared boolean - confirm the raw data never
# carries a non-boolean value in that field.
rCommentBronzeSchema = StructType([
    StructField(fieldName, fieldType, True)
    for fieldName, fieldType in (
        ("id", StringType()),
        ("body", StringType()),
        ("created_utc", LongType()),
        ("edited", BooleanType()),
        ("score", IntegerType()),
        ("author", StringType()),
        ("author_fullname", StringType()),
        ("author_created_utc", LongType()),
        ("parent_id", StringType()),
        ("link_id", StringType()),
        ("is_submitter", BooleanType()),
        ("permalink", StringType()),
        ("subreddit", StringType()),
        ("subreddit_id", StringType()),
        ("subreddit_name_prefixed", StringType()),
        ("subreddit_type", StringType()),
        ("total_awards_received", IntegerType()),
        ("controversiality", IntegerType()),
        ("retrieved_on", LongType()),
        ("stickied", BooleanType()),
    )
])



In [20]:
# Load the raw comments with the explicit schema (format defaults to "json").
dfCmt = comment.readData(
    path=pathCmt,
    schema=rCommentBronzeSchema,
)

In [22]:
# showShape returns (number of columns, number of rows) for the comment frame.
comment.showShape(dfCmt)

(20, 65661)

In [25]:
# Preview two rows. The recorded output already contains createdDate because
# the conversion cell (In [24]) was executed before this cell (In [25]).
dfCmt.show(2)

+-------+--------------------+-----------+------+-----+---------+---------------+------------------+----------+----------+------------+--------------------+----------+------------+-----------------------+--------------+---------------------+----------------+------------+--------+-------------------+
|     id|                body|created_utc|edited|score|   author|author_fullname|author_created_utc| parent_id|   link_id|is_submitter|           permalink| subreddit|subreddit_id|subreddit_name_prefixed|subreddit_type|total_awards_received|controversiality|retrieved_on|stickied|        createdDate|
+-------+--------------------+-----------+------+-----+---------+---------------+------------------+----------+----------+------------+--------------------+----------+------------+-----------------------+--------------+---------------------+----------------+------------+--------+-------------------+
|mvc2u2o|From the article\...| 1748737579| false|    0| Gari_305|    t2_65fa26pr|              NU

In [24]:
# Derive createdDate from created_utc (the method's default column names).
# This rebinds dfCmt to the extended frame.
dfCmt = comment.convertTimestamp(df=dfCmt)

## 4. Gold Transform: DimTime and DimAuthor (test)

In [27]:
# Gold-layer builder fed with the submission and comment frames from above.
goldTransform = GoldTransformer(
    sparkSession=spark,
    dfSubmission=dfSub,
    dfComment=dfCmt,
)

In [28]:
# Time dimension built from the createdDate column of both source frames.
dimTime=goldTransform.createDimTime()

In [29]:
# (number of columns, number of rows) of the time dimension; calls count().
goldTransform.showShape(dimTime)

                                                                                

(6, 68404)

In [31]:
# Preview the time dimension (show() prints the first 20 rows by default).
dimTime.show()

+-------------------+----+-----+---+----+-----------+
|           time_key|year|month|day|hour|day_of_week|
+-------------------+----+-----+---+----+-----------+
|2025-06-02 06:55:46|2025|    6|  2|   6|          2|
|2025-06-02 16:35:56|2025|    6|  2|  16|          2|
|2025-06-03 23:13:52|2025|    6|  3|  23|          3|
|2025-06-06 03:20:51|2025|    6|  6|   3|          6|
|2025-06-06 18:57:36|2025|    6|  6|  18|          6|
|2025-06-06 22:36:02|2025|    6|  6|  22|          6|
|2025-06-07 08:18:52|2025|    6|  7|   8|          7|
|2025-06-10 01:57:39|2025|    6| 10|   1|          3|
|2025-06-10 02:44:55|2025|    6| 10|   2|          3|
|2025-06-11 18:59:08|2025|    6| 11|  18|          4|
|2025-06-11 21:17:25|2025|    6| 11|  21|          4|
|2025-06-13 08:42:31|2025|    6| 13|   8|          6|
|2025-06-13 14:58:06|2025|    6| 13|  14|          6|
|2025-06-14 18:57:48|2025|    6| 14|  18|          7|
|2025-06-16 19:38:33|2025|    6| 16|  19|          2|
|2025-06-10 23:53:39|2025|  

In [32]:
# Author dimension: distinct (author_name, author_key) pairs from both frames.
# Bind only dimAuthor so the time dimension built earlier stays in dimTime.
dimAuthor = goldTransform.createDimAuthor()

In [33]:
# Preview the author dimension (show() prints the first 20 rows by default).
dimAuthor.show()



+--------------------+-------------+
|         author_name|   author_key|
+--------------------+-------------+
|        Wolseley1870|t2_1q409u707y|
|DazzlingTelevision52|  t2_ad3mhj41|
|             mitousa|  t2_2qa94c49|
|       jluizsouzadev|  t2_8soz5t21|
|          Wescoast64|t2_1oo7c8683g|
|              somove|  t2_ih021hhb|
|    Ok-Educator-2319| t2_xpr1s5zqj|
| macrohard_certified|  t2_vvflmagc|
| firestorm-the-maker|t2_1h0jzaw75f|
|     TopEstimate5475|t2_1rrxu8cqh0|
|             rkasper|     t2_towds|
|      hthekiller6400|  t2_7vlg2wn0|
|Abject-Persimmon-221|t2_1ob62361lq|
|            RaMaZi16|t2_1r8f48tvws|
|         saurabhs619|     t2_qsr8g|
|       luxurarybrand|t2_1qvdnchwwq|
|          hendrix616|    t2_11oao4|
|     No_Alfalfa_4687|t2_1p643zezm6|
| Additional-Hour6038|t2_1k33zn6zns|
|       travisslayton|  t2_botj5nxx|
+--------------------+-------------+
only showing top 20 rows



                                                                                

In [35]:
# (number of columns, number of rows) of the author dimension; calls count().
goldTransform.showShape(dimAuthor)

(2, 34161)