## 1. Tạo Class

In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class BaseTransformer:
    def __init__(self, sparkSession):
        self.spark = sparkSession
    def readData(self, path, format="json", schema=None):
        reader=self.spark.read.format(format)
        if schema:
            reader.schema(schema)
        df=reader.load(path)
        return df
    def writeData(self, df, path, format="json", dbPath=None, checkpointPath=None, mode="append", streaming=False):
        if streaming:
            writer = (
                df.writeStream
                .format(format)
                .option("checkpointLocation", checkpointPath)
                .outputMode(mode)
            )
            query = writer.toTable(dbPath)
            query.awaitTermination()
        else:
            (
                df.write
                .format(format)
                .mode(mode)
                .save(path)
            )
    def showShape(self, df):
        return (len(df.columns), df.count())

    def convertTimestamp(self, df, colName="created_utc", newCol="createdDate"):
        return df.withColumn(
            newCol,
            F.from_unixtime(F.col(colName)).cast("timestamp")
        )

    def markAuthorDeleted(self, df, authorCol="author", nameAuthor="author_fullname", newCol="accDeleted"):
        return df.withColumn(
            newCol,
            (F.col(authorCol) == "[deleted]") & F.col(nameAuthor).isNull()
        )

    def markBodyRemoved(self, df, body="selftext", newCol="postDeleted"):
        return df.withColumn(
            newCol,
            F.col(body) == "[removed]"
        )

In [3]:
class SubmissionTransformer(BaseTransformer):
    def __init__(self, sparkSession):
        super().__init__(sparkSession)

    def markSpamPost(self, df, titleCol="title", urlCol="url", domainCol="domain", subredditCol="subreddit", newCol="isPostSpam"):
        words = F.split(F.lower(F.col(titleCol)), " ")
        numMatched = F.size(
        F.expr(f"filter(split(lower({titleCol}), ' '), word -> instr(lower({urlCol}), word) > 0)")
        )        
        totalWord = F.size(words)
        matchPercent = (numMatched * 100) / totalWord
        isPostSpam = (matchPercent >= 55) & (
        F.col(domainCol) != F.concat(F.lit("self."), F.col(subredditCol))
        )
        
        return df.withColumn("matchPercent", matchPercent).withColumn(newCol, isPostSpam)        

    def transform(self, df):
        df = self.convertTimestamp(df)
        df = self.markAuthorDeleted(df) 
        df = self.markBodyRemoved(df)
        df = self.markSpamPost(df)
        return df

In [4]:
class CommentTransformer(BaseTransformer):
    def __init__(self, sparkSession):
        super().__init__(sparkSession)

    def normalizeParentId(self, df, parentIdCol="parent_id", newCol="parent_clean"):
        """Removes prefixes like 't3_' and 't1_' from parent_id."""
        return df.withColumn(
            newCol,
            F.regexp_replace(F.col(parentIdCol), r"^(t[13]_)", "")
        )

    def normalizeLinkId(self, df, linkIdCol="link_id", newCol="link_clean"):
        """Removes the 't3_' prefix from link_id."""
        return df.withColumn(
            newCol,
            F.regexp_replace(F.col(linkIdCol), r"^t3_", "")
        )
    
    def markModComments(self, df, authorCol="author", newColMod="deleted_by_mod", newColAutoMod="deleted_by_auto"):
        """Marks comments from Mod Team or AutoModerator."""
        df_with_mod = df.withColumn(
            newColMod,
            F.lower(F.col(authorCol)).like("%-modteam")
        )
        return df_with_mod.withColumn(
            newColAutoMod,
            F.col(authorCol) == "AutoModerator"
        )
    def transform(self, df):
        df = self.convertTimestamp(df)
        df = self.markAuthorDeleted(df) 
        df = self.markBodyRemoved(df, body="body", newCol="cmtDeleted")
        df = self.normalizeParentId(df)
        df = self.normalizeLinkId(df)
        df = self.markModComments(df)
        return df

In [5]:

# client = Client("dataguychill/sentiment")

# result = client.predict(
#     "hey, i love you!",    
#     api_name="/predict"        
# )

# print(result)


In [6]:
# spark = SparkSession.builder \
#     .appName("Test df") \
#     .getOrCreate()

# def predict_sentiment(text):
#     try:
#         client = Client("dataguychill/sentiment")
#         result = client.predict(text, api_name="/predict")
#         return result
#     except Exception as e:
#         return f"Error: {str(e)}"

# sentimentUdf = udf(predict_sentiment, StringType())

# data = [("suck the fuck",), ("I love uuu",), ("halo world",)]
# dfTest = spark.createDataFrame(data, ["body"])

# dfTest = dfTest.withColumn("sentiment", sentimentUdf(col("body")))

# dfTest.show(truncate=False)

# spark.stop()

NameError: name 'spark' is not defined

In [7]:
def predict_sentiment(text):
    try:
        client = Client("dataguychill/sentiment")
        result = client.predict(text, api_name="/predict")
        return result
    except Exception as e:
        return f"Error: {str(e)}"

sentimentUdf = udf(predict_sentiment, StringType())

In [67]:
class GoldTransformer(BaseTransformer):
    def __init__(self, sparkSession, dfSubmission, dfComment):
        super().__init__(sparkSession)
        self.dfSubmission = dfSubmission
        self.dfComment = dfComment
    
    def createDimTime(self,timestampCol="createdDate"):
        dfTimestamp=(self.dfSubmission.select(timestampCol).distinct().union(self.dfComment.select(timestampCol).distinct())).distinct()
        dimTime=(dfTimestamp.withColumn("year", F.year(F.col(timestampCol)))
                            .withColumn("month", F.month(F.col(timestampCol)))
                            .withColumn("day", F.dayofmonth(F.col(timestampCol)))
                            .withColumn("hour", F.hour(F.col(timestampCol)))
                            .withColumn("minute", F.minute(F.col(timestampCol))) 
                            .withColumn("second", F.second(F.col(timestampCol))) 
                            .withColumn("day_of_week", F.dayofweek(F.col(timestampCol)))
                            .withColumnRenamed("createdDate", "time_key"))
        return dimTime
        
    def createDimAuthor(self, authorName="author", fullName="author_fullname"):
        dimAuthor=(self.dfSubmission.select(authorName, fullName).distinct()
            .union(self.dfComment.select(authorName, fullName).distinct())
            .distinct()
            .withColumnRenamed("author_fullname", "author_key"))
        return dimAuthor

    def createDimSubreddit(self, subredditId="subreddit_id", subredditName="subreddit", subredditNamePrefixed="subreddit_name_prefixed", subredditType="subreddit_type"):
        dimSubreddit=(self.dfSubmission.select(
                                        F.col(subredditId),
                                        F.col(subredditName),
                                        F.col(subredditNamePrefixed),
                                        F.col(subredditType))
                                    .withColumnRenamed("subreddit_id", "subreddit_key")
                                      .distinct())
        return dimSubreddit

    def createDimPostType(self, postType="post_hint"):
        dimPostType=(self.dfSubmission.select(F.col(postType)).distinct()
            .withColumn("postType_key", F.monotonically_increasing_id()))
        return dimPostType
    
    def createDimSentiment(self):
        sentiment_labels = ["negative", "positive", "neutral"]
        data = [(label,) for label in sentiment_labels]
        sentiment_schema = StructType([StructField("sentiment_label", StringType(), True)])
        
        dimSentiment = (self.spark.createDataFrame(data, schema=sentiment_schema)
                        .withColumn("sentiment_key", F.monotonically_increasing_id()))
        return dimSentiment

    def createDimPost (self):
        dfSub = self.dfSubmission

        
        dfCmt=self.dfComment

        dfCmt = dfCmt.select("*").where((F.col("deleted_by_mod") == True) | (F.col("deleted_by_auto") == True)).dropDuplicates(["link_clean"])

        dimJoinRaw = (
            dfSub.join(dfCmt, dfSub["id"] == dfCmt["link_clean"], "left")
                  .select(
                      dfSub["id"],
                      dfSub["title"],
                      dfSub["selftext"],
                      dfSub["permalink"],
                      dfSub["url"],
                      dfSub["domain"],
                      dfSub["edited"],
                      dfSub["locked"],
                      dfSub["spoiler"],
                      dfSub["over_18"],
                      dfSub["stickied"],
                      dfSub["is_original_content"],
                      dfSub["link_flair_text"],
                      dfSub["accDeleted"],
                      dfSub["isPostSpam"],
                      dfSub["matchPercent"],
                      dfSub["postDeleted"],
                      dfCmt["deleted_by_mod"],
                      dfCmt["deleted_by_auto"]
                  )
        )                                                             
        dimPost=(dimJoinRaw.withColumn("postStatus",
                                         F.when(F.col("accDeleted")==True, F.lit("AuthorDeleted"))
                                          .when((F.col ("postDeleted")==True) & (F.col("accDeleted")==False), F.lit("SelfDeleted"))
                                         .when(F.col("deleted_by_mod")==True, F.lit("ModDeleted"))
                                         .when(F.col("deleted_by_auto")==True, F.lit("AutoDeleted"))
                                            .otherwise(F.lit("active")))
                                .withColumnRenamed("selftext", "body")
                                .drop("deleted_by_mod", "deleted_by_auto")
                                        .withColumnRenamed("id", "post_key"))
        


        return dimPost
    
    def createDimComment(self):
        dfCmt=self.dfComment
       
        
        dimComment=(dfCmt.select(F.col("id"), F.col("body"), F.col("permalink")
                                         , F.col("edited"), F.col("is_submitter")
                                         , F.col("controversiality"))
                                        # , F.col("sentiment_label"))
                                .withColumnRenamed("id", "comment_key"))

        
        return dimComment



    def createFactPostActivity(self,dimTime, dimAuthor, dimSubreddit, dimPostType, dimSentiment):
        
        dfSub = (self.dfSubmission
            .withColumn("text", F.concat_ws(" ", F.col("title"), F.col("selftext")))
            .withColumn("sentiment_label", sentimentUdf(F.col("text")))
        )

        dfPostActi = (dfSub.select(F.col("id"),  col("author_fullname"), col("createdDate"),
                                                col("subreddit"), col("post_hint"),
                                                col("score"), col("num_comments"),
                                               col("total_awards_received"),
                                               col("sentiment_label"),
                                              col("subreddit_subscribers")))
        
        factActi=dfPostActi.join(dimTime, dimTime.time_key==dfPostActi.createdDate,
                                 "left").drop("createdDate")
        factActi=factActi.join(dimAuthor, dimAuthor.author_key==factActi.author_fullname,
                               "left").drop("author_fullname")
        factActi=factActi.join(dimSubreddit, dimSubreddit.subreddit==factActi.subreddit, "left").drop("subreddit")
        factActi=factActi.join(dimPostType, dimPostType.post_hint==factActi.post_hint, "left").drop("post_hint")

        factActi=factActi.join(dimSentiment, dimSentiment.sentiment_label==factActi.sentiment_label, 
                              "left").drop("sentiment_label")

        factActi = (factActi.select(
            F.col("id"), col("time_key"), col("sentiment_key"), col("author_key"),col("subreddit_key"),col("postType_key"),
           col("score"),col("num_comments"),col("total_awards_received"),col("subreddit_subscribers"))
            .withColumnRenamed("id", "post_key")
            .withColumn("id", F.monotonically_increasing_id()))
        return factActi
        
    def createFactCommentActivity(self, dimTime, dimAuthor, dimSubreddit, dimPost, dimSentiment):
         
        dfCmt = self.dfComment.withColumn(
            "sentiment_label", sentimentUdf(F.col("body"))
        )

        
        dfCommentActi = (dfCmt.select(
            F.col("id"),col("author_fullname"),col("createdDate"),col("subreddit"),
            col("link_clean"),
            col("sentiment_label"),
            col("score"),
            col("controversiality"),
            col("total_awards_received")))

        factActi = dfCommentActi.join(dimTime,
            dimTime.time_key == dfCommentActi.createdDate,
            "left").drop("createdDate")
        factActi= factActi.join(dimAuthor,
            dimAuthor.author_key == factActi.author_fullname,
            "left").drop("author_fullname")
    
        factActi =factActi.join(dimSubreddit,
            dimSubreddit.subreddit == factActi.subreddit,
            "left").drop("subreddit")

        factActi =factActi.join(dimPost,
            dimPost.post_key == factActi.link_clean,
            "left").drop("link_clean")
        
        factActi = factActi.join(
            dimSentiment,
            dimSentiment.sentiment_label == factActi.sentiment_label,
            "left").drop("sentiment_label")
        

        factActi= (factActi.select(
            F.col("id"),
            F.col("time_key"),col("author_key"), col("sentiment_key"), col("subreddit_key"),
            col("post_key"),
            col("score"),col("controversiality"), col("total_awards_received"))
            .withColumnRenamed("id", "comment_key")
            .withColumn("id", F.monotonically_increasing_id()))

        return factActi


## 2. Spark Session

In [9]:

spark = SparkSession.builder.appName("Transformer").getOrCreate()

25/08/24 17:11:18 WARN Utils: Your hostname, Acer resolves to a loopback address: 127.0.1.1; using 192.168.1.9 instead (on interface wlp4s0)
25/08/24 17:11:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/24 17:11:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 3. Transform Submission Silver

In [10]:
pathSub = "../data/RStest.jsonl"
rSubmissionBronzeSchema = StructType([
    StructField("id", StringType(), True),
    StructField("title", StringType(), True),
    StructField("selftext", StringType(), True),
    StructField("url", StringType(), True),
    StructField("permalink", StringType(), True),
    StructField("domain", StringType(), True),
    StructField("post_hint", StringType(), True),
    StructField("author", StringType(), True),
    StructField("author_fullname", StringType(), True),
    StructField("created_utc", LongType(), True),
    StructField("subreddit", StringType(), True),
    StructField("subreddit_id", StringType(), True),
    StructField("subreddit_name_prefixed", StringType(), True),
    StructField("subreddit_type", StringType(), True),
    StructField("subreddit_subscribers", IntegerType(), True),
    StructField("score", IntegerType(), True),
    StructField("num_comments", IntegerType(), True),
    StructField("total_awards_received", IntegerType(), True),
    StructField("edited", BooleanType(), True),
    StructField("locked", BooleanType(), True),
    StructField("spoiler", BooleanType(), True),
    StructField("over_18", BooleanType(), True),
    StructField("stickied", BooleanType(), True),   
    StructField("retrieved_on", LongType(), True),
    StructField("is_original_content", BooleanType(), True),
    StructField("link_flair_text", StringType(), True)
])


In [11]:
submission = SubmissionTransformer(spark)

In [12]:
dfSub = submission.readData(pathSub, schema=rSubmissionBronzeSchema)

In [13]:
submission.showShape(dfSub)

(26, 5)

In [14]:
dfSub=submission.transform(dfSub)

In [15]:
submission.showShape(dfSub)

(31, 5)

In [16]:
dfSub.show()

25/08/24 17:11:24 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+---------------+---------------+-----------+-----------+------------+-----------------------+--------------+---------------------+-----+------------+---------------------+------+------+-------+-------+--------+------------+-------------------+---------------+-------------------+----------+-----------+------------------+----------+
|     id|               title|            selftext|                 url|           permalink|              domain|post_hint|         author|author_fullname|created_utc|  subreddit|subreddit_id|subreddit_name_prefixed|subreddit_type|subreddit_subscribers|score|num_comments|total_awards_received|edited|locked|spoiler|over_18|stickied|retrieved_on|is_original_content|link_flair_text|        createdDate|accDeleted|postDeleted|      matchPercent|isPostSpam|
+-------+--------------------+--------------------+--------------------+--------------

## 3. Transform Comment Silver

In [17]:
comment=CommentTransformer(spark)

In [18]:
pathCmt = "../data/RCtest.jsonl"

rCommentBronzeSchema = StructType([
    StructField("id", StringType(), True),
    StructField("body", StringType(), True),
    StructField("created_utc", LongType(), True),
    StructField("edited", BooleanType(), True),
    StructField("score", IntegerType(), True),
    StructField("author", StringType(), True),
    StructField("author_fullname", StringType(), True),
    StructField("author_created_utc", LongType(), True),
    StructField("parent_id", StringType(), True),
    StructField("link_id", StringType(), True),
    StructField("is_submitter", BooleanType(), True),
    StructField("permalink", StringType(), True),
    StructField("subreddit", StringType(), True),
    StructField("subreddit_id", StringType(), True),
    StructField("subreddit_name_prefixed", StringType(), True),
    StructField("subreddit_type", StringType(), True),
    StructField("total_awards_received", IntegerType(), True),
    StructField("controversiality", IntegerType(), True),
    StructField("retrieved_on", LongType(), True),
    StructField("stickied", BooleanType(), True)
])



In [19]:
dfCmt=comment.readData(pathCmt, schema=rCommentBronzeSchema)

In [20]:
comment.showShape(dfCmt)

(20, 10)

In [21]:
dfCmt=comment.transform(dfCmt)

In [22]:
comment.showShape(dfCmt)

(27, 10)

In [84]:
dfCmt.show()

+-------+--------------------+-----------+------+-----+-----------+---------------+------------------+----------+----------+------------+--------------------+-----------+------------+-----------------------+--------------+---------------------+----------------+------------+--------+-------------------+----------+----------+------------+----------+--------------+---------------+
|     id|                body|created_utc|edited|score|     author|author_fullname|author_created_utc| parent_id|   link_id|is_submitter|           permalink|  subreddit|subreddit_id|subreddit_name_prefixed|subreddit_type|total_awards_received|controversiality|retrieved_on|stickied|        createdDate|accDeleted|cmtDeleted|parent_clean|link_clean|deleted_by_mod|deleted_by_auto|
+-------+--------------------+-----------+------+-----+-----------+---------------+------------------+----------+----------+------------+--------------------+-----------+------------+-----------------------+--------------+----------------

## 4. Gold Transform DimTime DimAuthor test 

In [68]:
gold=GoldTransformer(spark, dfSub, dfCmt)


In [69]:
dfSub.show()

+-------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+---------------+---------------+-----------+-----------+------------+-----------------------+--------------+---------------------+-----+------------+---------------------+------+------+-------+-------+--------+------------+-------------------+---------------+-------------------+----------+-----------+------------------+----------+
|     id|               title|            selftext|                 url|           permalink|              domain|post_hint|         author|author_fullname|created_utc|  subreddit|subreddit_id|subreddit_name_prefixed|subreddit_type|subreddit_subscribers|score|num_comments|total_awards_received|edited|locked|spoiler|over_18|stickied|retrieved_on|is_original_content|link_flair_text|        createdDate|accDeleted|postDeleted|      matchPercent|isPostSpam|
+-------+--------------------+--------------------+--------------------+--------------

In [83]:
dfCmt.show()

+-------+--------------------+-----------+------+-----+-----------+---------------+------------------+----------+----------+------------+--------------------+-----------+------------+-----------------------+--------------+---------------------+----------------+------------+--------+-------------------+----------+----------+------------+----------+--------------+---------------+
|     id|                body|created_utc|edited|score|     author|author_fullname|author_created_utc| parent_id|   link_id|is_submitter|           permalink|  subreddit|subreddit_id|subreddit_name_prefixed|subreddit_type|total_awards_received|controversiality|retrieved_on|stickied|        createdDate|accDeleted|cmtDeleted|parent_clean|link_clean|deleted_by_mod|deleted_by_auto|
+-------+--------------------+-----------+------+-----+-----------+---------------+------------------+----------+----------+------------+--------------------+-----------+------------+-----------------------+--------------+----------------

In [71]:

dimTime = gold.createDimTime()
dimAuthor = gold.createDimAuthor()
dimSentiment = gold.createDimSentiment()
dimSubreddit = gold.createDimSubreddit()
dimPostType = gold.createDimPostType()
dimPost = gold.createDimPost()
dimComment = gold.createDimComment()


In [72]:
dimTime.show()

+-------------------+----+-----+---+----+------+------+-----------+
|           time_key|year|month|day|hour|minute|second|day_of_week|
+-------------------+----+-----+---+----+------+------+-----------+
|2025-06-01 07:07:25|2025|    6|  1|   7|     7|    25|          1|
|2025-06-01 07:25:33|2025|    6|  1|   7|    25|    33|          1|
|2025-06-01 07:27:41|2025|    6|  1|   7|    27|    41|          1|
|2025-06-01 07:27:18|2025|    6|  1|   7|    27|    18|          1|
|2025-06-01 07:30:13|2025|    6|  1|   7|    30|    13|          1|
|2025-06-01 07:41:10|2025|    6|  1|   7|    41|    10|          1|
|2025-06-01 07:31:13|2025|    6|  1|   7|    31|    13|          1|
|2025-06-01 07:27:29|2025|    6|  1|   7|    27|    29|          1|
|2025-06-01 07:45:53|2025|    6|  1|   7|    45|    53|          1|
|2025-06-01 07:36:00|2025|    6|  1|   7|    36|     0|          1|
|2025-06-01 07:40:59|2025|    6|  1|   7|    40|    59|          1|
|2025-06-01 07:26:19|2025|    6|  1|   7|    26|

In [73]:
dimAuthor.show()

+---------------+------------+
|         author|  author_key|
+---------------+------------+
|       Gari_305| t2_65fa26pr|
|Present_Fill412|t2_p5klht0fl|
|      katxwoods| t2_6h6wd5oz|
|  Businessmarck| t2_8mw6j6fe|
|       Black_RL|    t2_c0v9b|
|           dE3L|    t2_4c4pg|
|    Lord_Genius| t2_8ea9dgct|
|     S7ageNinja| t2_245empqs|
|      [deleted]|        NULL|
+---------------+------------+



In [74]:
dimSentiment.show()

+---------------+-------------+
|sentiment_label|sentiment_key|
+---------------+-------------+
|       negative|  25769803776|
|       positive|  60129542144|
|        neutral|  94489280512|
+---------------+-------------+



In [75]:
dimPostType.show()

+---------+------------+
|post_hint|postType_key|
+---------+------------+
|     link|           0|
|     NULL|           1|
+---------+------------+



In [76]:
dimSubreddit.show()

+-------------+-----------+-----------------------+--------------+
|subreddit_key|  subreddit|subreddit_name_prefixed|subreddit_type|
+-------------+-----------+-----------------------+--------------+
|     t5_2t7no| Futurology|           r/Futurology|        public|
|     t5_2qh16| technology|           r/technology|        public|
|      t5_2fwo|programming|          r/programming|        public|
+-------------+-----------+-----------------------+--------------+



In [77]:
dimComment.show()

+-----------+--------------------+--------------------+------+------------+----------------+
|comment_key|                body|           permalink|edited|is_submitter|controversiality|
+-----------+--------------------+--------------------+------+------------+----------------+
|    mvc2u2o|From the article\...|/r/Futurology/com...| false|        true|               0|
|    mvc30mg|Submission statem...|/r/Futurology/com...| false|        true|               1|
|    mvc3ikx|           [removed]|/r/Futurology/com...| false|       false|               0|
|    mvc3lwv|           [removed]|/r/Futurology/com...| false|       false|               0|
|    mvc3uix|           [removed]|/r/programming/co...| false|       false|               0|
|    mvc4ct7|We also shouldn’t...|/r/Futurology/com...| false|       false|               1|
|    mvc4f29|You have to be co...|/r/Futurology/com...| false|       false|               0|
|    mvc54or|I’d rather debate...|/r/Futurology/com...| false|       f

In [78]:
dimPost.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+------+------+-------+-------+--------+-------------------+---------------+----------+----------+------------------+-----------+-------------+
|post_key|               title|                body|           permalink|                 url|              domain|edited|locked|spoiler|over_18|stickied|is_original_content|link_flair_text|accDeleted|isPostSpam|      matchPercent|postDeleted|   postStatus|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+------+------+-------+-------+--------+-------------------+---------------+----------+----------+------------------+-----------+-------------+
| 1l0bhr6|            Someone?|           [removed]|/r/technology/com...|                    |                    | false| false|  false|  false|   false|              false|       Business|      true|     false|              

In [79]:

factPostActi = gold.createFactPostActivity(dimTime, dimAuthor, dimSubreddit, dimPostType, dimSentiment)
factCmtActi = gold. createFactCommentActivity(dimTime, dimAuthor, dimSubreddit, dimPost, dimSentiment)

In [80]:
factPostActi.show()

Loaded as API: https://dataguychill-sentiment.hf.space ✔            (0 + 1) / 1]
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
                                                                                

+--------+-------------------+-------------+------------+-------------+------------+-----+------------+---------------------+---------------------+---+
|post_key|           time_key|sentiment_key|  author_key|subreddit_key|postType_key|score|num_comments|total_awards_received|subreddit_subscribers| id|
+--------+-------------------+-------------+------------+-------------+------------+-----+------------+---------------------+---------------------+---+
| 1l0bhr6|2025-06-01 07:07:25|         NULL|        NULL|     t5_2qh16|        NULL|    1|           0|                    0|             19349025|  0|
| 1l0buai|2025-06-01 07:25:33|  60129542144| t2_65fa26pr|     t5_2t7no|           0|    0|           9|                    0|             21575272|  1|
| 1l0bvir|2025-06-01 07:27:18|  60129542144| t2_6h6wd5oz|     t5_2t7no|        NULL|  675|          45|                    0|             21575272|  2|
| 1l0bvr5|2025-06-01 07:27:41|  94489280512| t2_8mw6j6fe|     t5_2t7no|        NULL|    

In [81]:
factCmtActi.show()

Loaded as API: https://dataguychill-sentiment.hf.space ✔            (0 + 1) / 1]
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔
Loaded as API: https://dataguychill-sentiment.hf.space ✔


+-----------+-------------------+-----------+-------------+-------------+--------+-----+----------------+---------------------+---+
|comment_key|           time_key| author_key|sentiment_key|subreddit_key|post_key|score|controversiality|total_awards_received| id|
+-----------+-------------------+-----------+-------------+-------------+--------+-----+----------------+---------------------+---+
|    mvc2u2o|2025-06-01 07:26:19|t2_65fa26pr|  60129542144|     t5_2t7no| 1l0buai|    0|               0|                    0|  0|
|    mvc30mg|2025-06-01 07:27:29|t2_6h6wd5oz|  60129542144|     t5_2t7no| 1l0bvir|    2|               1|                    0|  1|
|    mvc3ikx|2025-06-01 07:30:38|       NULL|  60129542144|     t5_2t7no| 1l0bvir|    1|               0|                    0|  2|
|    mvc3lwv|2025-06-01 07:31:13|       NULL|  60129542144|     t5_2t7no| 1l0bvr5|    1|               0|                    0|  3|
|    mvc3uix|2025-06-01 07:32:45|       NULL|  60129542144|      t5_2fwo|   

                                                                                

In [82]:

dff.groupBy("subreddit_id") \
   .count() \
   .orderBy(F.col("count").desc()) \
   .show(200)

NameError: name 'dff' is not defined