In [1]:
import findspark
findspark.init()

In [2]:
# Import required libraries
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import sys

In [3]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Sentiment Calculation Pandas UDF") \
    .config("spark.executor.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.minExecutors", "3") \
    .config("spark.dynamicAllocation.maxExecutors", "9") \
    .config("spark.dynamicAllocation.initialExecutors", "3") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.extraPythonPackages", "pandas,vaderSentiment") \
    .config("spark.driver.extraPythonPackages", "pandas,vaderSentiment") \
    .config("spark.executorEnv.PYTHONPATH", ":".join(sys.path)) \
    .getOrCreate()

# Set log level for cleaner outputs
sc = spark.sparkContext
sc.setLogLevel("INFO")


In [4]:
# HDFS path for the dataset
file_path = "hdfs://namenode:9000/data/cleaned_dataset.parquet"

# Read the Parquet file into a DataFrame
df = spark.read.parquet(file_path)

# Display schema and sample rows
df.printSchema()
df.show(5, truncate=False)

09:29:42.015 [Thread-4] INFO  org.apache.spark.sql.internal.SharedState - Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
09:29:42.022 [Thread-4] INFO  org.apache.spark.sql.internal.SharedState - Warehouse path is 'file:/home/ubuntu/project/cluster-notebooks/spark-warehouse'.
09:29:42.035 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - Adding filter to /SQL: org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
09:29:42.038 [Thread-4] INFO  org.sparkproject.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@7f98dda2{/SQL,null,AVAILABLE,@Spark}
09:29:42.038 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - Adding filter to /SQL/json: org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
09:29:42.039 [Thread-4] INFO  org.sparkproject.jetty.server.handler.ContextHandler - Started o.s.j.s.ServletContextHandler@63231786{/SQL/json,null,AVAILABLE,@Spark}
09:29:42.040 [Thread-4] INFO  org.apache.spark.ui.ServerInfo - A

[Stage 0:>                                                          (0 + 1) / 1]

09:29:45.350 [task-result-getter-0] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 0.0 (TID 0) in 1904 ms on datanode2 (executor 3) (1/1)
09:29:45.352 [task-result-getter-0] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 0.0, whose tasks have all completed, from pool 
09:29:45.360 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 0 (parquet at NativeMethodAccessorImpl.java:0) finished in 2.024 s
09:29:45.364 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 0 is finished. Cancelling potential speculative or zombie tasks for this job
09:29:45.364 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 0: Stage finished
09:29:45.366 [Thread-4] INFO  org.apache.spark.scheduler.DAGScheduler - Job 0 finished: parquet at NativeMethodAccessorImpl.java:0, took 2.087515 s


                                                                                

09:29:45.572 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_0_piece0 on namenode:38767 in memory (size: 47.0 KiB, free: 2004.6 MiB)
09:29:45.593 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_0_piece0 on datanode2:35035 in memory (size: 47.0 KiB, free: 912.3 MiB)
root
 |-- link: string (nullable = true)
 |-- date: long (nullable = true)
 |-- score: long (nullable = true)
 |-- sub_reddit: string (nullable = true)
 |-- post_id: string (nullable = true)
 |-- comment_id: string (nullable = true)
 |-- body: string (nullable = true)

09:29:46.376 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pushed Filters: 
09:29:46.377 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: 
09:29:46.733 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 182.217068 ms
09:29:4

[Stage 1:>                                                          (0 + 1) / 1]

09:29:47.832 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_1_piece0 in memory on datanode3:40289 (size: 45.1 KiB, free: 912.2 MiB)
09:29:49.280 [task-result-getter-1] INFO  org.apache.spark.scheduler.TaskSetManager - Finished task 0.0 in stage 1.0 (TID 1) in 2410 ms on datanode3 (executor 2) (1/1)
09:29:49.280 [task-result-getter-1] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Removed TaskSet 1.0, whose tasks have all completed, from pool 
09:29:49.282 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - ResultStage 1 (showString at NativeMethodAccessorImpl.java:0) finished in 2.445 s
09:29:49.282 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.DAGScheduler - Job 1 is finished. Cancelling potential speculative or zombie tasks for this job
09:29:49.282 [dag-scheduler-event-loop] INFO  org.apache.spark.scheduler.cluster.YarnScheduler - Killing all running tasks in stage 1: Stage finished
09:

                                                                                

09:29:49.981 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_2_piece0 on namenode:38767 in memory (size: 6.9 KiB, free: 2004.6 MiB)
09:29:49.992 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Removed broadcast_2_piece0 on datanode3:40289 in memory (size: 6.9 KiB, free: 912.3 MiB)
09:29:50.091 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 16.863731 ms
+----------------------------------------------------------------------------------------------------------------+----------+-----+---------------------+-------+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
# Function to clean comments
def clean_comment_spark(df, column):
    """Clean comments in the specified column."""
    return df.withColumn(
        f"{column}_clean",
        F.trim(
            F.regexp_replace(
                F.regexp_replace(
                    F.regexp_replace(
                        F.lower(F.col(column)),  # Convert to lowercase
                        r"http\S+|www\S+|https\S+", ""),  # Remove URLs
                    r"@\w+|#", ""),  # Remove mentions and hashtags
                r"[^\w\s]", ""),  # Remove special characters and punctuation
            )
        )


In [6]:
# Clean the comments and select relevant columns
df = clean_comment_spark(df, "body").select("comment_id", "body_clean")
df.show(5, truncate=False)

09:29:50.273 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Pushed Filters: 
09:29:50.273 [Thread-4] INFO  org.apache.spark.sql.execution.datasources.FileSourceStrategy - Post-Scan Filters: 
09:29:50.325 [Thread-4] INFO  org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator - Code generated in 35.398896 ms
09:29:50.329 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_3 stored as values in memory (estimated size 408.3 KiB, free 2003.8 MiB)
09:29:50.336 [Thread-4] INFO  org.apache.spark.storage.memory.MemoryStore - Block broadcast_3_piece0 stored as bytes in memory (estimated size 44.9 KiB, free 2003.7 MiB)
09:29:50.336 [dispatcher-BlockManagerMaster] INFO  org.apache.spark.storage.BlockManagerInfo - Added broadcast_3_piece0 in memory on namenode:38767 (size: 44.9 KiB, free: 2004.5 MiB)
09:29:50.337 [Thread-4] INFO  org.apache.spark.SparkContext - Created broadcast 3 from showString at NativeMethodAccessorImpl.java

In [7]:
# TODO: pip install pyspark-pandas!

# Define a Pandas UDF for sentiment analysis
@pandas_udf("float", PandasUDFType.SCALAR)
def calculate_sentiment_udf(body_clean):
    """Calculate sentiment score using VADER."""
    analyzer = SentimentIntensityAnalyzer()
    return body_clean.apply(lambda text: analyzer.polarity_scores(text)['compound'] if text else None)

ImportError: Pandas >= 1.0.5 must be installed; however, it was not found.

In [None]:
# Apply the UDF to calculate sentiment
df = df.withColumn("sentiment", calculate_sentiment_udf(F.col("body_clean")))

# Show a sample of the results
df.show(10, truncate=False)

In [None]:
# Write the results to HDFS in Parquet format
output_path = "hdfs://namenode:9000/data/results/sentiment_calculations.parquet"
df.write.mode("overwrite").parquet(output_path)
print(f"Results written to {output_path}")

In [None]:
# Read back the results and display a preview
result_df = spark.read.parquet(output_path)
result_df.show(10, truncate=False)
result_count = result_df.count()
print(f"Total records processed: {result_count}")

In [None]:
sc.stop()