In [24]:
import time
from pathlib import Path
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, LongType, IntegerType, DoubleType, BooleanType
import os
import logging
import warnings

# Suppress Python warnings
warnings.filterwarnings("ignore")

# Suppress Java/Spark logging
logging.getLogger("py4j").setLevel(logging.ERROR)
logging.getLogger("pyspark").setLevel(logging.ERROR)
logging.getLogger("org.apache.spark").setLevel(logging.ERROR)
logging.getLogger("org.apache.hadoop").setLevel(logging.ERROR)

# Also suppress Spark context logs
spark.sparkContext.setLogLevel("ERROR")

# Set JAVA_HOME to the Homebrew OpenJDK installation
os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@17/libexec/openjdk.jdk/Contents/Home"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Java 17+ compatibility flags for Spark
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--conf spark.driver.extraJavaOptions='--add-opens java.base/javax.security.auth=ALL-UNNAMED "
    "--add-opens java.base/java.lang=ALL-UNNAMED "
    "--add-opens java.base/java.lang.invoke=ALL-UNNAMED "
    "--add-opens java.base/java.lang.reflect=ALL-UNNAMED "
    "--add-opens java.base/java.io=ALL-UNNAMED "
    "--add-opens java.base/java.net=ALL-UNNAMED "
    "--add-opens java.base/java.nio=ALL-UNNAMED "
    "--add-opens java.base/java.util=ALL-UNNAMED "
    "--add-opens java.base/java.util.concurrent=ALL-UNNAMED "
    "--add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED "
    "--add-opens java.base/sun.nio.ch=ALL-UNNAMED "
    "--add-opens java.base/sun.nio.cs=ALL-UNNAMED "
    "--add-opens java.base/sun.security.action=ALL-UNNAMED "
    "--add-opens java.base/sun.util.calendar=ALL-UNNAMED ' "
    "--conf spark.executor.extraJavaOptions='--add-opens java.base/javax.security.auth=ALL-UNNAMED "
    "--add-opens java.base/java.lang=ALL-UNNAMED "
    "--add-opens java.base/java.lang.invoke=ALL-UNNAMED "
    "--add-opens java.base/java.lang.reflect=ALL-UNNAMED "
    "--add-opens java.base/java.io=ALL-UNNAMED "
    "--add-opens java.base/java.net=ALL-UNNAMED "
    "--add-opens java.base/java.nio=ALL-UNNAMED "
    "--add-opens java.base/java.util=ALL-UNNAMED "
    "--add-opens java.base/java.util.concurrent=ALL-UNNAMED "
    "--add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED "
    "--add-opens java.base/sun.nio.ch=ALL-UNNAMED "
    "--add-opens java.base/sun.nio.cs=ALL-UNNAMED "
    "--add-opens java.base/sun.security.action=ALL-UNNAMED "
    "--add-opens java.base/sun.util.calendar=ALL-UNNAMED ' pyspark-shell"
)


In [18]:
# Verify Java
os.system("java -version")

openjdk version "17.0.17" 2025-10-21
OpenJDK Runtime Environment Homebrew (build 17.0.17+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.17+0, mixed mode, sharing)


0

In [None]:
print("Initializing local Spark session...")
spark = SparkSession.builder \
    .appName("RedditEchoChamberLocal") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()
sc = spark.sparkContext
print(f"Spark session initialized. Running Spark version {sc.version}")

# Also suppress Spark context logs
spark.sparkContext.setLogLevel("ERROR")

Initializing local Spark session...
Spark session initialized. Running Spark version 4.0.1


In [20]:
spark.sparkContext._jvm.java.lang.System.getProperty("java.version")

'17.0.17'

In [21]:
# Define the local paths to your data
# The script assumes your 'data_cleaned' folder is in the same directory
data_dir = Path("./data_cleaned")
posts_path = str(data_dir / "*_posts.csv")
comments_path = str(data_dir / "*_comments.csv")

print(f"\nLoading posts from: {posts_path}")
print(f"Loading comments from: {comments_path}")

# Load the posts and comments into Spark DataFrames
# The wildcard (*) tells Spark to load all matching files into one DataFrame
try:
    posts_df = spark.read.csv(posts_path, header=True, inferSchema=True)
    comments_df = spark.read.csv(comments_path, header=True, inferSchema=True)
except Exception as e:
    print(f"Error loading data: {e}")
    raise

# Cache the DataFrames in memory for faster access
posts_df.cache()
comments_df.cache()

# Verify the data is loaded correctly
print("\nPosts DataFrame Schema and Count:")
posts_df.printSchema()
print(f"Total posts: {posts_df.count():,}")

print("\nComments DataFrame Schema and Count:")
comments_df.printSchema()
print(f"Total comments: {comments_df.count():,}")


Loading posts from: data_cleaned/*_posts.csv
Loading comments from: data_cleaned/*_comments.csv





Posts DataFrame Schema and Count:
root
 |-- id: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- url: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- is_self: string (nullable = true)
 |-- score: string (nullable = true)
 |-- num_comments: string (nullable = true)
 |-- upvote_ratio: string (nullable = true)
 |-- author: string (nullable = true)
 |-- stickied: string (nullable = true)
 |-- over_18: string (nullable = true)
 |-- permalink: string (nullable = true)

Total posts: 35,551

Comments DataFrame Schema and Count:
root
 |-- id: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- body: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- score: string (nullable = true)
 |-- controversiality: string (null

                                                                                

In [30]:
print("\n--- Performing 5+ Spark Transformations (Production-Ready Method) ---")

# --- Data Loading with NO Schema Inference ---
# Read all columns as strings to prevent any schema errors.
posts_df_string = spark.read.csv(posts_path, header=True, inferSchema=False)

# --- Explicit and Safe Casting using try_cast ---
# This is the most robust way to handle dirty data.
# We use F.expr() to invoke the powerful try_cast SQL function.
posts_transformed_df = posts_df_string.withColumn(
    # Safely cast created_utc to a number, then convert to timestamp
    "timestamp", F.to_timestamp(F.from_unixtime(F.expr("try_cast(created_utc AS LONG)")))
).withColumn(
    "score", F.expr("try_cast(score AS INT)")
).withColumn(
    "num_comments", F.expr("try_cast(num_comments AS INT)")
)


# 2. select: Create a more focused DataFrame
focused_posts_df = posts_transformed_df.select("subreddit", "author", "title", "score", "num_comments", "timestamp")

# 3. filter: Find all posts by a specific, known active author
print("\nFound posts by author 'hety0p':")
author_posts_df = focused_posts_df.filter(F.col("author") == "hety0p")
author_posts_df.show(5, truncate=False)

# 4. groupBy & agg: Find the average score and total comments per subreddit
# The agg functions (avg, sum) will automatically ignore the NULLs created by try_cast.
subreddit_stats_df = focused_posts_df.groupBy("subreddit").agg(
    F.avg("score").alias("avg_score"),
    F.sum("num_comments").alias("total_comments")
)

# 5. orderBy: Show the subreddits with the highest average score
print("\nSubreddit stats, ordered by average score:")
subreddit_stats_df.orderBy(F.col("avg_score").desc()).show()


--- Performing 5+ Spark Transformations (Production-Ready Method) ---

Found posts by author 'hety0p':
+---------+------+-----------------------------------------------------------------------------------------------+-----+------------+-------------------+
|subreddit|author|title                                                                                          |score|num_comments|timestamp          |
+---------+------+-----------------------------------------------------------------------------------------------+-----+------------+-------------------+
|politics |hety0p|Rubio will meet Mexico's president as Trump flexes military might in Latin America             |4    |3           |2025-09-03 11:52:02|
|politics |hety0p|Nancy Mace Fires Back at Claim She Hijacked Epstein Survivors’ Meeting                         |1    |0           |2025-09-03 12:15:52|
|politics |hety0p|If Trump loses his tariff lawsuit, America may have to refund businesses more than $200 billion|1    |0     

In [33]:
print("\n--- Performance Comparison: Average Score per Subreddit ---")

# --- Spark Performance ---
print("Running with PySpark...")
start_time_spark = time.time()

# THE FIX: Use the cleaned 'posts_transformed_df' instead of the raw 'posts_df'
# The 'score' column in this DataFrame has already been safely cast.
posts_transformed_df.groupBy("subreddit").agg(F.avg("score")).collect()

end_time_spark = time.time()
print(f"PySpark execution time: {end_time_spark - start_time_spark:.4f} seconds")

# --- Pandas Performance ---
print("\nRunning with Pandas...")
start_time_pandas = time.time()
# For a fair comparison, load all post CSVs into a single Pandas DataFrame
all_post_files = list(data_dir.glob("*_posts.csv"))
pd_df_list = [pd.read_csv(f) for f in all_post_files]
pd_posts_df = pd.concat(pd_df_list, ignore_index=True)

# Add a similar safe casting for Pandas to avoid errors and ensure a fair comparison
pd_posts_df['score'] = pd.to_numeric(pd_posts_df['score'], errors='coerce')
pd_posts_df.groupby("subreddit")["score"].mean()

end_time_pandas = time.time()
print(f"Pandas execution time: {end_time_pandas - start_time_pandas:.4f} seconds")

print("\nNote: For small datasets (< several GBs), Pandas is often faster due to Spark's overhead.")
print("The power of Spark becomes evident at big data scales.")


--- Performance Comparison: Average Score per Subreddit ---
Running with PySpark...
PySpark execution time: 0.3318 seconds

Running with Pandas...
Pandas execution time: 0.2027 seconds

Note: For small datasets (< several GBs), Pandas is often faster due to Spark's overhead.
The power of Spark becomes evident at big data scales.


In [34]:
spark.stop()
print("\n✅ Spark session stopped. Script finished.")


✅ Spark session stopped. Script finished.
