In [1]:
import os
import shutil
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, LongType

# Spark MLlib Imports
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Locations used by the rest of the notebook (relative to the working directory).
DATA_DIR = "data_cleaned"
MODELS_DIR = "models"

# Spark settings for a single-machine run. They are applied one by one to the
# builder below, in this order.
spark_settings = {
    "spark.driver.memory": "4g",
    "spark.sql.shuffle.partitions": "100",
    "spark.default.parallelism": "100",
}

# local[*] asks Spark to run in-process using every available core.
session_builder = (
    SparkSession.builder
    .appName("Reddit_Comments_Model_Local")
    .master("local[*]")
)
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

# getOrCreate() reuses an already-running session if there is one.
spark = session_builder.getOrCreate()

print(f"Spark Version: {spark.version}")

print("‚úÖ Setup Complete.")

Spark Version: 3.5.0
‚úÖ Setup Complete.


In [2]:
# Glob that matches every "*_comments.csv" file under DATA_DIR.
comments_path = f"{DATA_DIR}/*_comments.csv"

print("Loading comments with multiLine support...")

# Read everything as strings (no schema inference). multiLine lets a quoted
# field span several physical lines, and a double quote is used as the escape
# character inside quoted fields.
df_raw = spark.read.csv(
    comments_path,
    header=True,
    inferSchema=False,
    multiLine=True,
    escape="\"",
)

# Whitelist of subreddit names; any row whose `subreddit` value is not in this
# list is discarded (this also removes rows where the CSV columns shifted).
valid_subreddits = [
    'Conservative', 'Libertarian', 'PoliticalDiscussion',
    'neutralnews', 'politics', 'socialism', 'worldnews'
]

# Individual row predicates, combined with AND below.
body_col = F.col("body")
has_body = body_col.isNotNull()
in_known_subreddit = F.col("subreddit").isin(valid_subreddits)
not_deleted = body_col != "[deleted]"
not_removed = body_col != "[removed]"
is_substantial = F.length(body_col) > 15  # strictly more than 15 characters

# Keep only the two columns the model needs; `body` is exposed as `text`
# because that is the input column name the pipeline expects.
df_clean = (
    df_raw
    .where(has_body & in_known_subreddit & not_deleted & not_removed & is_substantial)
    .select(body_col.alias("text"), F.col("subreddit"))
)

print("‚úÖ Data loaded and cleaned.")

Loading comments with multiLine support...
‚úÖ Data loaded and cleaned.


In [3]:
from functools import reduce
from pyspark.sql import DataFrame

# 1. Count rows per subreddit ONCE.
# df_clean is not cached, so every action on it re-reads and re-filters the
# CSV files. We therefore collect the per-class counts a single time and reuse
# them both for display and for computing the sampling fractions, instead of
# running a separate count() job for each subreddit.
counts_df = df_clean.groupBy("subreddit").count()
count_rows = counts_df.collect()
class_counts = {row["subreddit"]: row["count"] for row in count_rows}

print("Original counts per subreddit:")
# Re-wrap the collected rows so the table prints without another scan.
spark.createDataFrame(count_rows, schema=counts_df.schema).show()

# 2. Define the per-class cap.
# This is an upper bound only: a subreddit with fewer rows than the cap is kept
# in full, so the result is *capped*, not perfectly balanced, and holds at most
# TARGET_PER_CLASS * len(valid_subreddits) rows (280k for 40k * 7).
TARGET_PER_CLASS = 40000

# 3. Down-sample each over-represented subreddit towards the cap.
# sample() draws each row independently with the given probability, so the
# resulting size is close to, but not exactly, TARGET_PER_CLASS.
sampled_dfs = []
for sub in valid_subreddits:
    sub_df = df_clean.filter(F.col("subreddit") == sub)
    n_rows = class_counts.get(sub, 0)  # 0 if the subreddit has no rows at all
    if n_rows > TARGET_PER_CLASS:
        fraction = TARGET_PER_CLASS / n_rows
        sub_df = sub_df.sample(withReplacement=False, fraction=fraction, seed=42)
    # Subreddits at or below the cap are taken as-is.
    sampled_dfs.append(sub_df)

# 4. Union all per-subreddit frames together (same two columns in each).
df_balanced = reduce(DataFrame.union, sampled_dfs)

# 5. CACHE THE RESULT
# Later cells split and train on this frame repeatedly; caching avoids
# re-reading the CSV files for each of those passes. The count() below is the
# action that actually materializes the cache.
df_balanced.cache()
print("Materializing balanced dataset into RAM...")
total_count = df_balanced.count()
print(f"‚úÖ Final Training Set Size: {total_count:,} rows")

print("Balanced counts:")
df_balanced.groupBy("subreddit").count().show()

Original counts per subreddit:
+-------------------+------+
|          subreddit| count|
+-------------------+------+
|           politics|695286|
|          worldnews|233914|
|       Conservative| 44041|
|          socialism| 14494|
|        Libertarian| 16471|
|PoliticalDiscussion| 10699|
|        neutralnews|   686|
+-------------------+------+

Materializing balanced dataset into RAM...
‚úÖ Final Training Set Size: 162,509 rows
Balanced counts:
+-------------------+-----+
|          subreddit|count|
+-------------------+-----+
|       Conservative|39961|
|        Libertarian|16471|
|PoliticalDiscussion|10699|
|        neutralnews|  686|
|           politics|39935|
|          socialism|14494|
|          worldnews|40263|
+-------------------+-----+



In [4]:
print("--- Training Logistic Regression on Comments ---")

# 1. Split Data (80% train / 20% test, fixed seed for repeatability)
train_data, test_data = df_balanced.randomSplit([0.8, 0.2], seed=42)

# 2. Define Pipeline Stages
# StringIndexer: map each subreddit name to a numeric label index.
# handleInvalid="skip" drops rows whose subreddit was not seen during fit().
label_indexer = StringIndexer(inputCol="subreddit", outputCol="label", handleInvalid="skip")

# Fit the indexer exactly once. The fitted model is used both as the first
# pipeline stage and as the source of the label list for IndexToString, so the
# index -> name mapping is guaranteed to match the labels the classifier was
# trained on (and the training data is not scanned a second time for it).
indexer_model = label_indexer.fit(train_data)

# Tokenizer: Split text into words
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# StopWords: Remove "the", "and", etc.
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# HashingTF: hash each word into one of 10,000 term-frequency buckets.
# The vectors are sparse, which keeps the capped training set small enough for
# the 4g driver configured for this session.
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=10000)

# IDF: Weight rare words higher
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Classifier
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# LabelConverter: turn the numeric prediction back into a subreddit name.
label_converter = IndexToString(inputCol="prediction", outputCol="predicted_subreddit", labels=indexer_model.labels)

# Build the Pipeline.
# A Pipeline accepts already-fitted transformers as stages, so the fitted
# indexer_model is passed directly instead of re-fitting label_indexer.
pipeline = Pipeline(stages=[indexer_model, tokenizer, remover, hashingTF, idf, lr, label_converter])

# 3. Train
print("Training model... (This might take 1-2 minutes)")
model = pipeline.fit(train_data)
print("‚úÖ Training Complete.")

--- Training Logistic Regression on Comments ---
Training model... (This might take 1-2 minutes)
‚úÖ Training Complete.


In [5]:
# 1. Evaluate
# Score the held-out split and report plain accuracy (fraction of rows whose
# predicted label index equals the true label index).
print("Evaluating on Test Set...")
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"‚úÖ Comments Model Accuracy: {accuracy:.4f}")

# 2. Save Model
model_save_path = f"{MODELS_DIR}/spark_lr_comments_model"

# Let Spark replace any previous model at this path. overwrite() goes through
# the same filesystem Spark writes to, whereas a manual os.path.exists/rmtree
# only ever looks at the Python process's local filesystem.
model.write().overwrite().save(model_save_path)
print(f"üíæ Model saved to: {model_save_path}")

Evaluating on Test Set...
‚úÖ Comments Model Accuracy: 0.4343
üíæ Model saved to: models/spark_lr_comments_model


In [6]:
print("--- Verifying Model Loading ---")

# Step 1: reload the saved pipeline. Kept separate from the prediction step so
# that a failure here is the only thing reported as a *load* failure.
loaded_model = None
try:
    loaded_model = PipelineModel.load(model_save_path)
    print("‚úÖ Successfully loaded Comments model.")
except Exception as e:
    print(f"‚ùå Failed to load model: {e}")

# Step 2: run one sample through the reloaded pipeline (only if it loaded).
if loaded_model is not None:
    try:
        # Note: The column name must be 'text' because that's what we trained on!
        sample_text = [("I think the government spending is getting out of control.",)]
        sample_df = spark.createDataFrame(sample_text, ["text"])

        # The saved pipeline still contains the label indexer, so it needs a
        # `subreddit` column. The value must be a label the indexer knows:
        # with handleInvalid="skip" an unknown value makes the row disappear.
        sample_df = sample_df.withColumn("subreddit", F.lit("politics"))

        result = loaded_model.transform(sample_df)
        first_row = result.select("predicted_subreddit").first()
        if first_row is None:
            # first() returns None on an empty result, i.e. the row was dropped.
            print("‚ùå Model loaded, but the sample row was dropped before prediction.")
        else:
            prediction = first_row[0]
            print(f"   Test Prediction: {prediction}")
    except Exception as e:
        print(f"‚ùå Model loaded, but the test prediction failed: {e}")

--- Verifying Model Loading ---
‚úÖ Successfully loaded Comments model.
   Test Prediction: worldnews
