In [19]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import explode, split, col, array_contains, udf, expr
import matplotlib.pyplot as plt
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
import re
from pyspark.ml.regression import LinearRegression
import pandas as pd



In [20]:
spark = SparkSession.builder.appName("reddit_regression").getOrCreate()

In [21]:
reddit = spark.read.parquet("gs://msca-bdp-student-gcs/Group2_Final_Project/reddit_data/",header=True, inferSchema=True)

                                                                                

In [22]:
reddit = reddit.dropna()
reddit = reddit.select(['subreddit_name_prefixed', 'body', 'score'])
reddit = reddit.withColumnRenamed('subreddit_name_prefixed', 'subreddit')
reddit = reddit.withColumn("score", col("score").cast(IntegerType()))

In [23]:
def clean_text(text):
    # Deal with component words
    re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove Http / Https links in the text
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'https\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Handling repeated characters (more than 2)
    text = re.sub(r'(.)\1+', r'\1\1', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [24]:
clean_text_udf = udf(clean_text, StringType())
filtered_reddit = reddit.withColumn('body_cleaned', clean_text_udf(reddit['body']))

In [25]:
# Tokenization
tokenizer = Tokenizer(inputCol="body_cleaned", outputCol="words")

# Stopwords Removal
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Vectorization methods
cv = CountVectorizer(inputCol="filtered", outputCol="cv_features")
#hashingTF = HashingTF(inputCol="filtered", outputCol="hashingTF_features")
idf = IDF(inputCol="cv_features", outputCol="tfidf_features")

# Pipeline
vec_pipeline = Pipeline(stages=[tokenizer, remover, cv, idf])

In [26]:
reddit_vec = vec_pipeline.fit(filtered_reddit).transform(filtered_reddit)
reddit_vec = reddit_vec.dropna()
reddit_vec.show()

23/11/27 12:21:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/11/27 12:23:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.6 MiB
23/11/27 12:23:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/11/27 12:23:29 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/27 12:23:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB

+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|subreddit|                body|score|        body_cleaned|               words|            filtered|         cv_features|      tfidf_features|
+---------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    r/DIY|If this is a slid...|    5|if this is a slid...|[if, this, is, a,...|[sliding, door, c...|(262144,[112,165,...|(262144,[112,165,...|
|    r/DIY|###Please read th...|    1|please read this ...|[please, read, th...|[please, read, en...|(262144,[8,27,30,...|(262144,[8,27,30,...|
|    r/DIY|You're assumption...|    0|youre assumption ...|[youre, assumptio...|[youre, assumptio...|(262144,[14,1920,...|(262144,[14,1920,...|
|    r/DIY|It’s not just tha...|    6|its not just that...|[its, not, just, ...|[also, cost, k, p...|(262144,[0,6,13,1...|(262144,[0,6,1

                                                                                

In [None]:
train_df, test_df = reddit_vec.randomSplit([0.8, 0.2], 2023)
ideal_subreddit_list = [row['subreddit'] for row in reddit_vec.select(col('subreddit')).distinct().collect()]
subreddit_list = ['r/books', 'r/personalfinance', 'r/programming', 'r/relationship_advice', 'r/mildlyinteresting', 'r/YouShouldKnow', 'r/LifeProTips']
lr = LinearRegression(featuresCol="tfidf_features", labelCol="score")
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="score")

schema = StructType([
        StructField("subreddit", StringType(), True),
        StructField("Root Mean Squared Error", FloatType(), True),
        StructField("Mean Squared Error", FloatType(), True),
        StructField("Mean Absolute Error", FloatType(), True),
        StructField("R-squared", FloatType(), True)
    ])

output_table = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)


for subreddit in subreddit_list:
    train_data = train_df.filter(col('subreddit') == subreddit)
    test_data = test_df.filter(col('subreddit') == subreddit)
    
    lrModel = lr.fit(train_data)
    predictions = lrModel.transform(test_data)
    
    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
    
    new_rows = [Row(subreddit, rmse, mse, mae, r2)]
    new_rows_df = spark.createDataFrame(new_rows, output_table.schema)
    output_table = output_table.union(new_rows_df)

23/11/27 12:23:35 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/27 12:26:49 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.8 MiB
23/11/27 12:26:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/27 12:32:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/27 12:32:13 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/27 14:06:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.2 MiB
23/11/27 15:27:56 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.8 MiB
23/11/27 15:27:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.8 MiB
23/11/27 15:31:01 WARN org.apache.spark.deploy.yarn.YarnAllocator: Cannot find executorId for co

In [None]:
output_table.show()

+--------------------+-----------------------+------------------+-------------------+------------+
|           subreddit|Root Mean Squared Error|Mean Squared Error|Mean Absolute Error|   R-squared|
+--------------------+-----------------------+------------------+-------------------+------------+
|             r/books|              103.39324|         10690.162|          22.261074| -0.11800703|
|   r/personalfinance|               91.91888|          8449.079|           16.93279|-0.041326847|
|       r/programming|               58.76984|         3453.8943|          22.654121| -0.35712686|
|r/relationship_ad...|               96.04032|          9223.742|          19.174639| -0.01173983|
| r/mildlyinteresting|              231.99585|         53822.074|          31.362856|-0.028979968|
|     r/YouShouldKnow|              157.15237|          24696.87|           44.72835|  -0.3601451|
|       r/LifeProTips|                186.564|         34806.125|          31.248205| -0.08126477|
+---------

In [None]:
output_table_pd = output_table.toPandas()
output_path = 'gs://msca-bdp-student-gcs/Group2_Final_Project/model_metric/reddit_regression_metric.csv'
output_table_pd.to_csv(output_path, index = False)