In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import explode, split, col, array_contains, udf, expr
import matplotlib.pyplot as plt
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
import re
from pyspark.ml.regression import LinearRegression

In [2]:
spark = SparkSession.builder.appName("youtube_regression").getOrCreate()

In [3]:
youtube = spark.read.option("quote", "\"").option("escape", "\"").option("multiLine", True).option("ignoreLeadingWhiteSpace", True).csv('gs://msca-bdp-student-gcs/Group2_Final_Project/Copy_of_youtube_comments.csv', header=True, inferSchema=True)

                                                                                

In [4]:
youtube = youtube.dropna()
youtube = youtube.withColumn("Likes", col("Likes").cast(IntegerType()))
youtube = youtube.withColumn("tag", expr("substr(tag, 10, length(tag) - 13)"))

In [5]:
youtube.show()

+--------------------+--------------------+-----+----------------+--------------------+-----+--------------------+------------------+--------------------+-------------------+--------------------+-----+
|         Video Title|           Video URL|Views|Publication Date|         Description|Likes|         Channel URL|      Channel Name|       Channel Image|Channel Subscribers|            Comments|  tag|
+--------------------+--------------------+-----+----------------+--------------------+-----+--------------------+------------------+--------------------+-------------------+--------------------+-----+
|SELLING Comic Boo...|https://www.youtu...|  574|     4 hours ago|Please like this ...|   74|https://www.youtu...|StickyGoose Comics|https://yt3.ggpht...|              7.93K|It’s cuts both wa...|Books|
|SELLING Comic Boo...|https://www.youtu...|  574|     4 hours ago|Please like this ...|   74|https://www.youtu...|StickyGoose Comics|https://yt3.ggpht...|              7.93K|As a small time e.

                                                                                

In [6]:
youtube.describe().show()

[Stage 3:>                                                          (0 + 1) / 1]

+-------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+-------------------+-------------------+---------------+
|summary|         Video Title|           Video URL|             Views|    Publication Date|         Description|             Likes|         Channel URL|        Channel Name|       Channel Image|Channel Subscribers|           Comments|            tag|
+-------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+-------------------+-------------------+---------------+
|  count|             1252650|             1252650|           1252650|             1252650|             1252650|             10685|             1252650|             1252650|             1252650|            1252650|            1252650|        12526

                                                                                

In [7]:
def clean_text(text):
    # Deal with component words
    re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove Http / Https links in the text
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'https\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Handling repeated characters (more than 2)
    text = re.sub(r'(.)\1+', r'\1\1', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [8]:
# Tokenization
tokenizer = Tokenizer(inputCol="Comments_cleaned", outputCol="words")

# Stopwords Removal
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Vectorization methods
cv = CountVectorizer(inputCol="filtered", outputCol="cv_features")
#hashingTF = HashingTF(inputCol="filtered", outputCol="hashingTF_features")
idf = IDF(inputCol="cv_features", outputCol="tfidf_features")

# Pipeline
vec_pipeline = Pipeline(stages=[tokenizer, remover, cv, idf])

In [9]:
youtube = youtube.select(['Likes', 'Comments', 'tag'])

In [10]:
clean_text_udf = udf(clean_text, StringType())
filtered_youtube = youtube.withColumn('Comments_cleaned', clean_text_udf(youtube['Comments']))

In [11]:
filtered_youtube.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-----+--------------------+-----+--------------------+
|Likes|            Comments|  tag|    Comments_cleaned|
+-----+--------------------+-----+--------------------+
|   74|It’s cuts both wa...|Books|its cuts both way...|
|   74|As a small time e...|Books|as a small time e...|
|   74|I was sent an emp...|Books|i was sent an emp...|
|   74|eBay is like loan...|Books|ebay is like loan...|
|   74|Hey man!   Videos...|Books|hey man videos li...|
|   74|After selling on ...|Books|after selling on ...|
|   74|About ebay. It ac...|Books|about ebay it act...|
|   74|eBay has its mome...|Books|ebay has its mome...|
|   74|Leftover residue ...|Books|leftover residue ...|
|   74|“….leave everythi...|Books|leave everything ...|
|   74|Wow!  Sorry those...|Books|wow sorry those t...|
|   74|I don’t think it’...|Books|i dont think its ...|
|   74|Was there a pictu...|Books|was there a pictu...|
|   74|I rather sell my ...|Books|i rather sell my ...|
|   74|Selling ANYTHING ...|Books|selling anythi

                                                                                

In [12]:
youtube_vec = vec_pipeline.fit(filtered_youtube).transform(filtered_youtube)
youtube_vec = youtube_vec.dropna()
youtube_vec.show()

23/11/26 21:23:16 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.9 MiB
23/11/26 21:23:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
[Stage 12:>                                                         (0 + 1) / 1]

+-----+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|Likes|            Comments|  tag|    Comments_cleaned|               words|            filtered|         cv_features|      tfidf_features|
+-----+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|   74|It’s cuts both wa...|Books|its cuts both way...|[its, cuts, both,...|[cuts, ways, im, ...|(262144,[1,3,21,2...|(262144,[1,3,21,2...|
|   74|As a small time e...|Books|as a small time e...|[as, a, small, ti...|[small, time, eba...|(262144,[5,9,15,2...|(262144,[5,9,15,2...|
|   74|I was sent an emp...|Books|i was sent an emp...|[i, was, sent, an...|[sent, empty, env...|(262144,[13,18,30...|(262144,[13,18,30...|
|   74|eBay is like loan...|Books|ebay is like loan...|[ebay, is, like, ...|[ebay, like, loan...|(262144,[0,5,82,1...|(262144,[0,5,82,1...|
|   74|Hey man!   Vi

                                                                                

In [13]:
youtube_vec.describe().show()

23/11/26 21:24:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:25:39 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB


+-------+------------------+--------------------+---------------+----------------+
|summary|             Likes|            Comments|            tag|Comments_cleaned|
+-------+------------------+--------------------+---------------+----------------+
|  count|             10685|               10685|          10685|           10685|
|   mean|  560.349742629855|   45812.42857142857|           null|            null|
| stddev|275.80694615498874|    42745.4099323625|           null|            null|
|    min|                10|
Can you please m...|          Books|                |
|    max|               998|                  🫡|problem-solving|              zz|
+-------+------------------+--------------------+---------------+----------------+



                                                                                

In [None]:
train_df, test_df = youtube_vec.randomSplit([0.8, 0.2], 2023)
tag_list = [row['tag'] for row in youtube_vec.select(col('tag')).distinct().collect()]
lr = LinearRegression(featuresCol="tfidf_features", labelCol="Likes")
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="Likes")

for tag in tag_list:
    train_data = train_df.filter(col('tag') == tag)
    test_data = test_df.filter(col('tag') == tag)
    
    lrModel = lr.fit(train_data)
    predictions = lrModel.transform(test_data)
    
    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

    print(tag)
    print("RMSE: %f" % rmse)
    print("MSE: %f" % mse)
    print("MAE: %f" % mae)
    print("R2: %f" % r2)

23/11/26 21:38:34 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:40:03 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.8 MiB
23/11/26 21:40:04 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:41:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:43:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:43:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:43:02 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:43:02 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:43:02 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

problem-solving
RMSE: 253.346581
MSE: 64184.490147
MAE: 162.617452
R2: 0.021872


23/11/26 21:50:56 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:52:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:53:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:53:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:53:52 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:53:52 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:53:52 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:53:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 21:53:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

Relationship
RMSE: 252.269253
MSE: 63639.775935
MAE: 166.057463
R2: -0.221260


23/11/26 22:01:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:03:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:04:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:04:42 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:04:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:04:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:04:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:04:43 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:04:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

Financial Advice
RMSE: 299.617750
MSE: 89770.795969
MAE: 196.632835
R2: -0.410099


23/11/26 22:12:36 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:14:04 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:15:32 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:15:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:15:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:15:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:15:33 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:15:34 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:15:34 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

Books
RMSE: 393.025423
MSE: 154468.983445
MAE: 278.966827
R2: -0.932577


23/11/26 22:23:25 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:24:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:26:22 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:26:22 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:26:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:26:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:26:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:26:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:26:24 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

interesting
RMSE: 165.083162
MSE: 27252.450334
MAE: 65.068416
R2: 0.566461


23/11/26 22:34:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:35:44 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:37:13 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:37:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:37:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:37:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:37:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:37:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.9 MiB
23/11/26 22:37:15 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

life tips
RMSE: 355.620074
MSE: 126465.636807
MAE: 180.235390
R2: -0.607050


                                                                                