In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, DateType, FloatType, IntegerType, TimestampType, ArrayType, StructType, StructField
from pyspark.sql.functions import from_unixtime, sum, rank,lag, explode, expr,spark_partition_id, to_date, coalesce, lit, to_timestamp, col, month, concat, count, max, when, dayofweek, datediff,dense_rank, desc, date_format
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec,HashingTF,IDF, CountVectorizer,VectorAssembler
from pyspark.sql.functions import udf
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF
from pyspark.ml import Pipeline,PipelineModel
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import LemmatizerModel
from pyspark.ml.classification import LinearSVC, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import sparknlp
import warnings

In [5]:
# remove all warnings
warnings.filterwarnings('ignore')

In [6]:
# spark = SparkSession.builder.appName('SparkBasics').getOrCreate()
spark = SparkSession.builder.appName('ml').getOrCreate()

# Get the context of the Pyspark environment
spark.sparkContext.getConf().getAll()
# Store spark context as a variable
sc = spark.sparkContext

In [7]:
reddit_data_df = spark.read.parquet("gs://msca-bdp-student-gcs/Group2_Final_Project/reddit_data/",header=True, inferSchema=True)
reddit_data_df = reddit_data_df.dropna()

reddit_data_df.printSchema()
reddit_data_df.select(col("subreddit_name_prefixed")).show()

                                                                                

root
 |-- archived: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- body: string (nullable = true)
 |-- comment_type: string (nullable = true)
 |-- controversiality: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: string (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: string (nullable = true)
 |-- name: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- retrieved_on: string (nullable = true)
 |-- score: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: string (nullable = true)





+-----------------------+
|subreddit_name_prefixed|
+-----------------------+
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
|                  r/DIY|
+-----------------------+
only showing top 20 rows



                                                                                

In [8]:
# for debug only, slice datasets
# Assuming df is your original DataFrame
games_df = reddit_data_df.filter(col("subreddit_name_prefixed") == "r/Games").limit(100)
diy_df = reddit_data_df.filter(col("subreddit_name_prefixed") == "r/DIY").limit(100)

# If you need to combine these two DataFrames
combined_df = games_df.union(diy_df)

# Show the result
combined_df.groupBy(col("subreddit_name_prefixed")).agg(count("*")).show()


                                                                                

+-----------------------+--------+
|subreddit_name_prefixed|count(1)|
+-----------------------+--------+
|                  r/DIY|     100|
|                r/Games|     100|
+-----------------------+--------+



In [9]:
# Tokenize and stop word removal
def clean_text(text):
    # Deal with component words
    re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Remove Http / Https links in the text
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'https\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Handling repeated characters (more than 2)
    text = re.sub(r'(.)\1+', r'\1\1', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [10]:
# for debug only -- slice count datasets
reddit_count_df = combined_df.groupby(col("subreddit_name_prefixed")).agg(count("*").alias("reddit_count")).orderBy("reddit_count", ascending=False)
reddit_count_df.show()



+-----------------------+------------+
|subreddit_name_prefixed|reddit_count|
+-----------------------+------------+
|                  r/DIY|         100|
|                r/Games|         100|
+-----------------------+------------+



                                                                                

In [11]:
def encode_by_tags(df,labelcol):
    
    tags_lst = [row[labelcol] for row in df.select(col(labelcol)).distinct().collect()]
    df_res = df
    for tag in tags_lst:
        print(tag)
        df_res= df_res.withColumn(tag, when(col(labelcol) == tag, 1).otherwise(0))

    return tags_lst, df_res

In [12]:

def clean_df(df, inputcol):
    clean_text_udf = udf(clean_text, StringType())
    df_cleaned = df.withColumn(inputcol, clean_text_udf(df[inputcol]))
    return df_cleaned

def train_test_val_split(df, train_prob = 0.7, test_prob=0.2, val_prob= 0.1):
    train_df, test_df, validation_df = df.randomSplit([train_prob, test_prob, val_prob])
    return train_df, test_df, validation_df
    

def preprocess_pipeline(labelcol, inputcol,finfeaturecol):
    # tokenize the comments into words
    tokenizer = Tokenizer(inputCol=inputcol, outputCol="token")
    
    # remove stop words
    remover = StopWordsRemover(inputCol="token", outputCol="filtered_token")
    
    # vecotorize the words
    vectorizer = CountVectorizer(inputCol="filtered_token", outputCol="features")
    idf = IDF(inputCol="features", outputCol="tfidf_features")
    
    # assemble all features into 1 column
    assembler = VectorAssembler(inputCols=["features","tfidf_features"], outputCol=finfeaturecol)

    # Create the preprocessing piplines for the tweets
    pipeline = Pipeline().setStages([
        tokenizer,
        remover,
        vectorizer,
        idf,
        assembler
    ])
    return pipeline

def model_training_pipeline(featurecol, labelcol):
    svm = LinearSVC(labelCol=labelcol, featuresCol=featurecol)
    pipeline = Pipeline(stages=[svm])
    return svm,pipeline

In [13]:
def find_best_svm_hyperparameters(df,featurecol,labelCol):
    
    svm,pipeline = model_training_pipeline(featurecol, labelcol)
    # Set up the parameter grid
    paramGrid = ParamGridBuilder() \
        .addGrid(svm.maxIter, [10, 100]) \
        .addGrid(svm.regParam, [0.01, 0.1, 1.0]) \
        .build()

    # Evaluator
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol=labelCol)
    df.select(col(featurecol)).show()

    # Cross Validator
    crossval = CrossValidator(estimator=svm,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=3)

    # Run cross-validation and choose the best model
    print("start fitting")
    cvModel = crossval.fit(df)
    print(cvModel)

    return cvModel.bestModel

In [14]:

# clean dataframe
def get_model_specs(df,labelcol,inputcol,finfeaturecol,model_train):
    # model_dict
    model_lst = {}
    
    # model_spec
    model_spec = {}
    
    # encode the reddit_data_df into subreddits one-hot encoding df
    tags_lst, encoded_df = encode_by_tags(df,labelcol)
    
    # train the models by different tags
    # 更改这个标记
    for tag in tags_lst:
    
        f1_evaluator = MulticlassClassificationEvaluator(labelCol=tag, predictionCol="prediction", metricName="f1")
        accuracy_evaluator = MulticlassClassificationEvaluator(labelCol=tag, predictionCol="prediction", metricName="accuracy")

        # use clean_df to clean up special characters and spaces
        df_cleaned = clean_df(encoded_df,inputcol)

        # train, test, val random split dataset
        train_df, test_df, val_df = train_test_val_split(df_cleaned)
        df_cleaned.select(tag)

        #Preprocess the dataframes
        preprocess = preprocess_pipeline(tag,inputcol,finfeaturecol)
        preprocess = preprocess.fit(train_df)
        train_processed = preprocess.transform(train_df).select(finfeaturecol, tag)
        test_processed = preprocess.transform(test_df).select(finfeaturecol, tag)
        validation_processed = preprocess.transform(val_df).select(finfeaturecol, tag)
        
        train_processed.show()
        
        svm, pipeline = model_train(finfeaturecol,tag)
         # Predict the model
        model = pipeline.fit(train_processed)
        predictions = model.transform(test_processed)
        val_predictions = model.transform(validation_processed)

        # get the acc and f1-score
        f1_score = f1_evaluator.evaluate(predictions)
        accuracy = accuracy_evaluator.evaluate(predictions)
        val_accuracy = accuracy_evaluator.evaluate(val_predictions)
        
        #save model and model spec
        model_lst[tag] = model
        model_spec[tag] = {"f1_score": f1_score, "accuracy": accuracy, "val_accuracy": val_accuracy}
        
#         # 添加到 model 文件夹
#         model.write().overwrite().save("gs://msca-bdp-student-gcs/Group2_Final_Project/test")
    
    return model_lst, model_spec

# Transform the data
labelcol = "subreddit_name_prefixed"
inputcol = "body"
finfeaturecol = "final_features"
model_lst, model_spec = get_model_specs(combined_df,labelcol,inputcol,finfeaturecol, model_training_pipeline)



                                                                                

r/DIY
r/Games


                                                                                

+--------------------+-----+
|      final_features|r/DIY|
+--------------------+-----+
|(2976,[0,1,3,4,9,...|    0|
|(2976,[1,20,21,28...|    0|
|(2976,[11,28,309,...|    0|
|(2976,[147,905,10...|    0|
|(2976,[115,134,14...|    0|
|(2976,[1,10,12,14...|    0|
|(2976,[10,51,255,...|    0|
|(2976,[7,30,39,66...|    0|
|(2976,[1,8,23,122...|    0|
|(2976,[7,46,99,12...|    0|
|(2976,[14,46,1010...|    0|
|(2976,[1,2,7,8,10...|    0|
|(2976,[1,7,10,28,...|    0|
|(2976,[1,7,20,28,...|    0|
|(2976,[23,27,44,4...|    0|
|(2976,[227,244,35...|    0|
|(2976,[1,14,21,34...|    0|
|(2976,[7,21,88,11...|    0|
|(2976,[21,23,28,3...|    0|
|(2976,[1082,2570]...|    0|
+--------------------+-----+
only showing top 20 rows



23/11/28 04:28:19 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/11/28 04:28:19 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

+--------------------+-------+
|      final_features|r/Games|
+--------------------+-------+
|(3176,[3,16,32,34...|      1|
|(3176,[3,12,22,38...|      1|
|(3176,[13,94,150,...|      1|
|(3176,[0,2,3,4,5,...|      1|
|(3176,[3,23,44,51...|      1|
|(3176,[21,23,297,...|      1|
|(3176,[118,289,17...|      1|
|(3176,[3,10,12,28...|      1|
|(3176,[3,13,106,1...|      1|
|(3176,[3,23,231,4...|      1|
|(3176,[463,2051],...|      1|
|(3176,[21,63,135,...|      1|
|(3176,[1,175,180,...|      1|
|(3176,[28,100,102...|      1|
|(3176,[12,215,289...|      1|
|(3176,[1,3,7,10,1...|      1|
|(3176,[1,3,10,23,...|      1|
|(3176,[1,3,23,51,...|      1|
|(3176,[0,26,40,69...|      1|
|(3176,[1,3,224,35...|      1|
+--------------------+-------+
only showing top 20 rows



                                                                                

In [15]:
print(model_spec)

{'r/DIY': {'f1_score': 0.6583416583416584, 'accuracy': 0.717948717948718, 'val_accuracy': 0.6666666666666666}, 'r/Games': {'f1_score': 0.7114431239388794, 'accuracy': 0.82, 'val_accuracy': 0.8333333333333334}}


In [16]:

# convert the model into dataframe for better visualization
model_data = [(tag, specs["f1_score"], specs["accuracy"],specs["val_accuracy"]) for tag, specs in model_spec.items()]

schema = StructType([
    StructField("model", StringType(), True),
    StructField("f1_score", FloatType(), True),
    StructField("accuracy", FloatType(), True),
    StructField("val_accuracy", FloatType(), True)
])

# Create DataFrame
model_df = spark.createDataFrame(model_data, schema)

model_df.show()



+-------+----------+----------+------------+
|  model|  f1_score|  accuracy|val_accuracy|
+-------+----------+----------+------------+
|  r/DIY|0.65834165|0.71794873|   0.6666667|
|r/Games| 0.7114431|      0.82|   0.8333333|
+-------+----------+----------+------------+



                                                                                

In [17]:
# save all the models
# for key in model_lst:
#     model_lst[key].write().overwrite().save("msca-bdp-student-gcs/Group2_Final_Project/model"+ key +"_model")