In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, RegexTokenizer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import lower, col

In [0]:
# Read the `silverreview` table and keep only rows where both the review
# text and the star rating are present; both columns are required by the
# feature stages and the label indexer further down.
raw_df = (
    spark.table("silverreview")
    .na.drop(how="any", subset=["text", "stars"])
)

In [0]:
# Add a lower-cased copy of the review text; tokenisation reads from this column.
df = raw_df.withColumn("text_lower", lower(col("text")))

# stars -> numeric "label"; rows whose value is invalid for the fitted
# indexer are skipped rather than raising (handleInvalid="skip").
label_indexer = StringIndexer(
    inputCol="stars",
    outputCol="label",
    handleInvalid="skip",
)

# Split the lower-cased text on every non-word character.
tokenizer = RegexTokenizer(
    inputCol="text_lower",
    outputCol="words",
    pattern="\\W",
)

# Drop English stop words using Spark's bundled list.
english_stop_words = StopWordsRemover.loadDefaultStopWords("english")
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
    stopWords=english_stop_words,
)

# Hash the remaining tokens into a fixed 10000-dimensional term-frequency vector ...
hashingTF = HashingTF(
    inputCol="filtered_words",
    outputCol="rawFeatures",
    numFeatures=10000,
)

# ... and rescale it by inverse document frequency to produce "features".
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [0]:
# Split BEFORE fitting any stage. IDF (document frequencies) and StringIndexer
# (label ordering) are estimators: fitting them on the full dataset would let
# statistics from the held-out rows leak into the features used for training
# and make the test metric optimistic.
(train_raw, test_raw) = df.randomSplit([0.8, 0.2], seed=42)

feature_pipeline = Pipeline(stages=[label_indexer, tokenizer, remover, hashingTF, idf])

# Learn the label index and IDF weights from the training split only ...
feature_model = feature_pipeline.fit(train_raw)

# ... then apply the same fitted transformation to both splits.
train_data = feature_model.transform(train_raw)
test_data = feature_model.transform(test_raw)

# Kept so any later cell that still references `processed_df` keeps working.
# Spark transformations are lazy, so this costs nothing unless it is used.
processed_df = feature_model.transform(df)

In [0]:
# Random forest over the TF-IDF "features" column, predicting the indexed
# star rating in "label". The seed is fixed (same value as the train/test
# split) so bootstrap sampling and feature sub-sampling use a fixed seed
# instead of an unpinned default. No other hyperparameters are set, so the
# library defaults apply.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)

# Multiclass accuracy: fraction of rows where "prediction" equals "label".
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [0]:
# Train the forest on the training split only.
model = rf.fit(dataset=train_data)

# Score the held-out split, then reduce the predictions to a single
# accuracy figure with the evaluator configured above.
predictions = model.transform(dataset=test_data)
accuracy = evaluator.evaluate(dataset=predictions)

# Report as a percentage with two decimals.
print(f"Test Accuracy: {accuracy:.2%}")