In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Read the `silverreview` table and drop every row that has a null in either
# the `text` or the `stars` column.
df = (
    spark.table("silverreview")
    .dropna(subset=["text", "stars"])
)

In [0]:
# Encode `stars` as the numeric `label` column the classifier trains on.
# handleInvalid="skip" makes the fitted indexer drop rows it cannot map
# instead of raising.
label_indexer = StringIndexer(
    inputCol="stars",
    outputCol="label",
    handleInvalid="skip",
)

In [0]:
# Text preprocessing: `text` -> `words` (tokens) -> `filtered_words`
# (tokens with stop words removed).
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)

In [0]:
# TF-IDF featurization: hash the filtered tokens into a 10,000-dimensional
# term-frequency vector (`rawFeatures`), then rescale it with IDF into
# `features`.
hashingTF = HashingTF(
    inputCol="filtered_words",
    outputCol="rawFeatures",
    numFeatures=10_000,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [0]:
# Baseline classifier: logistic regression over the `features` vector,
# predicting the indexed `label`. maxIter and regParam here are the baseline
# settings; the grid search further down overrides them per candidate.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=15,
    regParam=0.1,
)

In [0]:
# Chain every stage so a single fit()/transform() runs label indexing,
# tokenization, stop-word removal, TF-IDF and the classifier in order.
pipeline = Pipeline(
    stages=[
        label_indexer,
        tokenizer,
        remover,
        hashingTF,
        idf,
        lr,
    ]
)

In [0]:
# Baseline run: 80/20 train/test split with a fixed seed, then fit the whole
# pipeline on the training portion only.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)
model = pipeline.fit(train_data)

In [0]:
# Score the held-out test split with the fitted baseline pipeline.
predictions = model.transform(test_data)

# `prediction` is a StringIndexer index (the same space as `label`), not a raw
# star value, so it must be compared against `label`. `stars` is kept in the
# output so the reader can see which rating each index corresponds to.
display(predictions.select("text", "stars", "label", "prediction"))

In [0]:
# Accuracy of the baseline model on the test split: fraction of rows whose
# `prediction` equals the indexed `label`.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")

In [0]:
# Hyperparameter-tuning setup.
# Re-split the data 70/15/15 into train / validation / test. This rebinds
# `train_data` and `test_data`, so from here on they refer to this split.
train_data, val_data, test_data = df.randomSplit(weights=[0.7, 0.15, 0.15], seed=42)

# Accuracy evaluator used both for model selection (validation split) and for
# the final report (test split).
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label", predictionCol="prediction")

# Search space for the logistic-regression stage: 3 x 3 x 3 = 27 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [10, 20, 50])
    .addGrid(lr.regParam, [0.01, 0.1, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [0]:
# Manual grid search: fit the full pipeline once per parameter combination on
# the training split, score it on the validation split, and keep the candidate
# with the highest validation accuracy (the first one wins on ties).
bestModel = None
bestMetric = float("-inf")
bestParams = None

for params in paramGrid:
    # Use a loop-local name so the baseline `model` fitted earlier in the
    # notebook is not silently replaced by whichever candidate ran last.
    candidate_model = pipeline.fit(train_data, params)
    preds = candidate_model.transform(val_data)
    metric = evaluator.evaluate(preds)
    if metric > bestMetric:
        bestMetric = metric
        bestModel = candidate_model
        bestParams = params

# Fail here with a clear message rather than later with an AttributeError on
# None, e.g. when the grid is empty or every validation metric was NaN.
if bestModel is None:
    raise ValueError(
        "Grid search selected no model: the parameter grid is empty or no "
        "candidate produced a comparable validation metric."
    )

In [0]:
# Final, unbiased estimate: score the selected model on the test split, which
# took no part in fitting or model selection.
test_preds = bestModel.transform(test_data)
test_accuracy = evaluator.evaluate(test_preds)

# `bestParams` is keyed by Param objects, whose repr includes the parent uid
# and the full doc string. Report a plain {name: value} mapping instead so the
# chosen hyperparameters are readable.
readable_params = {param.name: value for param, value in bestParams.items()}

print(f"Best validation accuracy: {bestMetric:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")
print(f"Best params: {readable_params}")