# Exploring Spark ML classification models for movie ratings prediction

In this project, we explore Spark ML to classify movie ratings using various classification models.

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, LinearSVC
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [2]:
# Create a SparkSession
spark = SparkSession.builder.appName("Movie Rating Classification").getOrCreate()

### Data import and preparation

In [3]:
# Load the ratings dataset
ratings_df = spark.read.csv("ml-100k/ml-100k/u.data", sep="\t", header=False, inferSchema=True)
ratings_df = ratings_df.withColumnRenamed("_c0", "user_id").withColumnRenamed("_c1", "movie_id").withColumnRenamed("_c2", "rating").withColumnRenamed("_c3", "timestamp")

# Load the users dataset
users_df = spark.read.csv("ml-100k/ml-100k/u.user", sep="|", header=False, inferSchema=True)
users_df = users_df.withColumnRenamed("_c0", "user_id").withColumnRenamed("_c1", "age").withColumnRenamed("_c2", "gender").withColumnRenamed("_c3", "occupation")

# Load the movies dataset
movies_df = spark.read.csv("ml-100k/ml-100k/u.item", sep="|", header=False, inferSchema=True)
movies_df = movies_df.withColumnRenamed("_c0", "movie_id").withColumnRenamed("_c1", "title").withColumnRenamed("_c2", "release_date")

# Merge datasets
ratings_df = ratings_df.join(users_df, on="user_id", how="left").join(movies_df, on="movie_id", how="left")

# Select the relevant features
feature_columns = ["age", "gender", "occupation"]
ratings_df = ratings_df.select("rating", *feature_columns)
ratings_df.show(5)

+------+---+------+----------+
|rating|age|gender|occupation|
+------+---+------+----------+
|     3| 49|     M|    writer|
|     3| 39|     F| executive|
|     1| 25|     M|    writer|
|     2| 28|     M|technician|
|     1| 47|     M|  educator|
+------+---+------+----------+
only showing top 5 rows



In [4]:
# Define a threshold for binary classification
threshold = 4
# Add a new column indicating whether the user liked the movie (1) or not (0)
ratings_df = ratings_df.withColumn("liked", (ratings_df["rating"] >= threshold).cast("int"))
ratings_df.show(5)

+------+---+------+----------+-----+
|rating|age|gender|occupation|liked|
+------+---+------+----------+-----+
|     3| 49|     M|    writer|    0|
|     3| 39|     F| executive|    0|
|     1| 25|     M|    writer|    0|
|     2| 28|     M|technician|    0|
|     1| 47|     M|  educator|    0|
+------+---+------+----------+-----+
only showing top 5 rows



In [5]:
# Convert categorical features to numerical
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(ratings_df) for column in ["gender", "occupation"]]
for indexer in indexers:
    ratings_df = indexer.transform(ratings_df)

# Convert features into a single vector
assembler = VectorAssembler(inputCols=["age", "gender_index", "occupation_index"], outputCol="features")
ratings_df = assembler.transform(ratings_df)

# Split into train and test sets
train_data, test_data = ratings_df.randomSplit([0.8, 0.2], seed=123)

### Model training and evaluation

In [6]:
# Define the classification models
lr = LogisticRegression(featuresCol="features", labelCol="liked")
rf = RandomForestClassifier(featuresCol="features", labelCol="liked")
dt = DecisionTreeClassifier(featuresCol="features", labelCol="liked")
svm = LinearSVC(featuresCol="features", labelCol="liked")

models = [lr, rf, dt, svm]
model_names = ["Logistic Regression", "Random Forest", "Decision Tree", "Support Vector Machine"]

# Define the parameter grid for hyperparameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.5, 1.0]) \
    .addGrid(rf.numTrees, [50, 70]) \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .addGrid(svm.regParam, [0.1, 0.01]) \
    .build()

# Perform model training and evaluation with cross validation
for model, name in zip(models, model_names):
    # Set up the pipeline with feature scaling
    scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
    pipeline = Pipeline(stages=[scaler, model])
    # Set up cross validation
    crossval = CrossValidator(estimator=model,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(labelCol="liked", rawPredictionCol="rawPrediction", metricName="areaUnderROC"),
                              numFolds=3)

    # Train the model
    model_fit = crossval.fit(train_data)

    # Make predictions on the test data
    predictions = model_fit.transform(test_data)
    print(f"\n{name} predictions:")
    predictions.show(5)

    # Evaluate the model using area under ROC curve
    evaluator_acc = MulticlassClassificationEvaluator(labelCol="liked", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator_acc.evaluate(predictions)
    evaluator_precision = MulticlassClassificationEvaluator(labelCol="liked", predictionCol="prediction", metricName="weightedPrecision")
    precision = evaluator_precision.evaluate(predictions)
    evaluator_recall = MulticlassClassificationEvaluator(labelCol="liked", predictionCol="prediction", metricName="weightedRecall")
    recall = evaluator_recall.evaluate(predictions)
    evaluator_f1 = MulticlassClassificationEvaluator(labelCol="liked", predictionCol="prediction", metricName="f1")
    f1 = evaluator_f1.evaluate(predictions)
    evaluator_auc = BinaryClassificationEvaluator(labelCol="liked", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
    auc = evaluator_auc.evaluate(predictions)

    print(f"\n{name} evaluation metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")


Logistic Regression predictions:
+------+---+------+----------+-----+------------+----------------+---------------+--------------------+--------------------+----------+
|rating|age|gender|occupation|liked|gender_index|occupation_index|       features|       rawPrediction|         probability|prediction|
+------+---+------+----------+-----+------------+----------------+---------------+--------------------+--------------------+----------+
|     1| 10|     M|   student|    0|         0.0|             0.0| [10.0,0.0,0.0]|[-0.1088798148012...|[0.47280690508373...|       1.0|
|     1| 11|     M|      none|    0|         0.0|            17.0|[11.0,0.0,17.0]|[-0.0478344688206...|[0.48804366251882...|       1.0|
|     1| 13|     F|   student|    0|         1.0|             0.0| [13.0,1.0,0.0]|[-0.1258737330627...|[0.46857305034524...|       1.0|
|     1| 13|     F|   student|    0|         1.0|             0.0| [13.0,1.0,0.0]|[-0.1258737330627...|[0.46857305034524...|       1.0|
|     1| 13|  

In [7]:
# Stop the SparkSession
spark.stop()