# Exploring Spark ML regression models for movie ratings prediction

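In this project, we use Spark ML to predict movie ratings on the MovieLens 100K dataset and compare several regression models: linear regression (plain and regularized), a generalized linear model, and a decision tree.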

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Entry point for Spark SQL / DataFrame functionality: get the active session if one
# exists, otherwise create one under this application name
spark = (
    SparkSession.builder
    .appName("MovieLens Regression Model")
    .getOrCreate()
)

### Data import and preparation

In [3]:
# Directory holding the extracted MovieLens 100K files; change it here if the data lives elsewhere
DATA_DIR = "ml-100k/ml-100k"


def load_delimited(file_name, sep, column_names, **reader_options):
    """Read a header-less delimited file from DATA_DIR and name its leading columns.

    Args:
        file_name: File name inside DATA_DIR (e.g. "u.data").
        sep: Field separator used by the file.
        column_names: Names for the first len(column_names) columns, in file order.
            Any remaining columns keep Spark's default `_cN` names.
        **reader_options: Extra keyword options forwarded to `spark.read.csv`.

    Returns:
        A DataFrame with an inferred schema and the leading columns renamed.
    """
    table_df = spark.read.csv(
        f"{DATA_DIR}/{file_name}", sep=sep, header=False, inferSchema=True, **reader_options
    )
    for position, column_name in enumerate(column_names):
        table_df = table_df.withColumnRenamed(f"_c{position}", column_name)
    return table_df


# Load the movie ratings dataset (tab-separated)
ratings_df = load_delimited("u.data", "\t", ["user_id", "movie_id", "rating", "timestamp"])

# Load the users dataset
users_df = load_delimited("u.user", "|", ["user_id", "age", "gender", "occupation"])

# Load the movies dataset
# u.item in the MovieLens 100K distribution is Latin-1 (ISO-8859-1) encoded, not UTF-8;
# without an explicit encoding, titles containing accented characters are decoded incorrectly
movies_df = load_delimited("u.item", "|", ["movie_id", "title", "release_date"], encoding="ISO-8859-1")

# Merge datasets: keep every rating and attach the matching user and movie attributes
ratings_with_features_df = (
    ratings_df
    .join(users_df, on="user_id", how="left")
    .join(movies_df, on="movie_id", how="left")
)

# Select relevant features for modeling
# The * is used to unpack the list so that each element becomes a separate argument to the select function
feature_columns = ["age", "gender", "occupation", "title"]
ratings_with_features_df = ratings_with_features_df.select("rating", *feature_columns)
ratings_with_features_df.show(5)

+------+---+------+----------+--------------------+
|rating|age|gender|occupation|               title|
+------+---+------+----------+--------------------+
|     3| 49|     M|    writer|        Kolya (1996)|
|     3| 39|     F| executive|L.A. Confidential...|
|     1| 25|     M|    writer| Heavyweights (1994)|
|     2| 28|     M|technician|Legends of the Fa...|
|     1| 47|     M|  educator| Jackie Brown (1997)|
+------+---+------+----------+--------------------+
only showing top 5 rows



In [4]:
# Convert categorical features to numerical using StringIndexer
# Each categorical column gets a companion column suffixed with _index
categorical_columns = ["gender", "occupation", "title"]
index_columns = [column + "_index" for column in categorical_columns]

# Make this cell safe to re-run: StringIndexer refuses to fit/transform when its output
# column already exists, so drop index columns left over from a previous execution.
# On a first run none of them exist and drop() leaves the DataFrame unchanged.
ratings_with_features_df = ratings_with_features_df.drop(*index_columns)

# We create a list of fitted StringIndexer models, one for each categorical column
indexers = [
    StringIndexer(inputCol=column, outputCol=index_column).fit(ratings_with_features_df)
    for column, index_column in zip(categorical_columns, index_columns)
]

# Apply the StringIndexer transformations to the dataset
# Each fitted indexer adds its own *_index column to the DataFrame
for indexer in indexers:
    ratings_with_features_df = indexer.transform(ratings_with_features_df)
ratings_with_features_df.show(5)

+------+---+------+----------+--------------------+------------+----------------+-----------+
|rating|age|gender|occupation|               title|gender_index|occupation_index|title_index|
+------+---+------+----------+--------------------+------------+----------------+-----------+
|     3| 49|     M|    writer|        Kolya (1996)|         0.0|             6.0|      287.0|
|     3| 39|     F| executive|L.A. Confidential...|         1.0|             9.0|       38.0|
|     1| 25|     M|    writer| Heavyweights (1994)|         0.0|             6.0|     1045.0|
|     2| 28|     M|technician|Legends of the Fa...|         0.0|             8.0|      413.0|
|     1| 47|     M|  educator| Jackie Brown (1997)|         0.0|             2.0|      259.0|
+------+---+------+----------+--------------------+------------+----------------+-----------+
only showing top 5 rows



In [5]:
# Assemble features into a single vector
# The VectorAssembler combines the indexed categorical columns and age into one vector column
feature_input_columns = [column + "_index" for column in ["gender", "occupation", "title"]] + ["age"]
assembler = VectorAssembler(inputCols=feature_input_columns, outputCol="features")

# This step creates the features column, which contains the assembled feature vector for each row.
# Dropping any existing "features" column first keeps the cell re-runnable: VectorAssembler
# rejects a DataFrame that already has its output column. On a first run the drop is a no-op.
ratings_with_features_df = assembler.transform(ratings_with_features_df.drop("features"))
ratings_with_features_df.show(5)

# Split the dataset into train and test sets (fixed seed for a reproducible split)
train_data, test_data = ratings_with_features_df.randomSplit([0.8, 0.2], seed=123)

# Both splits are reused by every model fit and every metric evaluation below; caching
# avoids re-running the joins, indexing and assembly for each of those actions
train_data = train_data.cache()
test_data = test_data.cache()

+------+---+------+----------+--------------------+------------+----------------+-----------+--------------------+
|rating|age|gender|occupation|               title|gender_index|occupation_index|title_index|            features|
+------+---+------+----------+--------------------+------------+----------------+-----------+--------------------+
|     3| 49|     M|    writer|        Kolya (1996)|         0.0|             6.0|      287.0|[0.0,6.0,287.0,49.0]|
|     3| 39|     F| executive|L.A. Confidential...|         1.0|             9.0|       38.0| [1.0,9.0,38.0,39.0]|
|     1| 25|     M|    writer| Heavyweights (1994)|         0.0|             6.0|     1045.0|[0.0,6.0,1045.0,2...|
|     2| 28|     M|technician|Legends of the Fa...|         0.0|             8.0|      413.0|[0.0,8.0,413.0,28.0]|
|     1| 47|     M|  educator| Jackie Brown (1997)|         0.0|             2.0|      259.0|[0.0,2.0,259.0,47.0]|
+------+---+------+----------+--------------------+------------+----------------

### Model training and evaluation

In [6]:
# Regularization strength shared by all penalized linear models
REG_PARAM = 0.1

# Define the models
# In LinearRegression, elasticNetParam only chooses the L1/L2 mix (1.0 = pure L1, 0.0 = pure L2);
# the penalty itself is scaled by regParam, which defaults to 0.0. Every regularized model
# therefore sets regParam explicitly -- otherwise it would be fitted as plain OLS.
lr = LinearRegression(featuresCol="features", labelCol="rating")
lasso = LinearRegression(featuresCol="features", labelCol="rating", regParam=REG_PARAM, elasticNetParam=1.0)
ridge = LinearRegression(featuresCol="features", labelCol="rating", regParam=REG_PARAM, elasticNetParam=0.0)
elastic_net = LinearRegression(featuresCol="features", labelCol="rating", regParam=REG_PARAM, elasticNetParam=0.5)
# GeneralizedLinearRegression fits a generalized linear model (GLM); the variable keeps its
# original name so that any other cell referring to `gls` still works
gls = GeneralizedLinearRegression(featuresCol="features", labelCol="rating")
# NOTE(review): maxBins is presumably raised to 2000 so the high-cardinality title_index
# feature (indices above 1000 appear in the output above) fits in the tree's bins -- confirm
decision_tree = DecisionTreeRegressor(featuresCol="features", labelCol="rating", maxBins=2000)

# Create a list of models
models = [lr, lasso, ridge, elastic_net, gls, decision_tree]
model_names = ["Linear Regression (OLS)", "Lasso", "Ridge", "Elastic Net", "GLM", "Decision Tree"]

# The evaluators do not depend on the model, so build them once instead of once per iteration.
# Each entry pairs the label printed in the report with the evaluator that computes it.
evaluators = [
    ("RMSE", RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")),
    ("MAE", RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="mae")),
    ("R-squared", RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="r2")),
]

# Perform model training and evaluation
for model, name in zip(models, model_names):
    # Train the model
    model_fit = model.fit(train_data)

    # Make predictions on the test data
    predictions = model_fit.transform(test_data)
    print(f"\n{name} predictions:")
    predictions.show(5)

    # Evaluate the model on the test predictions using RMSE, MAE and R-squared
    scores = {metric_label: evaluator.evaluate(predictions) for metric_label, evaluator in evaluators}

    print("Metrics:")
    for metric_label, _ in evaluators:
        print(f"{name} {metric_label}:", scores[metric_label])


Linear Regression (OLS) predictions:
+------+---+------+----------+--------------------+------------+----------------+-----------+--------------------+------------------+
|rating|age|gender|occupation|               title|gender_index|occupation_index|title_index|            features|        prediction|
+------+---+------+----------+--------------------+------------+----------------+-----------+--------------------+------------------+
|     1| 10|     M|   student|Beauty and the Be...|         0.0|             0.0|      114.0|[0.0,0.0,114.0,10.0]|3.6023699284437596|
|     1| 11|     M|      none|Batman & Robin (1...|         0.0|            17.0|      528.0|[0.0,17.0,528.0,1...|3.0967906559021756|
|     1| 13|     F|   student|Everyone Says I L...|         1.0|             0.0|      172.0|[1.0,0.0,172.0,13.0]|3.5933585804634505|
|     1| 13|     F|   student|George of the Jun...|         1.0|             0.0|      181.0|[1.0,0.0,181.0,13.0]|3.5849668421605605|
|     1| 13|     F|   st

In [7]:
# Stop the SparkSession (`spark`) created at the top of the notebook.
# This is the last cell shown here, so no further Spark work is expected after it.
spark.stop()