# Collaborative Filtering

In [None]:
# Print the Java and Python versions available in this runtime (shell escapes via `!`).
!java --version
!python --version

openjdk 11.0.18 2023-01-17
OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1)
OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)
Python 3.9.16


In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=afcec6fe78624b719440c588d91214a19516bcca111a290a3a2b4ecc7ab01b5c
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [None]:
# SparkSession is the entry point used by the rest of this notebook.
from pyspark.sql import SparkSession

# Build a session (or reuse one that already exists) on the local machine.
# master "local[*]" -> run locally using all CPU cores.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting;
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Hello Pyspark")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Sanity check: print the session object to confirm that it was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f35dddacc10>


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
import matplotlib.pyplot as plt

In [None]:
# Read the local ratings file as plain text (one string column, `value`) and
# split every line on "::" into userId, movieId, rating and timestamp.
ratings = spark.read.text("/content/sample_movielens_ratings.txt").selectExpr(
    "split(value, '::')[0] as userId",
    "split(value, '::')[1] as movieId",
    "split(value, '::')[2] as rating",
    "split(value, '::')[3] as timestamp",
)

# Cast the two id columns to int and the rating to float, replacing each
# column in place; timestamp is left as the string produced by the split.
for column_name, spark_type in (("userId", "int"), ("movieId", "int"), ("rating", "float")):
    ratings = ratings.withColumn(column_name, ratings[column_name].cast(spark_type))

In [None]:
# Split the ratings 80% / 20% into training and test sets.
# The seed is fixed so that a fresh run produces the same split; without it
# every run evaluates on different rows and the RMSE figures are not comparable
# between runs.
training, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Candidate values for the ALS grid search below: every maxIter value is
# combined with every regParam value.
maxIters = [
    5,
    10,
    15,
]
regParams = [
    0.1,
    0.5,
    1.0,
]

In [None]:
# Parallel result lists filled by the grid search: entry i of rmse_values is
# the RMSE obtained with the (maxIter, regParam) pair at entry i of
# hyperparameters.
hyperparameters, rmse_values = [], []

In [None]:
# Grid search over (maxIter, regParam): fit one ALS model per combination on
# `training` and record its RMSE on `test`.

# Start from empty result lists so that re-running this cell does not append a
# second copy of every result to lists that were already filled.
hyperparameters = []
rmse_values = []

# The evaluator does not depend on the hyperparameters, so it is built once
# instead of once per iteration.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

for maxIter in maxIters:
    for regParam in regParams:
        # Build the recommendation model using ALS on the training data.
        # seed is fixed so the factor initialisation (and therefore the RMSE)
        # is the same on every run. coldStartStrategy="drop" is kept from the
        # original; per the Spark ALS documentation it drops rows whose
        # prediction is NaN so they do not turn the RMSE into NaN.
        als = ALS(
            maxIter=maxIter,
            regParam=regParam,
            userCol="userId",
            itemCol="movieId",
            ratingCol="rating",
            coldStartStrategy="drop",
            seed=42,
        )
        model = als.fit(training)

        # Evaluate the model by computing the RMSE on the test data.
        predictions = model.transform(test)
        rmse = evaluator.evaluate(predictions)

        hyperparameters.append((maxIter, regParam))
        rmse_values.append(rmse)
        print(f"maxIter = {maxIter}, regParam = {regParam}: Root-mean-square error = {rmse}")

maxIter = 5, regParam = 0.1: Root-mean-square error = 1.1132879880195798
maxIter = 5, regParam = 0.5: Root-mean-square error = 1.334301958426809
maxIter = 5, regParam = 1.0: Root-mean-square error = 1.6253551233362817
maxIter = 10, regParam = 0.1: Root-mean-square error = 1.0634445228935412
maxIter = 10, regParam = 0.5: Root-mean-square error = 1.3332422266129222
maxIter = 10, regParam = 1.0: Root-mean-square error = 1.6253546063609272
maxIter = 15, regParam = 0.1: Root-mean-square error = 1.0637379858202258
maxIter = 15, regParam = 0.5: Root-mean-square error = 1.3333354619898368
maxIter = 15, regParam = 1.0: Root-mean-square error = 1.6253546149602884


In [None]:
# Shut down the Spark session now that the experiment is finished; cells that
# use `spark` cannot be re-run after this without recreating the session.
spark.stop()