# In [None]:
import os
import sys
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.functions import desc
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Create (or reuse) the Spark session used by the rest of this script.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Load the ratings CSV: the first row supplies the column names and the
# column types are inferred from the data.
ratings = spark.read.csv("data/ratings.csv", header=True, inferSchema=True)

# Seeded 80/20 split into a training set and a held-out test set.
training, test = ratings.randomSplit([0.8, 0.2], seed=123)

# ALS estimator over the (userId, movieId, rating) columns.
# coldStartStrategy="drop": per the Spark ML docs, rows whose prediction is
# NaN (users/items unseen during fitting) are dropped from the output, so
# they do not turn the RMSE below into NaN.
als = ALS(
    rank=70,
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=123,
)
model = als.fit(training)

# Score the held-out ratings and report the root-mean-square error between
# the "rating" and "prediction" columns.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")


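# Calculate how densely the user-item rating matrix is filled, as a percentage
# (the value computed below is the filled share; sparsity would be 100 minus it)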
# num_ratings = df.select("rating").count()
# num_users = df.select("userId").distinct().count()
# num_items = df.select("movieId").distinct().count()
# num_elements = num_users * num_items
# num_non_zero = df.select("rating").na.drop().count()
# sparsity = (num_non_zero / num_elements)*100




# # Set the minimum number of ratings per user and per item
# min_ratings_user = 20
# min_ratings_item = 10

# # Filter out users with fewer than min_ratings_user ratings
# user_counts = df.groupBy("userId").count().filter(col("count") >= min_ratings_user)
# df = df.join(user_counts, "userId", "inner")

# # Filter out items with fewer than min_ratings_item ratings
# item_counts = df.groupBy("movieId").count().filter(col("count") >= min_ratings_item)
# df = df.join(item_counts, "movieId", "inner")





# --- Building Model ---
# Compute the item-item similarity matrix
# vectorAssembler = VectorAssembler(inputCols=["userId", "movieId"], outputCol="features")
# df = vectorAssembler.transform(df)
# normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)
# df = normalizer.transform(df)
# brp = BucketedRandomProjectionLSH(inputCol="normFeatures", outputCol="hashes", bucketLength=0.1, numHashTables=10)
# model = brp.fit(df)
# df = model.transform(df)
# similarity_df = model.approxSimilarityJoin(df, df, 1.0, distCol="cosineSimilarity").select(col("datasetA.movieId").alias("movieId1"), col("datasetB.movieId").alias("movieId2"), col("cosineSimilarity"))
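# # Drop self-pairs and keep 10 pairs ordered by the join's distance column, descending.
# # NOTE: approxSimilarityJoin's distCol is a distance (Euclidean for BucketedRandomProjectionLSH),
# # not a cosine similarity, so descending order returns the LEAST similar pairs; sort ascending for the most similar.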
# similarity_df = similarity_df.filter(col("movieId1") != col("movieId2")).orderBy(col("cosineSimilarity").desc()).limit(10)
# similarity_df.show(10)

# # Perform PCA on the similarity matrix
# vectorAssembler = VectorAssembler(inputCols=["cosineSimilarity"], outputCol="features")
# df = vectorAssembler.transform(similarity_df)
# scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
# scalerModel = scaler.fit(df)
# df = scalerModel.transform(df)
# pca = PCA(k=3, inputCol="scaledFeatures", outputCol="pcaFeatures")
# pcaModel = pca.fit(df)
# df = pcaModel.transform(df)

# # Get the top-k similar items for each item
# similarity_threshold = 0.8
# top_k = 2
# similarity_df = df.filter(col("pcaFeatures").getItem(0) >= similarity_threshold)
# similarity_df = similarity_df.orderBy(col("pcaFeatures").getItem(0).desc()).limit(top_k)
# similarity_df.show()





# take a subset of the data
# df = df.sample(0.00038461538, 111)
# df.show(100)
# (training, test) = df.randomSplit([0.8, 0.2], 123)
# training = training.groupBy("userId").pivot("movieId").agg({"rating": "first"}).na.fill(0)
# test = test.groupBy("userId").pivot("movieId").agg({"rating": "first"}).na.fill(0)
# training.show(100)
# test.show(100)