In [1]:
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors, DenseVector
from pyspark.ml.feature import VectorAssembler, Normalizer
from pyspark.ml.stat import Correlation
from pyspark.sql.types import DoubleType
spark.conf.set("spark.sql.pivotMaxValues", "60000")

## Import the dataset

In [2]:
data = spark.read.csv("gs://mit805_data_bucket/data_raw/final_animedataset.csv", header = True, inferSchema=True).limit(1000)
# Select only the required columns
anime_ratings = data.select("user_id", "anime_id", "my_score")
anime_ratings.show(5)



+-------+--------+--------+
|user_id|anime_id|my_score|
+-------+--------+--------+
|2255153|      21|       9|
|2255153|      59|       7|
|2255153|      74|       7|
|2255153|     120|       7|
|2255153|     178|       7|
+-------+--------+--------+
only showing top 5 rows



                                                                                

## Create the ALS recommendation model

In [3]:
from pyspark.ml.recommendation import ALS

# Set up the ALS model
als = ALS(maxIter=10, 
          regParam=0.1, 
          userCol="user_id", 
          itemCol="anime_id", 
          ratingCol="my_score", 
          coldStartStrategy="drop")

# Fit the model
model = als.fit(anime_ratings)


                                                                                

## Generate top 10 recommendations

In [4]:
user_recommendations = model.recommendForAllUsers(10)


In [5]:
user_recommendations.show()



+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|2255153|[{966, 10.052982}...|
|  37326|[{801, 10.00388},...|
|1897606|[{34599, 9.954706...|
+-------+--------------------+



                                                                                

## Generate recommendations

In [8]:
# Using equals condition
user_recommendations.filter(user_recommendations.user_id == 2255153).show(truncate=False)



+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                  |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|2255153|[{966, 10.052982}, {14713, 9.844229}, {857, 9.811249}, {269, 9.587059}, {1579, 9.047685}, {1735, 8.949792}, {7054, 8.924796}, {21, 8.690091}, {1974, 8.042386}, {3731, 8.042386}]|
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



                                                                                

Here we have the recommendations for a particular user, lets do a join to see the names associated with the anime_ids

In [9]:
# Assuming `user_recommendations` contains the recommendations for each user
# and `anime_data_spark` is the original DataFrame with all the data

from pyspark.sql.functions import explode

# Explode the recommendations column to have one row per recommendation
exploded_recommendations = user_recommendations.withColumn("rec", explode(user_recommendations.recommendations))

# Extract the anime_id and rating from the recommendation
recommendations_with_titles = exploded_recommendations.select(
    "user_id",
    "rec.anime_id",
    "rec.rating"
)

# Join with the original dataset to get the title
final_recommendations = recommendations_with_titles.join(
    data.select("anime_id", "title").distinct(),
    on="anime_id",
    how="left"
)

In [10]:
final_recommendations.filter(user_recommendations.user_id == 2255153).show(truncate=False)

                                                                                

+--------+-------+---------+---------------------+
|anime_id|user_id|rating   |title                |
+--------+-------+---------+---------------------+
|966     |2255153|10.052982|null                 |
|14713   |2255153|9.844229 |null                 |
|857     |2255153|9.811249 |null                 |
|269     |2255153|9.587059 |Bleach               |
|1579    |2255153|9.047685 |null                 |
|1735    |2255153|8.949792 |Naruto: Shippuuden   |
|7054    |2255153|8.924796 |Kaichou wa Maid-sama!|
|21      |2255153|8.690091 |One Piece            |
|1974    |2255153|8.042386 |null                 |
|3731    |2255153|8.042386 |Itazura na Kiss      |
+--------+-------+---------+---------------------+

