Load datasets
-----------------

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.appName("Recommendation ALS").getOrCreate()

# do something to prove it works
movies_df = spark.read.option("header", "true").csv("data/movies.csv", inferSchema=True)
links_df = spark.read.option("header", "true").csv("data/links.csv", inferSchema=True)
movies_df = movies_df.join(links_df, on = ['movieId'])
ratings_df = spark.read.option("header", "true").csv("data/ratings.csv", inferSchema=True)
tags_df = spark.read.option("header", "true").csv("data/tags.csv", inferSchema=True)

# movies_df.show()

Compute the item feature vector
------

In [2]:
from pyspark.sql.functions import log10
from pyspark.sql.functions import col
import math

tf = tags_df.groupBy(["movieId", "tag"]).count().selectExpr("movieId", "tag","count AS tag_count_tf")

tags_distinct = tags_df.selectExpr("movieId", "tag").dropDuplicates()
df = tags_distinct.groupBy("tag").count().selectExpr("tag", "count AS tag_count_df")
idf = math.log10(tags_df.select("movieId").distinct().count())
df = df.withColumn("idf", idf - log10("tag_count_df"))

tf = tf.join(df, on = "tag", how = "left")
tf = tf.withColumn("tf-idf", col("tag_count_tf") * col("idf"))
# show TF-IDF values for each movie
# tf.select("movieId", "tag", "tf-idf").show()

Calculate unit length vector of TF-IDF for normalization
------

In [3]:
from pyspark.sql.functions import col
from pyspark.sql.functions import sqrt

vect_len = tf.select("movieId","tf-idf")
vect_len = vect_len.withColumn("tf-idf-sq", col("tf-idf")**2)
vect_len = vect_len.groupby("movieId").sum().withColumnRenamed("sum(tf-idf)", "tf-idf-sum")\
    .withColumnRenamed("sum(tf-idf-sq)", "tf-idf-sq-sum")
vect_len = vect_len.withColumn("vect_length", sqrt("tf-idf-sq-sum"))
tf = tf.join(vect_len,on = "movieId", how = "left")
tf = tf.withColumn("tag_vec", col("tf-idf")/col("vect_length"))

# display the feature unit length vector of each movie: 'tag_vec'
# tf.filter(tf["movieId"] == 60756).select("movieId","tag","tf-idf","vect_length", "tag_vec").show()

Let’s implement the same and calculate user profile for each user.

In [4]:
from pyspark.sql.functions import lit

ratings_filter = ratings_df.filter(ratings_df["rating"] > 3)
distinct_users = ratings_df.select("userId").distinct()

#enter user ID for analysis
userId = 65
user_data= ratings_filter.filter(ratings_filter["userId"] == userId)
user_data = tf.join(user_data, on = "movieId", how = "inner")

user_tag_pref = user_data.groupby("tag").sum().withColumnRenamed("sum(tag_vec)", "tag_pref")\
    .select("tag","tag_pref")
user_tag_pref = user_tag_pref.withColumn("user", lit(userId))
# user_tag_pref.filter(user_tag_pref["tag"] == "Boxing story").show()

Step 4. Compute the cosine similarities and predict item ratings
--------

In [5]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F
import math

distinct_movies = tf.select("movieId").distinct()
for movieId in distinct_movies.collect():
    print(movieId[0])    
#     tf_movies = tf.filter(tf["movieId"] == movieId[0])

#     tag_merge = tf_movies.join(user_tag_pref, on = "tag", how = "left")
#     tag_merge.fillna({"tag_pref": 0})
#     tag_merge.withColumn("tag_value", col("tag_vec") * col("tag_pref"))
    
#     tag_vec_val = math.sqrt(tag_merge.agg(F.sum("tag_vec")).collect()[0][0])
#     print("Movie id {} tag_vec {}".format(movieId[0], tag_vec_val))
                       
#     tag_vec_val = np.sqrt(np.sum(np.square(tag_merge['tag_vec']), axis=0))
#     tag_pref_val = np.sqrt(np.sum(np.square(user_tag_pref_all['tag_pref']), axis=0))
        
#     tag_merge_final = tag_merge.groupby(['user','movieId'])[['tag_value']]\
#                                    .sum()\
#                                    .rename(columns = {'tag_value': 'rating'})\
#                                    .reset_index()
        
#     tag_merge_final['rating']=tag_merge_final['rating']/(tag_vec_val*tag_pref_val)
        
#     tag_merge_all = tag_merge_all.append(tag_merge_final, ignore_index=True)

471
1088
1580
1645
1959
2122
3175
6466
6620
7833
8638
540
858
1025
1084
1721
2387
3475
4190
6773
48780
69481
31
516
1270
1303
1650
2393
2572
3000
4489
80906
82459
1975
3098
1223
1977
2249
3566
5682
52712
101142
108932
152711
588
898
970
2247
3028
4477
5527
5995
6552
6832
6890
7584
296
1466
3306
4396
4612
8950
513
918
2396
6378
9018
53127
62434
68954
673
3087
4259
4725
58559
104875
187593
593
597
976
1212
1653
2797
6157
6188
6732
7132
35015
148626
950
3844
7218
126548
34
1068
1198
1344
1537
3152
4263
6639
8011
8464
8961
101
1183
1500
148881
830
1259
2662
4896
5254
6953
7649
28
1201
1441
1496
2313
2463
4326
8827
155288
497
596
1243
1274
1296
2355
2583
2905
6667
7147
8622
27741
90769
300
412
1030
6820
8809
32584
136864
587
1276
1307
6288
7088
71535
914
1372
2467
2871
3435
5505
6863
107406
26
1235
2023
2335
3211
3270
3811
5875
6936
6948
27020
30793
38886
1269
2750
3072
33154
40491
135536
908
916
1135
1350
3481
3528
111743
1148
1210
1280
1363
1955
2076
2804
7023
112552
329
1398
3258
3456
35

References
--------------

* [Content Based Recommender System in Python](https://medium.com/@tomar.ankur287/content-based-recommender-system-in-python-2e8e94b16b9e)

* [Data Science Series: Content-based Recommender System using Azure Databricks](https://visualbi.com/blogs/business-intelligence/data-science/data-science-series-content-based-recommender-system-using-azure-databricks/)