In [3]:
# Always needs to be done in Rasberry Pi
import findspark
findspark.init('/home/baxman/spark-2.4.7-bin-hadoop2.7')
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('reco').getOrCreate()

In [4]:
# Import recomender system from pyspark
from pyspark.ml.recommendation import ALS

In [5]:
# Evaluate
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
# Import data
data = spark.read.csv('/home/baxman/Codes/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Recommender_Systems/movielens_ratings.csv', inferSchema=True,header=True)

In [7]:
data.show()

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
|     23|   1.0|     0|
|     26|   3.0|     0|
|     27|   1.0|     0|
|     28|   1.0|     0|
|     29|   1.0|     0|
|     30|   1.0|     0|
|     31|   1.0|     0|
|     34|   1.0|     0|
|     37|   1.0|     0|
|     41|   2.0|     0|
+-------+------+------+
only showing top 20 rows



In [8]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [32]:
# Split into train/test
train,test = data.randomSplit([0.7,0.3])

In [33]:
# Set up model with following required data
als = ALS(maxIter = 5,regParam = 0.01,userCol='userId', itemCol='movieId', ratingCol='rating')

In [37]:
model = als.fit(train)

In [38]:
predictions = model.transform(test)

In [39]:
# Shwoing predictions
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    13|-0.67213863|
|     31|   1.0|    19|  2.1562886|
|     31|   3.0|    14|  2.0734408|
|     85|   1.0|     5|  1.2307732|
|     85|   1.0|    25|  2.6448324|
|     85|   3.0|    21|-0.13779822|
|     65|   1.0|    28|  -2.023874|
|     65|   2.0|     5|  0.7804751|
|     65|   5.0|    23|  1.6782148|
|     53|   3.0|    13|  3.5872848|
|     53|   3.0|    20|  1.0536995|
|     53|   2.0|    19|   3.728093|
|     53|   1.0|     7|  1.4678236|
|     53|   1.0|    25|   5.169369|
|     78|   1.0|    12|-0.19373994|
|     78|   1.0|     1| 0.84418267|
|     78|   1.0|    20|  1.0176991|
|     34|   1.0|    19| 0.83743197|
|     34|   1.0|    17|  1.7910146|
|     34|   3.0|    25|-0.67874223|
+-------+------+------+-----------+
only showing top 20 rows



In [40]:
# Formal evaluation
evaluator = RegressionEvaluator(metricName = 'rmse', labelCol = 'rating', predictionCol = 'prediction')

In [41]:
rmse = evaluator.evaluate(predictions)

In [42]:
print("RMSE is: ", rmse)

RMSE is:  2.1486263891773993


In [43]:
# Making this for a single user
single_user = test.filter(test['userId']==7).select(['movieId','userId'])

In [45]:
# All movies user_id = 7 watched
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      2|     7|
|      4|     7|
|     10|     7|
|     14|     7|
|     15|     7|
|     23|     7|
|     26|     7|
|     29|     7|
|     32|     7|
|     42|     7|
|     47|     7|
|     53|     7|
|     62|     7|
|     63|     7|
|     66|     7|
|     76|     7|
|     77|     7|
|     86|     7|
|     92|     7|
|     96|     7|
+-------+------+
only showing top 20 rows



In [46]:
# Predict what movie should they watch and what would they rate?
recommendations = model.transform(single_user)

In [49]:
recommendations.orderBy('prediction').show()

+-------+------+------------+
|movieId|userId|  prediction|
+-------+------+------------+
|     92|     7|  -0.8675993|
|     77|     7|-0.087118804|
|     76|     7|  0.22567229|
|     32|     7|  0.30996495|
|     26|     7|  0.34033164|
|     23|     7|  0.41439438|
|     15|     7|   0.7112619|
|     86|     7|  0.78329915|
|     10|     7|   1.4308174|
|     53|     7|   1.4678236|
|     96|     7|    1.855931|
|     97|     7|   2.0422788|
|      4|     7|   2.0465243|
|     66|     7|   2.3279462|
|      2|     7|    2.782993|
|     14|     7|    2.852174|
|     47|     7|   3.0507226|
|     42|     7|   3.0746427|
|     62|     7|    3.320275|
|     63|     7|   3.6766644|
+-------+------+------------+
only showing top 20 rows



In [None]:
# So movie_id 63 should be good!