<a href="https://colab.research.google.com/github/dinhhungGM/RecommendationSystemUsingBigData/blob/main/Recommendation_System_BIGDATA_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import findspark
findspark.init()

In [8]:
from pyspark.sql.functions import col, explode
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# sc = SparkContext
# sc.setCheckpointDir('checkpoint')
spark = SparkSession.builder.appName('Group 7 - Recommendation System') \
.config('spark.sql.execution.arrow.pyspark.enabled', True)\
.config('spark.driver.memory','8G')\
.config('spark.ui.showConsoleProgress', True)\
.config('spark.sql.repl.eagerEval.enabled', True)\
.getOrCreate()

In [9]:
# Data is downloaded from https://www.kaggle.com/bandikarthik/movie-recommendation-system
movies = spark.read.csv('../MovieLens/movie.csv', header=True, inferSchema=True)
ratings = spark.read.csv('../MovieLens/rating.csv',  header=True, inferSchema=True)

                                                                                

In [4]:
movies.limit(5).show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+



In [4]:
ratings.limit(5).show()

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+



In [6]:
print(ratings.agg({"rating": "max"}).collect()[0])
print(ratings.agg({"rating": "min"}).collect()[0])

                                                                                

Row(max(rating)=5.0)




Row(min(rating)=0.5)


                                                                                

# Implementing ALS(Alternating Least Square) algorithm in Spark

In [15]:
# Create test and train set
(train, test) = ratings.randomSplit([0.8, 0.2], seed = 1234)

# Create ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", nonnegative = True, implicitPrefs = False
          , coldStartStrategy="drop")


In [12]:
# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [100]) \
            .addGrid(als.regParam, [.15]) \
            .build()
            #             .addGrid(als.maxIter, [5, 50, 100, 200]) \
           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  1


In [16]:
# Build cross validation using CrossValidator
# numFolds=3 means the CrossValidator will create 3 different models.
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

In [17]:
# We fit the cross validator to the 'train' dataset
model = cv.fit(train)

# We Extract best model from the cv model above
best_model = model.bestModel

21/07/16 22:20:52 WARN CacheManager: Asked to cache already cached data.
21/07/16 22:20:52 WARN CacheManager: Asked to cache already cached data.
21/07/16 22:21:14 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/07/16 22:21:14 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [19]:
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)
print(f"Root mean square error: {rmse}")
print("====BEST MODEL ====")
print(f"BEST RANK: {best_model.rank}")
print(f"maxIter: {best_model._java_obj.parent().getMaxIter()}")
print(f"regParam: {best_model._java_obj.parent().getRegParam()}")



Root mean square error: 0.8143051599489648
====BEST MODEL ====
BEST RANK: 10
maxIter: 10
regParam: 0.1


                                                                                

In [20]:
predictions.agg({"prediction": "max"}).collect()[0]

                                                                                

Row(max(prediction)=6.4292802810668945)

# Movie Recommendation

In [21]:
# Generate n Recommendations for all users
recommendations = best_model.recommendForAllUsers(10)
recommendations.limit(10).show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[{120821, 6.22960...|
|   463|[{3226, 6.3365936...|
|   471|[{3226, 5.771446}...|
|   496|[{121029, 6.44937...|
|   833|[{3226, 6.089091}...|
|  1088|[{3226, 5.434558}...|
|  1238|[{3226, 5.8392224...|
|  1342|[{121029, 6.59056...|
|  1580|[{120821, 5.34024...|
|  1591|[{3226, 6.2007923...|
+------+--------------------+



                                                                                

### 7th User’s Actual Preference:

In [11]:
ratings.join(movies, on='movieId').filter('userId = 7') \
.sort('rating', ascending=False).limit(10)

                                                                                

movieId,userId,rating,timestamp,title,genres
912,7,5.0,2002-01-16 18:09:56,Casablanca (1942),Drama|Romance
3179,7,5.0,2002-01-16 19:22:51,Angela's Ashes (1...,Drama
1077,7,5.0,2002-01-16 18:48:18,Sleeper (1973),Comedy|Sci-Fi
750,7,5.0,2002-01-16 18:44:19,Dr. Strangelove o...,Comedy|War
1196,7,5.0,2002-01-16 18:09:32,Star Wars: Episod...,Action|Adventure|...
587,7,5.0,2002-01-16 19:10:20,Ghost (1990),Comedy|Drama|Fant...
1210,7,5.0,2002-01-16 18:10:54,Star Wars: Episod...,Action|Adventure|...
1721,7,5.0,2002-01-16 19:06:05,Titanic (1997),Drama|Romance
2942,7,5.0,2002-01-16 18:38:41,Flashdance (1983),Drama|Romance
2028,7,5.0,2002-01-16 18:24:41,Saving Private Ry...,Action|Drama|War


### 7th User’s ALS Recommentions

In [23]:
recommendations = recommendations.withColumn("rec_exp", explode("recommendations")).select('userId', 
col("rec_exp.movieId"), col("rec_exp.rating"))
recommendations.join(movies, on='movieId').filter('userId = 7').show()

                                                                                

+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|              genres|
+-------+------+---------+--------------------+--------------------+
|   3226|     7| 5.637633|Hellhounds on My ...|         Documentary|
| 121029|     7| 5.573067|No Distance Left ...|         Documentary|
| 120821|     7| 5.295107|The War at Home (...|     Documentary|War|
| 129536|     7|5.0036817|Code Name Coq Rou...|  (no genres listed)|
| 114070|     7|4.9300246|Good Job:  Storie...|         Documentary|
| 128366|     7|4.8328657|Patton Oswalt: Tr...|              Comedy|
| 117907|     7| 4.705026|My Brother Tom (2...|               Drama|
| 129451|     7| 4.669075|    Ingenious (2009)|Comedy|Drama|Romance|
| 112473|     7|4.6646147|Stuart: A Life Ba...|               Drama|
| 129243|     7| 4.609404|Afstiros katallil...|              Comedy|
+-------+------+---------+--------------------+--------------------+

