In [1]:
import findspark
findspark.init()
findspark.find()

'E:\\apps\\spark-3.3.0-bin-hadoop3'

In [2]:
from pyspark.sql.functions import col, explode
from pyspark.sql import SparkSession

Projek ini bertujuan untuk mempelajari konsep penggunaan salah satu aplikasi pengelola Big Data. Aplikasi yang dipilih adalah PySpark. Dalam projek ini, use-case yang dipilih adalah menerapkan konsep sistem rekomendasi. Sistem rekomendasi merupakan sistem yang digunakan untuk memprediksi jenis produk yang kemungkinan disukai oleh pengguna. Produk-produk tersebutlah yang akan direkomendasikan ke pengguna.

Sebagai contoh, Youtube yang mampu memberikan rekomendasi video kepada pengguna berdasarkan video yang pernah ditonton sebelumnya dan Spotify yang mampu memberikan rekomendasi lagu berdasarkan genre ataupun jenis lagu yang didengarkan pengguna.

## Memulai sesi Spark baru

In [3]:
spark = SparkSession.builder.appName('Recommendation').getOrCreate()

## Load Data .csv into spark

In [4]:
movies = spark.read.csv("file:///E:/tugas_koko/Semester_7/Big_Data_and_Analytics/Tugas/Tugas_UTS/als-recommender-pyspark/movies.csv",header=True)
ratings = spark.read.csv("file:///E:/tugas_koko/Semester_7/Big_Data_and_Analytics/Tugas/Tugas_UTS/als-recommender-pyspark/ratings.csv",header=True)

## Eksplorasi Data

#### Data Movies memiliki 3 kolom, yaitu movieID, title, dan Genres

In [5]:
movies.show(10)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
+-------+--------------------+--------------------+
only showing top 10 rows



#### Jumlah baris yang terdapat pada dataset movies

In [6]:
movies.count()

9742

#### Nama tiap Kolom pada Dataset Movies

In [11]:
movies.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



#### Dataset ratings memiliki 4 kolom, yaitu userId, movieId, rating, timestamp

In [12]:
ratings.show(10)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
+------+-------+------+---------+
only showing top 10 rows



#### Jumlah baris yang terdapat pada Dataset ratings

In [13]:
ratings.count()

100836

#### Nama tiap Kolom pada Dataset ratings

In [14]:
ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



#### Menghilangkan kolom timestamp. Kolom ini didrop karena tidak terlalu digunakan dalam sistem rekomendasi

In [15]:
ratings = ratings.\
    withColumn('userId', col('userId').cast('integer')).\
    withColumn('movieId', col('movieId').cast('integer')).\
    withColumn('rating', col('rating').cast('float')).\
    drop('timestamp')

ratings.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



#### Mengalkulasikan banyaknya kemunculan data berdasarkan movieId

In [16]:
movieId_ratings = ratings.groupBy("movieId").count().orderBy('count', ascending=False)
movieId_ratings.show()

+-------+-----+
|movieId|count|
+-------+-----+
|    356|  329|
|    318|  317|
|    296|  307|
|    593|  279|
|   2571|  278|
|    260|  251|
|    480|  238|
|    110|  237|
|    589|  224|
|    527|  220|
|   2959|  218|
|      1|  215|
|   1196|  211|
|   2858|  204|
|     50|  204|
|     47|  203|
|    780|  202|
|    150|  201|
|   1198|  200|
|   4993|  198|
+-------+-----+
only showing top 20 rows



#### Mengalkulasikan banyaknya kemunculan data berdasarkan userId

In [17]:
userId_ratings = ratings.groupBy("userId").count().orderBy('count', ascending=False)
userId_ratings.show()

+------+-----+
|userId|count|
+------+-----+
|   414| 2698|
|   599| 2478|
|   474| 2108|
|   448| 1864|
|   274| 1346|
|   610| 1302|
|    68| 1260|
|   380| 1218|
|   606| 1115|
|   288| 1055|
|   249| 1046|
|   387| 1027|
|   182|  977|
|   307|  975|
|   603|  943|
|   298|  939|
|   177|  904|
|   318|  879|
|   232|  862|
|   480|  836|
+------+-----+
only showing top 20 rows



## Mengalkulasikan sparsity pada data

Sparsity adalah kondisi ketika seorang user hanya memiliki informasi yang sedikit sehingga dapat menyebabkan sistem rekomendasi menjadi tidak terlalu akurat. Sparse data terjadi ketika value yang terdapat pada dataset banyak bernilai 0

In [18]:
# fungsi untuk mengalkulasikan sparsity:  (1 - (Totalnonzero/total_elements))*100

def calc_sparsity(ratings):
    # menghitung jumlah data pada dataset rating
    nonzero=ratings.select("rating").count()
    
    # mengihtung total elemen pada userId dan movieId
    num_user=ratings.select("userId").distinct().count()
    num_movies=ratings.select("movieId").distinct().count()
    total_element=num_user*num_movies
    
    #menghitung sparsity pada dataset
    sparsity=(1.0-(nonzero)/total_element)*100
    print("Dataframe rating memiliki persentase sparsity sebesar","%.2f" %sparsity)

In [19]:
calc_sparsity(ratings)

Dataframe rating memiliki persentase sparsity sebesar 98.30


## Membuat model ALS

#### Mengimport library

In [20]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

#### Memisahkan data untuk training dan testing

In [21]:
# Membuat data untuk train dan test 
(train, test) = ratings.randomSplit([0.8, 0.2], seed = 1234)

#### Membuat model ALS

In [22]:
# Membuat model ALS
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", nonnegative = True, implicitPrefs = False, coldStartStrategy="drop")

#### Menambahkan Hyperparameter dan Mendenisikan Evaluator

In [23]:
# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()
            #             .addGrid(als.maxIter, [5, 50, 100, 200]) \

           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") #rmse dipilih karena 
print("Jumlah model yang diuji: ", len(param_grid))

Jumlah model yang diuji:  16


#### Membuat Cross Validation Pipeline

In [24]:
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

#### Menentukan Model Terbaik

In [25]:
#Fitting Cross validator ke dataset training
model = cv.fit(train)

#Mengekstraksi Model Terbaik
best_model = model.bestModel

In [26]:
# Print best_model
print(type(best_model))

# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# # Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())

# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 50
  MaxIter: 10
  RegParam: 0.15


In [27]:
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

0.8690354472727013


In [28]:
test_predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   580|   1580|   4.0| 3.5146918|
|   580|  44022|   3.5| 3.3116875|
|   597|    471|   2.0|  4.139512|
|   108|   1959|   5.0| 3.7737808|
|   368|   2122|   2.0| 2.0152385|
|   436|    471|   3.0| 3.5678365|
|   587|   1580|   4.0| 3.8583906|
|    27|   1580|   3.0|  3.360027|
|   606|   1580|   2.5|  3.162763|
|   606|  44022|   4.0| 2.8372734|
|    91|   2122|   4.0| 2.3410552|
|   157|   3175|   2.0| 3.5833976|
|   232|   1580|   3.5|  3.381064|
|   232|  44022|   3.0| 3.1442473|
|   246|   1645|   4.0| 3.7344043|
|   599|   2366|   3.0| 2.9078147|
|   111|   1088|   3.0| 3.3048038|
|   111|   3175|   3.5|  3.147515|
|    47|   1580|   1.5| 2.7201254|
|   140|   1580|   3.0| 3.3239927|
+------+-------+------+----------+
only showing top 20 rows



#### Membuat Rekomendasi Film berdasarkan Model Terbaik

In [29]:
nrecommendations = best_model.recommendForAllUsers(15) #method rekomendasiForAllUsers itu fungsinya gimana 
nrecommendations.limit(10).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{3379, 5.7612557...|
|     3|[{5746, 4.858056}...|
|     5|[{3379, 4.555614}...|
|     6|[{3925, 4.827505}...|
|    12|[{45503, 5.696589...|
|    13|[{3379, 5.04909},...|
|    15|[{60943, 4.460198...|
|    16|[{3379, 4.6488113...|
|    19|[{3379, 3.9579597...|
|    20|[{3379, 4.9395165...|
+------+--------------------+



In [30]:
nrecommendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{3379, 5.7612557...|
|     3|[{5746, 4.858056}...|
|     5|[{3379, 4.555614}...|
|     6|[{3925, 4.827505}...|
|    12|[{45503, 5.696589...|
|    13|[{3379, 5.04909},...|
|    15|[{60943, 4.460198...|
|    16|[{3379, 4.6488113...|
|    19|[{3379, 3.9579597...|
|    20|[{3379, 4.9395165...|
|    22|[{68536, 3.876325...|
|    26|[{3379, 4.5175786...|
|    27|[{1939, 4.68301},...|
|    28|[{60943, 4.270738...|
|    31|[{33649, 5.225558...|
|    34|[{33649, 4.707149...|
|    37|[{3379, 5.352682}...|
|    40|[{3379, 5.3208714...|
|    41|[{116897, 4.32023...|
|    43|[{33649, 5.640554...|
+------+--------------------+
only showing top 20 rows



In [31]:
nrecommendations = nrecommendations.withColumn("rec_exp", explode("recommendations"))
nrecommendations.show()

+------+--------------------+-------------------+
|userId|     recommendations|            rec_exp|
+------+--------------------+-------------------+
|     1|[{3379, 5.7612557...|  {3379, 5.7612557}|
|     1|[{3379, 5.7612557...| {33649, 5.6169276}|
|     1|[{3379, 5.7612557...|   {5490, 5.517828}|
|     1|[{3379, 5.7612557...| {171495, 5.425299}|
|     1|[{3379, 5.7612557...|  {3951, 5.4084063}|
|     1|[{3379, 5.7612557...|  {5328, 5.4084063}|
|     1|[{3379, 5.7612557...|  {5416, 5.4084063}|
|     1|[{3379, 5.7612557...| {78836, 5.3639174}|
|     1|[{3379, 5.7612557...|    {8477, 5.34965}|
|     1|[{3379, 5.7612557...|   {5915, 5.335429}|
|     1|[{3379, 5.7612557...|   {6460, 5.334323}|
|     1|[{3379, 5.7612557...|{179135, 5.3292713}|
|     1|[{3379, 5.7612557...| {84273, 5.3292713}|
|     1|[{3379, 5.7612557...|  {7071, 5.3292713}|
|     1|[{3379, 5.7612557...| {26073, 5.3292713}|
|     3|[{5746, 4.858056}...|   {5746, 4.858056}|
|     3|[{5746, 4.858056}...|   {6835, 4.858056}|


In [32]:
nrecommendations = nrecommendations\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))

nrecommendations.limit(10).show()

+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|     1|   3379|5.7612557|
|     1|  33649|5.6169276|
|     1|   5490| 5.517828|
|     1| 171495| 5.425299|
|     1|   3951|5.4084063|
|     1|   5328|5.4084063|
|     1|   5416|5.4084063|
|     1|  78836|5.3639174|
|     1|   8477|  5.34965|
|     1|   5915| 5.335429|
+------+-------+---------+



In [33]:
nrecommendations.join(movies, on='movieId').filter('userId = 100').show()

+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|              genres|
+-------+------+---------+--------------------+--------------------+
|  67618|   100| 5.099819|Strictly Sexual (...|Comedy|Drama|Romance|
|  33649|   100| 5.093817|  Saving Face (2004)|Comedy|Drama|Romance|
|   3379|   100| 5.056979| On the Beach (1959)|               Drama|
|  42730|   100| 4.949055|   Glory Road (2006)|               Drama|
|  74282|   100|4.9183598|Anne of Green Gab...|Children|Drama|Ro...|
|  86237|   100|4.9070096|  Connections (1978)|         Documentary|
|   7071|   100|4.9070096|Woman Under the I...|               Drama|
|   4708|   100|4.9070096|   Marat/Sade (1966)|       Drama|Musical|
| 179135|   100|4.9070096|Blue Planet II (2...|         Documentary|
|  26073|   100|4.9070096|Human Condition I...|           Drama|War|
|  74226|   100|4.9070096|Dream of Light (a...|   Documentary|Drama|
|  84273|   100|4.9070096|Zeitgeis

In [34]:
ratings.join(movies, on='movieId').filter('userId = 100').sort('rating', ascending=False).limit(10).show()

+-------+------+------+--------------------+--------------------+
|movieId|userId|rating|               title|              genres|
+-------+------+------+--------------------+--------------------+
|   1101|   100|   5.0|      Top Gun (1986)|      Action|Romance|
|   1958|   100|   5.0|Terms of Endearme...|        Comedy|Drama|
|   2423|   100|   5.0|Christmas Vacatio...|              Comedy|
|   4041|   100|   5.0|Officer and a Gen...|       Drama|Romance|
|   5620|   100|   5.0|Sweet Home Alabam...|      Comedy|Romance|
|    368|   100|   4.5|     Maverick (1994)|Adventure|Comedy|...|
|    934|   100|   4.5|Father of the Bri...|              Comedy|
|    539|   100|   4.5|Sleepless in Seat...|Comedy|Drama|Romance|
|     16|   100|   4.5|       Casino (1995)|         Crime|Drama|
|    553|   100|   4.5|    Tombstone (1993)|Action|Drama|Western|
+-------+------+------+--------------------+--------------------+

