Link Dataset : https://www.kaggle.com/datasets/ruchi798/bookcrossing-dataset

# Setup Environment

In [None]:
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("UAS")\
        .config('spark.ui.port', '5050')\
        .getOrCreate()

# Content Based Filtering

## Data Cleaning

In [None]:
from pyspark.sql.functions import concat, col, lit, lower, regexp_replace, monotonically_increasing_id
import re

books = spark.read.option("header", "true").option("delimiter",";").csv("BX_Books.csv")
books = books.drop("Image-URL-S", "Image-URL-M", "Image-URL-L")\
             .select("ISBN", concat(col("Book-Title"), lit(' '), col("Book-Author"), lit(' '), col("Publisher")).alias('sentences'))\
             .withColumn("sentences", lower("sentences"))\
             .withColumn("sentences", regexp_replace("sentences", "[^a-zA-Z\\s]", "  "))\
             .withColumn("sentences", regexp_replace("sentences", "\s+", " "))\
             .distinct()\
             .withColumn("id", monotonically_increasing_id())\
             .select("id","ISBN","sentences")\
             .na.drop()

## Stop Words (NLTK), Tokenizer, "Alphabet" Stop Remover (PySpark), TF-IDF, Normalizer

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Normalizer
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

nltk.download('stopwords')

def pipeline(dataframe):
    stop_words = stopwords.words('english')

    remove = dataframe.rdd.filter(lambda x: x[2] not in stop_words)
    stop = remove.toDF()

    tokenizer = Tokenizer(inputCol="sentences", outputCol="words")
    token = tokenizer.transform(stop)

    alphabet = list(string.ascii_lowercase)
    remover = StopWordsRemover(stopWords=alphabet, inputCol=tokenizer.getOutputCol(), outputCol="filtered")
    removed = remover.transform(token)

    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="tf")
    tf = hashingTF.transform(removed)

    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="feature").fit(tf)
    tfidf = idf.transform(tf)

    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="norm")
    data = normalizer.transform(tfidf)

    return data

[nltk_data] Downloading package stopwords to C:\Users\Ilham Mafani
[nltk_data]     Nadif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Fungsi Perhitungan Cosine Similarity

In [None]:
@F.udf(returnType=DoubleType())
def cos_sim(u, v):
    return float( u.dot(v) / (u.norm(2) * v.norm(2)) )

## Cari Buku Berdasarkan Keyword (Cosine Similarity)

In [None]:
def cari_kata(keyword, data):
    rdd = [(0, 0, keyword)]
    df = spark.createDataFrame(data=rdd, schema=["id","ISBN","sentences"])
    df =  df.withColumn("sentences", lower("sentences"))\
            .withColumn("sentences", regexp_replace("sentences", "[^a-zA-Z\\s]", "  "))\
            .withColumn("sentences", regexp_replace("sentences", "\s+", " "))

    dataframe = data.union(df).sort(col('id').asc())

    data = pipeline(dataframe)
    key = data.filter(col('id') == 0).limit(1)

    join = key.alias("x").join(data.alias("y"), col("y.id") > col("x.id"))
    combines = join.withColumn("cos_sim", cos_sim(col("x.feature"), col("y.feature"))).na.fill({"cos_sim": 0.0})
    sort = combines.select(col("y.id").alias("idA"), col("x.id").alias("idB"), col("cos_sim")).filter(col('cos_sim') != 0.0)

    merge_cosim = sort.alias("x").join(data.alias("y"), col("x.idA") == col("y.id")).select('sentences', 'cos_sim').sort(col("cos_sim").desc())

    rows = sort.count()
    print(f"Hasil pencarian sebanyak : {rows}")
    return merge_cosim.show(truncate=False)

In [None]:
cari_kata("king midas", books)

Hasil pencarian sebanyak : 107881
+----------------------------------------------------------------+------------------+
|sentences                                                       |cos_sim           |
+----------------------------------------------------------------+------------------+
|the dead zone stephen king signet book                          |0.9999999999999998|
|dead zone stephen king signet book                              |0.9966328527287398|
|the dead zone stephen king viking books                         |0.8049413742097962|
|dead zone stephen king lgf                                      |0.7868082232579019|
|the dead zone stephen king new amer library                     |0.7505685784376874|
|dead zone stephen king penguin putnam mass                      |0.7459572560018076|
|dead zone stephen king penguin putnam mass                      |0.7459572560018076|
|the dead zone stephen king sagebrush bound                      |0.7019140884475865|
|it r stephen king s

## Jaccard Distance

In [None]:
from pyspark.ml.feature import MinHashLSH, MinHashLSHModel
from pyspark.ml.linalg import Vector

def cari_kata_jac(keyword, data):
    rdd = [(0, 0, keyword)]
    df = spark.createDataFrame(data=rdd, schema=["id","ISBN","sentences"])
    df = df.withColumn("sentences", lower("sentences"))\
           .withColumn("sentences", regexp_replace("sentences", "[^a-zA-Z\\s]", "  "))\
           .withColumn("sentences", regexp_replace("sentences", "\s+", " "))

    dataframe = data.union(df).sort(col('id').asc())

    data = pipeline(dataframe)
    key = data.filter(col('id') == 0).limit(1)
    process = data.filter(col('id') != 0)

    mLSH = MinHashLSH(inputCol="norm", outputCol="hashes")
    model = mLSH.fit(data)

    distance = model.approxSimilarityJoin(key, process, 1.0, distCol="Jaccard_Distance").select(
            col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("Jaccard_Distance"))

    merge_jac = distance.alias("x").join(data.alias("y"), col("x.idB") == col("y.id")).select('sentences', 'Jaccard_Distance').sort(col("Jaccard_Distance").desc())

    rows = merge_jac.count()
    print(f"Hasil pencarian sebanyak : {rows}")
    return merge_jac.show(truncate=False)

## Cari Kata Berdasarkan Jaccard Distance

In [None]:
cari_kata_jac("test", books)

Hasil pencarian sebanyak : 569
+-----------------------------------------------------------------------------------------------------------------------+------------------+
|sentences                                                                                                              |Jaccard_Distance  |
+-----------------------------------------------------------------------------------------------------------------------+------------------+
|king midas with selected sentences in american sign language nathaniel hawthorne gallaudet university press            |0.95              |
|larry l king a writer s life in letters or reflections in a bloodshot eye larry l king texas christian university press|0.95              |
|a connecticut yankee in king arthur s court worlds best reading mark twain putnam pub group                            |0.95              |
|a connecticut yankee in king arthur s court oxford world s classics mark twain oxford university press                 |0.

# Collaborative Filtering (Model Based)

## Data Cleaning

In [None]:
from pyspark.sql.functions import col

ratings = spark.read.option("header", "true").option("delimiter",";").csv("BX-Book-Ratings.csv")
ratings = ratings.withColumnRenamed("Book-Rating","rating")\
                 .withColumnRenamed("User-ID","userId")\
                 .withColumn('userId', col('userId').cast('integer'))\
                 .withColumn('ISBN', col('ISBN').cast('integer'))\
                 .withColumn('rating', col('rating').cast('double'))

ratings = ratings.na.fill({"rating": 0.0}).na.drop()

## Splitting Data & Limit Data

In [None]:
train = ratings.limit(200000)
(training, test) = train.randomSplit([0.8, 0.2], seed = None)

## ALS Configuration & Tuning Cross Validator

In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

als = ALS(userCol="userId", itemCol="ISBN", ratingCol="rating", coldStartStrategy="drop", nonnegative = True, implicitPrefs = False)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

param_grid = ParamGridBuilder()\
                .addGrid(als.rank, [10, 50, 100, 150]) \
                .addGrid(als.regParam, [.01, .05, .1, .15]) \
                .addGrid(als.maxIter, [20]) \
                .build()

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator)

## Data Training

In [None]:
model = cv.fit(training)
best_model = model.bestModel

## Mencari Model Terbaik dari Tuning

In [None]:
print("Rank:", best_model._java_obj.parent().getRank())
print("MaxIter:", best_model._java_obj.parent().getMaxIter())
print("RegParam:", best_model._java_obj.parent().getRegParam())

Rank: 100
MaxIter: 20
RegParam: 0.15


## Menghitung Nilai Error (RMSE)

In [None]:
predictions = best_model.transform(test)
RMSE = evaluator.evaluate(predictions)
print("RMSE", RMSE)

RMSE 4.107358205351516


## Hasil Rekomendasi berdasarkan Users

In [None]:
user = best_model.recommendForAllUsers(10)
item = best_model.recommendForAllItems(10)

In [None]:
print("Item : ", item.count())
print("User : ", user.count())

Item :  81835
User :  16042


In [None]:
item.show(5)

+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ISBN |recommendations                                                                                                                                                                                     |
+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |[{24413, 11.008568}, {278465, 10.812599}, {52716, 10.115345}, {29205, 9.885187}, {49571, 9.725469}, {45465, 9.461636}, {10509, 9.354952}, {887, 9.23887}, {55413, 9.215725}, {15595, 8.97955}]      |
|2    |[{6107, 9.912249}, {6238, 8.901433}, {28451, 8.535229}, {50068, 8.480941}, {37227, 8.232894}, {14764, 8.217067}, {33933, 8.070509}, {29204, 7.8758807}, {28634, 7.843046}, {3

## Explode Data Rekomendasi

In [None]:
from pyspark.sql.functions import explode

users = user.withColumn("rec_exp", explode("recommendations"))\
            .select('userId', col("rec_exp.ISBN"), col("rec_exp.rating"))

items = item.withColumn("rec_exp", explode("recommendations"))\
            .select('ISBN', col("rec_exp.userId"), col("rec_exp.rating"))

## Mencari Rating Buku berdasarkan Average Rating Users

In [None]:
from pyspark.sql.functions import avg

rating_book = items.groupBy("ISBN").agg(avg("rating").alias("avg rating")).filter(col('avg rating') != 0.0)
rating_book.sort(col('avg rating').desc()).show(5)

+---------+------------------+
|     ISBN|        avg rating|
+---------+------------------+
|646471365| 17.88390235900879|
|689824580|16.826415157318117|
|552122866|15.983939743041992|
|394800168|15.235930347442627|
|310912520|15.203691387176514|
|440139791|15.203691387176514|
|451141725|14.906007385253906|
|552138223|14.143276023864747|
|840750803|13.856968784332276|
|451458125|13.738457202911377|
|440800129|13.683322429656982|
|553131834|13.683322429656982|
|395681863|13.683322429656982|
|553202790|13.683322429656982|
|804111774|13.575698471069336|
|451458923|13.571675395965576|
|440228204|13.543004703521728|
|440403782|13.458931255340577|
|310903335|13.452016735076905|
|887082769| 13.44827356338501|
+---------+------------------+
only showing top 20 rows



# Hybrid Filtering (Weighted)

## Mencari Buku berdasarkan Keyword, dan Rata-rata Rating Buku (ALS & Cosine Similarity)

In [None]:
def cari_kata_rating(keyword, data, rating):
    rdd = [(0, 0, keyword)]
    df = spark.createDataFrame(data=rdd, schema=["id","ISBN","sentences"])
    df =  df.withColumn("sentences", lower("sentences"))\
            .withColumn("sentences", regexp_replace("sentences", "[^a-zA-Z\\s]", "  "))\
            .withColumn("sentences", regexp_replace("sentences", "\s+", " "))

    dataframe = data.union(df).sort(col('id').asc())

    data = pipeline(dataframe)
    key = data.filter(col('id') == 0).limit(1)

    join = key.alias("x").join(data.alias("y"), col("y.id") > col("x.id"))
    combines = join.withColumn("cos_sim", cos_sim(col("x.feature"), col("y.feature"))).na.fill({"cos_sim": 0.0})
    sort = combines.select(col("y.id").alias("idA"), col("x.id").alias("idB"), col("cos_sim")).filter(col('cos_sim') != 0.0)

    merge_cosim = sort.alias("x").join(data.alias("y"), col("x.idA") == col("y.id")).select('ISBN', 'sentences', 'cos_sim')
    merge_rating = merge_cosim.alias("x").join(rating.alias("y"), col("x.ISBN") == col("y.ISBN"))
    rdd = merge_rating.rdd.map(lambda x: (x[0], x[1], x[2], x[4], (x[2]+(x[4]/20))/2))
    out = rdd.toDF(["ISBN", "Sentences", "Cos_Sim", "Avg_Rating", "Weighted"]).sort(col("Weighted").desc())

    rows = out.count()
    print(f"Hasil pencarian sebanyak : {rows}")
    return out.show(truncate=False)

In [None]:
cari_kata_rating("the dead zone stephen king signet book", books, rating_book)

Hasil pencarian sebanyak : 15343
+----------+-------------------------------------------------------------+-------------------+------------------+-------------------+
|ISBN      |Sentences                                                    |Cos_Sim            |Avg_Rating        |Weighted           |
+----------+-------------------------------------------------------------+-------------------+------------------+-------------------+
|0451126661|the dead zone stephen king signet book                       |1.0                |4.3194492816925045|0.6079862320423126 |
|0451159276|it r stephen king signet book                                |0.6127592427922383 |11.830597114562988|0.6021445492601938 |
|0451127927|dead zone stephen king penguin putnam mass                   |0.7458519229539675 |8.558069038391114 |0.5868776874367616 |
|0451169514|it stephen king signet book                                  |0.6127592427922383 |9.467936706542968 |0.5430780390596933 |
|0451160959|stand stephen kin

## Rekomendasi Buku berdasarkan User ID (ALS)

In [None]:
users.sort(col('rating').desc()).show(5)

+------+----------+---------+
|userId|      ISBN|   rating|
+------+----------+---------+
|278465| 646471365|20.757305|
| 34697|1858548268|20.141285|
| 40732| 440139791|20.078661|
| 40732| 310912520|20.078661|
| 19887| 440139791|19.816566|
| 19887| 310912520|19.816566|
| 32002| 307155781| 19.81422|
| 32002| 451165543| 19.81422|
| 32002| 375811370| 19.81422|
| 32002| 805015094| 19.81422|
| 32002| 316112321| 19.81422|
| 32002| 785341722| 19.81422|
| 32002| 740711660| 19.81422|
| 32002|1570640114| 19.81422|
| 32002| 307157881| 19.81422|
| 32002|1570640742| 19.81422|
| 24413| 646471365|19.474031|
|278465| 689824580|18.939238|
| 24413| 689824580|18.851898|
| 52716| 646471365|18.742237|
+------+----------+---------+
only showing top 20 rows



## Rekomendasi Buku berdasarkan Judul Buku dan Rekomendasi User (ALS & Cosine Similarity)

In [None]:
def rekomendasi_kunjungan(isbn, dataframe, rating, user):

    data = pipeline(dataframe)
    key = data.filter(col('ISBN') == isbn).limit(1)
    rating = rating.filter(col('userId') == user)

    join = key.alias("x").join(data.alias("y"), col("y.id") > col("x.id"))

    combines = join.withColumn("cos_sim", cos_sim(col("x.feature"), col("y.feature"))).na.fill({"cos_sim": 0.0})
    sort = combines.select(col("y.id").alias("idA"), col("x.id").alias("idB"), col("cos_sim")).filter(col('cos_sim') != 0.0).sort(col('cos_sim').desc())

    merge_cosim = sort.alias("x").join(data.alias("y"), col("x.idA") == col("y.id")).select('ISBN', 'sentences', 'cos_sim')

    merge_rating = merge_cosim.alias("x").join(rating.alias("y"), col("x.ISBN") == col("y.ISBN"))
    rdd = merge_rating.rdd.map(lambda x: (x[0], x[1], x[2], x[5], (x[2]+(x[5]/20))/2))
    out = rdd.toDF(["ISBN", "Sentences", "Cos_Sim", "Avg_Rating", "Weighted"]).sort(col("Weighted").desc())

    rows = key.rdd.collect()[0]['sentences']
    print(f"Judul buku yang sedang dikunjungi : {rows}")
    print(f"User yang mengunjungi : ", user)
    return out.sort(col('Weighted').desc()).show(truncate=False)

In [None]:
rekomendasi_kunjungan("1858548268", books, users, 34697)

Judul buku yang sedang dikunjungi : now i am big our new baby gillian davies autumn publishing ltd
User yang mengunjungi :  34697
+----------+-----------------------------------------------------------------+--------------------+-----------------+-------------------+
|ISBN      |Sentences                                                        |Cos_Sim             |Avg_Rating       |Weighted           |
+----------+-----------------------------------------------------------------+--------------------+-----------------+-------------------+
|0740704818|the blue day book bradley trevor greive andrews mcmeel publishing|0.022185946260666934|9.353647232055664|0.24493415393172507|
|0849908663|the body charles colson w publishing group                       |0.030341345916120675|8.646134376525879|0.23132403237120733|
+----------+-----------------------------------------------------------------+--------------------+-----------------+-------------------+



## Rekomendasi Buku Berdasarkan User ID (ALS)

In [None]:
def rekomendasi_user(user, dataframe, rating):
    rating = rating.filter(col('userId') == user)

    merge_cosim = rating.alias("x").join(dataframe.alias("y"), col("x.ISBN") == col("y.ISBN"))
    out =  merge_cosim.select('x.ISBN', 'rating', 'sentences').sort(col('rating'))

    print(f"User ID : ", user)
    return out.show(truncate=False)

In [None]:
rekomendasi_user(19887, books, users)

User ID :  19887
+---------+---------+-------------------------------------------------------------------------------------------------------+
|ISBN     |rating   |sentences                                                                                              |
+---------+---------+-------------------------------------------------------------------------------------------------------+
|375756817|15.853253|the adventures of tom sawyer modern library classics mark twain modern library                         |
|684835614|15.853253|the road less traveled and beyond spiritual growth in an age of anxiety m scott peck touchstone        |
|898151899|16.098257|white trash cooking jargon ernest matthew mickler ten speed press                                      |
|553202790|17.834908|dr atkins nutrition breakthrough how to robert c atkin bantam doubleday dell                           |
|553131834|17.834908|pearl john steinbeck bantam doubleday dell                                      