In [None]:
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("UAS")\
        .config('spark.ui.port', '5050')\
        .getOrCreate()

In [None]:
books = spark.read.option("header", "true").option("delimiter",";").csv("BX_Books.csv")

In [None]:
books = books.drop("Image-URL-S", "Image-URL-M", "Image-URL-L")

In [None]:
from pyspark.sql.functions import concat, col, lit
books = books.select("ISBN", concat(col("Book-Title"), lit(' '), col("Book-Author"), lit(' '), col("Publisher")).alias('sentences'))

In [None]:
books.show(3, truncate=False)

+----------+--------------------------------------------------------------+
|ISBN      |sentences                                                     |
+----------+--------------------------------------------------------------+
|0195153448|Classical Mythology Mark P. O. Morford Oxford University Press|
|0002005018|Clara Callan Richard Bruce Wright HarperFlamingo Canada       |
|0060973129|Decision in Normandy Carlo D'Este HarperPerennial             |
+----------+--------------------------------------------------------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import lower
books = books.withColumn("sentences", lower("sentences"))

In [None]:
books.show(3, truncate=False)

+----------+--------------------------------------------------------------+
|ISBN      |sentences                                                     |
+----------+--------------------------------------------------------------+
|0195153448|classical mythology mark p. o. morford oxford university press|
|0002005018|clara callan richard bruce wright harperflamingo canada       |
|0060973129|decision in normandy carlo d'este harperperennial             |
+----------+--------------------------------------------------------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import regexp_replace
import re

books = books.withColumn("sentences", regexp_replace("sentences", "[^a-zA-Z\\s]", "  "))\
             .withColumn("sentences", regexp_replace("sentences", "\s+", " "))\
             .distinct()

books.show(3, truncate=False)

+----------+--------------------------------------------------------------+
|ISBN      |sentences                                                     |
+----------+--------------------------------------------------------------+
|0451155750|the dead zone stephen king signet book                        |
|0375411992|blackwood farm the vampire chronicles anne rice alfred a knopf|
|081257060X|the return buzz aldrin tor books                              |
+----------+--------------------------------------------------------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import monotonically_increasing_id
books = books.withColumn("id", monotonically_increasing_id())
books = books.select(books.id, books.ISBN, books.sentences)

In [None]:
books.show(3,truncate=False)

+---+----------+--------------------------------------------------------------+
|id |ISBN      |sentences                                                     |
+---+----------+--------------------------------------------------------------+
|0  |0451155750|the dead zone stephen king signet book                        |
|1  |0375411992|blackwood farm the vampire chronicles anne rice alfred a knopf|
|2  |081257060X|the return buzz aldrin tor books                              |
+---+----------+--------------------------------------------------------------+
only showing top 3 rows



In [None]:
from pyspark.sql.functions import col,isnan, when, count
books = books.na.drop()
books.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in books.columns]).show()
books.count()

+---+----+---------+
| id|ISBN|sentences|
+---+----+---------+
|  0|   0|        0|
+---+----+---------+



271379

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Ilham Mafani
[nltk_data]     Nadif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
stop_words = stopwords.words('english')

In [None]:
remove = books.rdd.filter(lambda x: x[2] not in stop_words)
books = remove.toDF()

In [None]:
books.show(3, truncate=False)

+---+----------+--------------------------------------------------------------+
|id |ISBN      |sentences                                                     |
+---+----------+--------------------------------------------------------------+
|0  |0451155750|the dead zone stephen king signet book                        |
|1  |0375411992|blackwood farm the vampire chronicles anne rice alfred a knopf|
|2  |081257060X|the return buzz aldrin tor books                              |
+---+----------+--------------------------------------------------------------+
only showing top 3 rows



In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Normalizer
from pyspark.ml import Pipeline

In [None]:
tokenizer = Tokenizer(inputCol="sentences", outputCol="words")
tok = tokenizer.transform(books)
tok.select('words').show(3,truncate=False)

+-------------------------------------------------------------------------+
|words                                                                    |
+-------------------------------------------------------------------------+
|[the, dead, zone, stephen, king, signet, book]                           |
|[blackwood, farm, the, vampire, chronicles, anne, rice, alfred, a, knopf]|
|[the, return, buzz, aldrin, tor, books]                                  |
+-------------------------------------------------------------------------+
only showing top 3 rows



In [None]:
import string
alphabet = list(string.ascii_lowercase)
remover = StopWordsRemover(stopWords=alphabet, inputCol=tokenizer.getOutputCol(), outputCol="filtered")
removed = remover.transform(tok)
removed.select('filtered').show(3, truncate=False)

+----------------------------------------------------------------------+
|filtered                                                              |
+----------------------------------------------------------------------+
|[the, dead, zone, stephen, king, signet, book]                        |
|[blackwood, farm, the, vampire, chronicles, anne, rice, alfred, knopf]|
|[the, return, buzz, aldrin, tor, books]                               |
+----------------------------------------------------------------------+
only showing top 3 rows



In [None]:
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="tf")
tf = hashingTF.transform(removed)
tf.show(3)

+---+----------+--------------------+--------------------+--------------------+--------------------+
| id|      ISBN|           sentences|               words|            filtered|                  tf|
+---+----------+--------------------+--------------------+--------------------+--------------------+
|  0|0451155750|the dead zone ste...|[the, dead, zone,...|[the, dead, zone,...|(262144,[11275,19...|
|  1|0375411992|blackwood farm th...|[blackwood, farm,...|[blackwood, farm,...|(262144,[1857,300...|
|  2|081257060X|the return buzz a...|[the, return, buz...|[the, return, buz...|(262144,[33647,43...|
+---+----------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [None]:
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="feature").fit(tf)
tfidf = idf.transform(tf)
tfidf.show(3)

+---+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|      ISBN|           sentences|               words|            filtered|                  tf|             feature|
+---+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|0451155750|the dead zone ste...|[the, dead, zone,...|[the, dead, zone,...|(262144,[11275,19...|(262144,[11275,19...|
|  1|0375411992|blackwood farm th...|[blackwood, farm,...|[blackwood, farm,...|(262144,[1857,300...|(262144,[1857,300...|
|  2|081257060X|the return buzz a...|[the, return, buz...|[the, return, buz...|(262144,[33647,43...|(262144,[33647,43...|
+---+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [None]:
normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="norm")
data = normalizer.transform(tfidf)

data.show(3)

+---+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|      ISBN|           sentences|               words|            filtered|                  tf|             feature|                norm|
+---+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|0451155750|the dead zone ste...|[the, dead, zone,...|[the, dead, zone,...|(262144,[11275,19...|(262144,[11275,19...|(262144,[11275,19...|
|  1|0375411992|blackwood farm th...|[blackwood, farm,...|[blackwood, farm,...|(262144,[1857,300...|(262144,[1857,300...|(262144,[1857,300...|
|  2|081257060X|the return buzz a...|[the, return, buz...|[the, return, buz...|(262144,[33647,43...|(262144,[33647,43...|(262144,[33647,43...|
+---+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+

In [None]:
key = data.limit(1)

In [None]:
process = data.filter(col('id') != 0)

In [None]:
process.count()

271378

In [None]:
from pyspark.sql import functions as F
rep = key.withColumn("id", F.explode(F.array_repeat("id", process.count())))
rep.select('id','sentences').show(3,truncate=False)

+---+--------------------------------------+
|id |sentences                             |
+---+--------------------------------------+
|0  |the dead zone stephen king signet book|
|0  |the dead zone stephen king signet book|
|0  |the dead zone stephen king signet book|
+---+--------------------------------------+
only showing top 3 rows



In [None]:
from pyspark.ml.feature import MinHashLSH, MinHashLSHModel
from pyspark.ml.linalg import Vector

mLSH = MinHashLSH(inputCol=normalizer.getOutputCol(), outputCol="hashes")
model = mLSH.fit(process)

model.transform(process).head()

Row(id=1, ISBN='0375411992', sentences='blackwood farm the vampire chronicles anne rice alfred a knopf', words=['blackwood', 'farm', 'the', 'vampire', 'chronicles', 'anne', 'rice', 'alfred', 'a', 'knopf'], filtered=['blackwood', 'farm', 'the', 'vampire', 'chronicles', 'anne', 'rice', 'alfred', 'knopf'], tf=SparseVector(262144, {1857: 1.0, 30006: 1.0, 72357: 1.0, 83990: 1.0, 95030: 1.0, 95889: 1.0, 218042: 1.0, 231043: 1.0, 234835: 1.0}), feature=SparseVector(262144, {1857: 5.5965, 30006: 6.3932, 72357: 5.9488, 83990: 8.9849, 95030: 4.8205, 95889: 1.0311, 218042: 5.5528, 231043: 6.4428, 234835: 7.056}), norm=SparseVector(262144, {1857: 0.3059, 30006: 0.3494, 72357: 0.3251, 83990: 0.4911, 95030: 0.2635, 95889: 0.0564, 218042: 0.3035, 231043: 0.3521, 234835: 0.3857}), hashes=[DenseVector([319442083.0])])

In [None]:
distance = model.approxSimilarityJoin(key, process, 1.0, distCol="Jaccard Distance").select(
            col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("Jaccard Distance"))

In [None]:
distance.sort(col('Jaccard Distance').desc()).show()

+---+------+------------------+
|idA|   idB|  Jaccard Distance|
+---+------+------------------+
|  0|125634|0.9545454545454546|
|  0| 64064|0.9523809523809523|
|  0|226270|0.9444444444444444|
|  0|219715|0.9444444444444444|
|  0|188464|0.9444444444444444|
|  0|187994|0.9444444444444444|
|  0| 90318|0.9411764705882353|
|  0|173632|0.9411764705882353|
|  0|202655|            0.9375|
|  0|148175|            0.9375|
|  0| 31656|            0.9375|
|  0| 92360|            0.9375|
|  0|219484|0.9285714285714286|
|  0| 30105|0.9285714285714286|
|  0|203650|0.9285714285714286|
|  0|208340|0.9285714285714286|
|  0|166389|0.9285714285714286|
|  0|187808|0.9285714285714286|
|  0|155230|0.9285714285714286|
|  0|265657|0.9285714285714286|
+---+------+------------------+
only showing top 20 rows



In [None]:
from pyspark.sql.types import DoubleType

@F.udf(returnType=DoubleType())
def cos_sim(u, v):
  return float( u.dot(v) / (u.norm(2) * v.norm(2)) )

In [None]:
join = key.alias("x").join(data.alias("y"), col("y.id") > col("x.id"))

In [None]:
combines = join.withColumn("cos_sim", cos_sim(col("x.feature"), col("y.feature"))).na.fill({"cos_sim": 0.0})

In [None]:
sort = combines.select(col("y.id").alias("idA"), col("x.id").alias("idB"), col("cos_sim")).filter(col('cos_sim') != 0.0).sort(col("cos_sim").desc())

In [None]:
sort.show()

+------+---+------------------+
|   idA|idB|           cos_sim|
+------+---+------------------+
|119881|  0|               1.0|
|270877|  0| 0.996632976935182|
| 52090|  0|0.8049474444492108|
|235664|  0|0.7868147417652676|
|219899|  0|0.7505757547261467|
| 23618|  0|0.7459646083656141|
| 47012|  0|0.7459646083656141|
|121179|  0|0.7019219054606646|
| 45468|  0|0.6126555112364798|
|132058|  0|0.6126555112364798|
|205423|  0|  0.55727520935378|
| 54176|  0|  0.55727520935378|
| 53644|  0|0.5310930391551143|
|151064|  0|0.5310930391551143|
| 27647|  0|0.5310930391551143|
| 36442|  0|0.5260695531149168|
|166189|  0|0.5246712644404072|
| 76611|  0|0.5246712644404072|
|232381|  0|0.5242939644307556|
| 36373|  0|0.5207578221088831|
+------+---+------------------+
only showing top 20 rows



In [None]:
data.filter(col('id') == 0).select('sentences').show(truncate=False)

+--------------------------------------+
|sentences                             |
+--------------------------------------+
|the dead zone stephen king signet book|
+--------------------------------------+



In [None]:
merge = process.alias("x").join(distance.alias("y"), col("x.id")==col("y.idB"))

In [None]:
merge.select('sentences', 'Jaccard Distance').sort(col('Jaccard Distance').desc()).show(truncate=False)

+--------------------------------------------------------------------------------------------------------+------------------+
|sentences                                                                                               |Jaccard Distance  |
+--------------------------------------------------------------------------------------------------------+------------------+
|get lost becka level start to read trade edition ser shirley simon school zone publishing company       |0.9545454545454546|
|jog frog jog level start to read library edition series barbara gregorich school zone publishing company|0.9523809523809523|
|danger zone hardy boys case file danger zone hardy boys casefiles no franklin w dixon simon pulse       |0.9444444444444444|
|big third and fourth grade not applicable na school zone publishing company                             |0.9444444444444444|
|philo fortune s awesome journey to his comfort zone julian f thompson disney pr                         |0.9444444444

In [None]:
merge_cosim = process.alias("x").join(sort.alias("y"), col("x.id") == col("y.idA"))

In [None]:
data.filter(col('id') == 0).select('sentences').show(truncate=False)

+--------------------------------------+
|sentences                             |
+--------------------------------------+
|the dead zone stephen king signet book|
+--------------------------------------+



In [None]:
merges = merge_cosim.select('sentences', 'cos_sim').sort(col('cos_sim').desc())

In [None]:
merges.show(truncate=False)

+----------------------------------------------------------------+------------------+
|sentences                                                       |cos_sim           |
+----------------------------------------------------------------+------------------+
|the dead zone stephen king signet book                          |1.0               |
|dead zone stephen king signet book                              |0.996632976935182 |
|the dead zone stephen king viking books                         |0.8049474444492108|
|dead zone stephen king lgf                                      |0.7868147417652676|
|the dead zone stephen king new amer library                     |0.7505757547261467|
|dead zone stephen king penguin putnam mass                      |0.7459646083656141|
|dead zone stephen king penguin putnam mass                      |0.7459646083656141|
|the dead zone stephen king sagebrush bound                      |0.7019219054606646|
|it r stephen king signet book                        

In [None]:
merge_cos_jac = distance.alias("x").join(sort.alias("y"), col("x.idB")==col("y.idA"))

In [None]:
merge_cos_jac.show()

+---+------+------------------+------+---+-------------------+
|idA|   idB|  Jaccard Distance|   idA|idB|            cos_sim|
+---+------+------------------+------+---+-------------------+
|  0|270877|0.1428571428571429|270877|  0|  0.996632976935182|
|  0| 63793|0.9047619047619048| 63793|  0|0.18093264137941115|
|  0|127325|0.9285714285714286|127325|  0| 0.3116162047385759|
|  0| 36442|0.6666666666666667| 36442|  0| 0.5260695531149168|
|  0|173147|0.9130434782608696|173147|  0|0.15455880905738226|
|  0|202655|            0.9375|202655|  0|0.19991466493064075|
|  0|255732|0.8571428571428572|255732|  0|0.20411406059660073|
|  0|262025|               0.7|262025|  0| 0.3846901483586561|
|  0|111215|             0.875|111215|  0|0.31092064911223105|
|  0| 92360|            0.9375| 92360|  0|0.13879752516086455|
|  0| 29442|0.8571428571428572| 29442|  0|  0.187793092698951|
|  0| 56948|0.8571428571428572| 56948|  0|0.17842693376785196|
|  0|  6717|0.9130434782608696|  6717|  0|0.16829616456

In [None]:
rdd = merge_cos_jac.rdd.map(lambda x: (x[1], (x[5] + x[2])/2))

In [None]:
conv = rdd.toDF(["id", "distance"])

In [None]:
conv.sort(col('distance').desc()).show()

+------+------------------+
|    id|          distance|
+------+------------------+
| 23618| 0.672982304182807|
| 47012| 0.672982304182807|
|235664|0.6434073708826338|
|219899|0.6252878773630733|
|127325|0.6200938166550023|
|219484|0.6128757871932898|
|265657|0.6127463944620797|
| 30519|0.6097730881484292|
| 57094|0.5997803628466483|
|219715|0.5975543398668348|
| 36442|0.5963681098907918|
|137714| 0.593581418899608|
|111215|0.5929603245561155|
|187808|0.5879022850691846|
|148175|0.5867589396998891|
|166389|0.5848463349461377|
| 73756|0.5832734458194512|
|152155| 0.582572935662197|
| 50634|0.5799151655000772|
|  2056|0.5787925259826765|
+------+------------------+
only showing top 20 rows



In [None]:
merge = process.alias("x").join(conv.alias("y"), col("x.id")==col("y.id"))

In [None]:
merge.select('sentences', 'distance').sort(col('distance').desc()).count()

99