In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('recommender').getOrCreate()
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer,IndexToString

In [2]:
dfbooks=spark.read.csv(r"C:\Users\Sky\Videos\codes\book_recommend\Books.csv",inferSchema=True,header=True)
dfrate=spark.read.csv(r"C:\Users\Sky\Videos\codes\book_recommend\Ratings.csv",inferSchema=True,header=True)
dfusers=spark.read.csv(r"C:\Users\Sky\Videos\codes\book_recommend\Users.csv",inferSchema=True,header=True)

In [3]:
dfbooks.columns

['ISBN',
 'Book-Title',
 'Book-Author',
 'Year-Of-Publication',
 'Publisher',
 'Image-URL-S',
 'Image-URL-M',
 'Image-URL-L']

In [4]:
newbooks=dfbooks.select('ISBN','Book-Title')

In [5]:
newbooks.show(2)

+----------+-------------------+
|      ISBN|         Book-Title|
+----------+-------------------+
|0195153448|Classical Mythology|
|0002005018|       Clara Callan|
+----------+-------------------+
only showing top 2 rows



In [6]:
newbooks=newbooks.dropDuplicates()

In [7]:
newbooks.count()

271360

In [8]:
del(dfbooks)

In [9]:
dfrate.show(5)

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276725|034545104X|          0|
| 276726|0155061224|          5|
| 276727|0446520802|          0|
| 276729|052165615X|          3|
| 276729|0521795028|          6|
+-------+----------+-----------+
only showing top 5 rows



In [10]:
newer=dfrate.join(newbooks,on='ISBN')

In [11]:
newer.show(10)

+----------+-------+-----------+--------------------+
|      ISBN|User-ID|Book-Rating|          Book-Title|
+----------+-------+-----------+--------------------+
|0000913154| 171118|          8|The Way Things Wo...|
|0001046438|  23902|          9|                Liar|
|0001046934| 206300|          0|The Prime of Miss...|
|0001047213|  23902|          9|    The Fighting Man|
|0001047647| 244994|          0|  First Among Equals|
|0001048473|  23902|          0|Nothing Can Be Be...|
|0001053744| 189835|          5| Pearl and Sir Orfeo|
|0001372564|  16319|          0|       Which Colour?|
|0001382381|  26583|          0|Huck Scarry's Ste...|
|0001711253| 156534|          0|  The Big Honey Hunt|
+----------+-------+-----------+--------------------+
only showing top 10 rows



In [12]:
df=newer.select('User-ID','Book-Title','Book-Rating')

In [13]:
df.groupBy('Book-Title').count().orderBy('count',ascending=False).show()

+--------------------+-----+
|          Book-Title|count|
+--------------------+-----+
|         Wild Animus| 2502|
|The Lovely Bones:...| 1295|
|   The Da Vinci Code|  898|
|     A Painted House|  838|
|The Nanny Diaries...|  828|
|Bridget Jones's D...|  815|
|The Secret Life o...|  774|
|Divine Secrets of...|  740|
|The Red Tent (Bes...|  723|
| Angels &amp; Demons|  670|
|          Life of Pi|  664|
|Snow Falling on C...|  662|
|         The Summons|  655|
|        The Notebook|  650|
|       The Testament|  617|
|House of Sand and...|  588|
|Where the Heart I...|  585|
|   The Pelican Brief|  581|
|Harry Potter and ...|  575|
|The Girls' Guide ...|  573|
+--------------------+-----+
only showing top 20 rows



In [14]:
df.groupBy('Book-Title').count().orderBy('count',ascending=True).show()

+--------------------+-----+
|          Book-Title|count|
+--------------------+-----+
|  Better in the Dark|    1|
|     Place Last Seen|    1|
|A Tale of a Tub a...|    1|
|The I.R.A (Fontan...|    1|
|From the Bluest P...|    1|
|Exploring Marketi...|    1|
|Black Hawk: An Au...|    1|
|Leadership When t...|    1|
|Black Holes: The ...|    1|
|Forensic Anthropo...|    1|
|Artificial Worlds...|    1|
|Windows NT Admini...|    1|
|Old friends, new ...|    1|
|"The \I Can't Bel...|    1|
|Little Bunny's Ma...|    1|
|The Harpercollins...|    1|
|"Baby Natasha In ...|    1|
|Religions of the ...|    1|
|KJV Classic Refer...|    1|
|The Complete Idio...|    1|
+--------------------+-----+
only showing top 20 rows



In [15]:
df.show(4)

+-------+--------------------+-----------+
|User-ID|          Book-Title|Book-Rating|
+-------+--------------------+-----------+
| 171118|The Way Things Wo...|          8|
|  23902|                Liar|          9|
| 206300|The Prime of Miss...|          0|
|  23902|    The Fighting Man|          9|
+-------+--------------------+-----------+
only showing top 4 rows



In [16]:
#convert the book title to numbers
indexed=StringIndexer(inputCol='Book-Title',outputCol='indexed').fit(df)

In [17]:
newdf=indexed.transform(df)

In [18]:
newdf.show(2)

+-------+--------------------+-----------+--------+
|User-ID|          Book-Title|Book-Rating| indexed|
+-------+--------------------+-----------+--------+
| 171118|The Way Things Wo...|          8|225978.0|
|  23902|                Liar|          9| 11117.0|
+-------+--------------------+-----------+--------+
only showing top 2 rows



In [84]:
rec=ALS(userCol='User-ID',ratingCol='Book-Rating',itemCol='indexed')

In [85]:
model=rec.fit(newdf)

In [86]:
new=model.transform(newdf)

In [87]:
new.show(3)

+-------+--------------------+-----------+--------+-----------+
|User-ID|          Book-Title|Book-Rating| indexed| prediction|
+-------+--------------------+-----------+--------+-----------+
|  23902|                Liar|          9| 11117.0|   8.957099|
|  23902|"T.S. Eliot Readi...|          6|113773.0|  6.1472406|
| 206300|The Prime of Miss...|          0| 10562.0|-0.50765455|
+-------+--------------------+-----------+--------+-----------+
only showing top 3 rows



In [25]:
from pyspark.ml.evaluation import RegressionEvaluator

In [88]:
evaluator=RegressionEvaluator(metricName='rmse',predictionCol='prediction',labelCol='Book-Rating')

In [89]:
rmse=evaluator.evaluate(new)

In [90]:
print(rmse)

1.4338129909216222


In [39]:
unique_books=new.select('indexed').distinct()

In [40]:
unique_books.count()

205959

In [32]:
userid=23902

In [43]:
ans=new.filter(new['User-ID']!=userid).select('indexed').distinct()

In [44]:
ans.show(10)

+--------+
| indexed|
+--------+
|   305.0|
| 43284.0|
| 70352.0|
| 13607.0|
| 63392.0|
|160848.0|
| 10681.0|
| 25175.0|
| 35734.0|
| 22274.0|
+--------+
only showing top 10 rows



In [67]:
new_sol=ans.withColumn('User-ID',lit(userid))

In [66]:
new_sol.show()

+--------+------+
| indexed|userid|
+--------+------+
|   305.0| 23902|
| 43284.0| 23902|
| 70352.0| 23902|
| 13607.0| 23902|
| 63392.0| 23902|
|160848.0| 23902|
| 10681.0| 23902|
| 25175.0| 23902|
| 35734.0| 23902|
| 22274.0| 23902|
| 28134.0| 23902|
|   558.0| 23902|
| 28553.0| 23902|
|  7171.0| 23902|
| 62461.0| 23902|
| 23503.0| 23902|
| 40186.0| 23902|
| 24923.0| 23902|
| 39221.0| 23902|
| 21309.0| 23902|
+--------+------+
only showing top 20 rows



In [68]:
recommendation=model.transform(new_sol)

In [69]:
recommendation.show(5)

+--------+-------+------------+
| indexed|User-ID|  prediction|
+--------+-------+------------+
|   305.0|  23902|   7.1379986|
| 63392.0|  23902|         0.0|
|160848.0|  23902|    2.599407|
| 13607.0|  23902|-0.016468167|
| 43284.0|  23902|    3.588064|
+--------+-------+------------+
only showing top 5 rows



In [70]:
convert=IndexToString(inputCol='indexed',outputCol='title',labels=indexed.labels)

In [71]:
rec=convert.transform(recommendation)

In [74]:
rec.orderBy('prediction',ascending=False).show()

+--------+-------+----------+--------------------+
| indexed|User-ID|prediction|               title|
+--------+-------+----------+--------------------+
|109017.0|  23902| 18.522997|The children's pi...|
| 51102.0|  23902| 18.124266|A Promise to Reme...|
| 14308.0|  23902| 18.077991|       Deep Thoughts|
|106455.0|  23902| 16.476223|The Official Nora...|
| 45781.0|  23902| 16.366352|Poetry of Robert ...|
|  8710.0|  23902| 15.898792|              Catkin|
| 33772.0|  23902| 15.622518|         Fevre Dream|
| 38413.0|  23902| 15.570631|The Wounded Sky (...|
|  6606.0|  23902| 15.321466|Menopaws: The Sil...|
| 19604.0|  23902| 15.306931|            Unveiled|
| 20777.0|  23902| 15.235368|  If You Come Softly|
| 13765.0|  23902| 15.197332|           The Lorax|
| 38142.0|  23902| 15.150293|The Second Mark :...|
| 18922.0|  23902| 15.036689|Silent Spring: Ra...|
| 57990.0|  23902|15.0069065|I Dream a World: ...|
| 13101.0|  23902| 14.878612|  Foundation Trilogy|
| 42681.0|  23902| 14.844052|Ga