In [37]:
# importing all the libraries 
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions  import *
from pyspark.sql.types import *


In [38]:

# define the configurations 
conf = SparkConf().setMaster("local[*]").setAppName("Books")
conf.set("spark.executor.memory", "6G")
conf.set("spark.driver.memory", "2G")
conf.set("spark.executor.cores", "4")
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.default.parallelism", "4")


<pyspark.conf.SparkConf at 0x7f16b7352340>

creating a Spark Session and not a Spark Context

In [39]:


spark = SparkSession.builder.config(conf = conf).appName("spark session").getOrCreate()

Loading Data in dataFrame

In [40]:
books = spark.read.option("delimiter",";").option("header", "true").csv('gs://hdeshpa-final-storage/BX-Books.csv')
ratings = spark.read.option("delimiter",";").option("header", "true").csv('gs://hdeshpa-final-storage/BX-Book-Ratings.csv')
user = spark.read.option("delimiter",";").option("header", "true").csv('gs://hdeshpa-final-storage/BX-Users.csv')
books.show()

+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari

In [41]:

ratings.show()

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276725|034545104X|          0|
| 276726|0155061224|          5|
| 276727|0446520802|          0|
| 276729|052165615X|          3|
| 276729|0521795028|          6|
| 276733|2080674722|          0|
| 276736|3257224281|          8|
| 276737|0600570967|          6|
| 276744|038550120X|          7|
| 276745| 342310538|         10|
| 276746|0425115801|          0|
| 276746|0449006522|          0|
| 276746|0553561618|          0|
| 276746|055356451X|          0|
| 276746|0786013990|          0|
| 276746|0786014512|          0|
| 276747|0060517794|          9|
| 276747|0451192001|          0|
| 276747|0609801279|          0|
| 276747|0671537458|          9|
+-------+----------+-----------+
only showing top 20 rows



In [42]:

user.show()

+-------+--------------------+----+
|User-ID|            Location| Age|
+-------+--------------------+----+
|      1|  nyc, new york, usa|NULL|
|      2|stockton, califor...|  18|
|      3|moscow, yukon ter...|NULL|
|      4|porto, v.n.gaia, ...|  17|
|      5|farnborough, hant...|NULL|
|      6|santa monica, cal...|  61|
|      7| washington, dc, usa|NULL|
|      8|timmins, ontario,...|NULL|
|      9|germantown, tenne...|NULL|
|     10|albacete, wiscons...|  26|
|     11|melbourne, victor...|  14|
|     12|fort bragg, calif...|NULL|
|     13|barcelona, barcel...|  26|
|     14|mediapolis, iowa,...|NULL|
|     15|calgary, alberta,...|NULL|
|     16|albuquerque, new ...|NULL|
|     17|chesapeake, virgi...|NULL|
|     18|rio de janeiro, r...|  25|
|     19|           weston, ,|  14|
|     20|langhorne, pennsy...|  19|
+-------+--------------------+----+
only showing top 20 rows



Spark SQL

In [43]:
books.select('Publisher').distinct().count()

                                                                                

16807

Changing the data type to integer for columns in rating Dataframe

In [44]:
rating = ratings.withColumn("User-ID",
                                        ratings['User-ID'].cast(IntegerType())).\
                                        withColumn("ISBN", ratings['ISBN'].cast(IntegerType())).\
                                        withColumn("Book-Rating",ratings['Book-Rating'].cast(IntegerType())).\
                                        na.drop()


In [45]:
rating.show()

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276726| 155061224|          5|
| 276727| 446520802|          0|
| 276729| 521795028|          6|
| 276733|2080674722|          0|
| 276737| 600570967|          6|
| 276745| 342310538|         10|
| 276746| 425115801|          0|
| 276746| 449006522|          0|
| 276746| 553561618|          0|
| 276746| 786013990|          0|
| 276746| 786014512|          0|
| 276747|  60517794|          9|
| 276747| 451192001|          0|
| 276747| 609801279|          0|
| 276747| 671537458|          9|
| 276747| 679776818|          8|
| 276747| 943066433|          7|
| 276747|1570231028|          0|
| 276747|1885408226|          7|
| 276748| 747558167|          6|
+-------+----------+-----------+
only showing top 20 rows



Fitting a recommendation Model

In [46]:
als = ALS(maxIter=10, regParam=0.01, userCol="User-ID", itemCol="ISBN", ratingCol="Book-Rating",coldStartStrategy="drop")
#fit the model 
model = als.fit(rating)


                                                                                

In [47]:
user_id = 276688

In [48]:
final = rating.filter(col('User-ID')==user_id)
books.join(final,final.ISBN==books.ISBN).\
                select(col('User-ID'),col('Book-Title'),col('Book-Author'),col('Year-Of-Publication'),col('Book-Rating')).\
                show()

                                                                                

+-------+--------------------+------------------+-------------------+-----------+
|User-ID|          Book-Title|       Book-Author|Year-Of-Publication|Book-Rating|
+-------+--------------------+------------------+-------------------+-----------+
| 276688|          Sick Puppy|      Carl Hiaasen|               2001|          0|
| 276688|   Midwives: A Novel|   Chris Bohjalian|               1998|          0|
| 276688|Billy Straight: A...|Jonathan Kellerman|               1999|          0|
| 276688|Molly Ivins Can't...|       Molly Ivins|               1992|          0|
| 276688|Stalker: A Novel ...|    Faye Kellerman|               2000|          8|
| 276688|Midnight in the G...|      John Berendt|               1999|          0|
| 276688|In Pursuit of the...|  Elizabeth George|               2000|          6|
| 276688|          Paper Doll|  Robert B. Parker|               1996|          0|
| 276688|             The Web|Jonathan Kellerman|               1996|          0|
| 276688|     Th

In [50]:

# converting this into a dataframe 
df = sc.parallelize([[user_id]]).toDF(['User-ID'])
num_rec = 10
recommend = model.recommendForUserSubset(df , num_rec)
recommend.collect()
# Just taking only ISBN
recommend_ISBN = [recommend.collect()[0]['recommendations'][x]['ISBN'] for x in range(0,num_rec)]
recommend_ISBN

[312989423,
 696004550,
 670880434,
 883510022,
 809428504,
 195010434,
 141309377,
 231129505,
 764504959,
 671867679]

Showing the Recommendations by joining with Book df to get the Book details from recomended ISBN's

In [52]:

recom_df = spark.createDataFrame(recommend_ISBN, IntegerType())
print('Top ',num_rec,' book recommendations for User-ID ',user_id, ' are:')
books.join(recom_df,recom_df.value==books.ISBN).select(col('Book-Title'),col('Book-Author'),col('Year-Of-Publication')).show()


Top  10  book recommendations for User-ID  276688  are:


                                                                                

+--------------------+--------------------+-------------------+
|          Book-Title|         Book-Author|Year-Of-Publication|
+--------------------+--------------------+-------------------+
|Hollywood Tough :...|  Stephen J. Cannell|               2003|
|Do You Remember t...|Sally Hobart Alex...|               2000|
|Barbecuing the We...|      Carol D. Brent|               1980|
|Poultry (The Good...|   Time-Life Editors|               1979|
|Marcelino Pan Y Vino|  J.M. Sanchez-Silva|               1940|
|Tales of Mystery ...|     Edgar Allan Poe|               2000|
|Autobiography of ...|         Sayo Masuda|               2003|
|The iMac for Dummies|         David Pogue|               1998|
|Betty Crocker's N...|       Betty Crocker|               1993|
+--------------------+--------------------+-------------------+

