Import libraries

In [17]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import StructType, IntegerType, FloatType, StringType
from pyspark.mllib.recommendation import Rating, ALS
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

Create SparkSession

In [2]:
# Build the notebook's Spark session, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName('Book Ratings')
    .getOrCreate()
)
spark

23/11/16 21:08:14 WARN Utils: Your hostname, Kyles-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.170.1.205 instead (on interface en0)
23/11/16 21:08:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/16 21:08:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Import Dataset into Spark DF

In [3]:
# Get the SparkContext underlying the session; it is used further down
# to broadcast the userID lookup dictionary
sc = spark.sparkContext

In [4]:
# Explicit schema for the ratings CSV; every column is declared nullable
schema = (
    StructType()
    .add('bookID', IntegerType(), nullable=True)
    .add('userID', StringType(), nullable=True)
    .add('rating', FloatType(), nullable=True)
    .add('timestamp', IntegerType(), nullable=True)
)

In [5]:
# Read the ratings CSV into a PySpark DataFrame using the explicit schema
books = spark.read.load('../data/Books.csv', format='csv', schema=schema)

In [6]:
# Preview the first ten rows as a list of Row objects
books.take(10)

[Row(bookID=1713353, userID='A1C6M8LCIX4M6M', rating=5.0, timestamp=1123804800),
 Row(bookID=1713353, userID='A1REUF3A1YCPHM', rating=5.0, timestamp=1112140800),
 Row(bookID=1713353, userID='A1YRBRK2XM5D5', rating=5.0, timestamp=1081036800),
 Row(bookID=1713353, userID='A1V8ZR5P78P4ZU', rating=5.0, timestamp=1077321600),
 Row(bookID=1713353, userID='A2ZB06582NXCIV', rating=5.0, timestamp=1475452800),
 Row(bookID=1713353, userID='ACPQVNRD3Z09X', rating=5.0, timestamp=1469750400),
 Row(bookID=1713353, userID='AVP0HXC9FG790', rating=5.0, timestamp=1466380800),
 Row(bookID=1713353, userID='A32MQTLQQN44WW', rating=5.0, timestamp=1461456000),
 Row(bookID=1713353, userID='A13CHIJPFCEP2M', rating=5.0, timestamp=1455408000),
 Row(bookID=1713353, userID='A324TTUBKTN73A', rating=5.0, timestamp=1453593600)]

In [7]:
# Verify data types: list each column name with the type Spark reports for it,
# to compare against the schema declared above
books.dtypes

[('bookID', 'int'),
 ('userID', 'string'),
 ('rating', 'float'),
 ('timestamp', 'int')]

The data is returned in the format `(bookID, userID, rating, timestamp)`. 

To parse into a `PySpark` `Rating` object, it is expected to be in the format `(int(userID), int(bookID), float(rating))`; the timestamp is not part of a `Rating`.

### Convert Alphanumeric UserID to Int
For use in the collaborative filtering algorithm, the productID and userID must be of type `int`. 

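To convert the alphanumeric UserIDs to integers, one option is `StringIndexer` (left commented out below); this notebook instead builds the mapping on the RDD with `zipWithUniqueId`.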

In [8]:
# NOTE(review): the StringIndexer-based conversion is disabled; the
# userID -> int mapping is instead built on the RDD with zipWithUniqueId
# in a later cell. Confirm whether this cell can be removed.
# indexer = StringIndexer(inputCol='userID', outputCol='userID_int').fit(books)
# result = indexer.transform(books)

### Convert DF to RDD for Collaborative Filtering 

In [8]:
# Drop down to the RDD API, turning every Row into a plain Python list
books_rdd = books.rdd.map(lambda row: list(row))
books_rdd.first()

[1713353, 'A1C6M8LCIX4M6M', 5.0, 1123804800]

In [9]:
# Project each record onto its userID (position 1 of the list)
user_ids = books_rdd.map(lambda record: record[1])
user_ids.first()

'A1C6M8LCIX4M6M'

In [10]:
# Get unique user IDs.
# Derived straight from books_rdd (position 1 holds the userID) rather than by
# reassigning user_ids from itself, so re-running this cell always starts from
# the same input.
user_ids = books_rdd.map(lambda record: record[1]).distinct()
user_ids.first()




CodeCache: size=131072Kb used=15991Kb max_used=16003Kb free=115080Kb
 bounds [0x00000001069e0000, 0x00000001079a0000, 0x000000010e9e0000]
 total_blobs=6896 nmethods=5971 adapters=838
 compilation: disabled (not enough contiguous free space left)


                                                                                

'A32MQTLQQN44WW'

In [11]:
# Pair every distinct userID with an integer from zipWithUniqueId and
# collect the (userID, int) pairs into a Python dict
user_ids_mapped = (
    user_ids
    .zipWithUniqueId()
    .collectAsMap()
)

                                                                                

In [12]:
# Broadcast the userID -> int dictionary to all nodes for efficient lookup;
# the mapping step below reads it through `broadcasted_dict.value`
broadcasted_dict = sc.broadcast(user_ids_mapped)

In [14]:
# Transform each record into an Mllib Rating object
def _to_rating(row):
    """Build a Rating(user, product, rating) from one [bookID, userID, rating, timestamp] list."""
    user_int = broadcasted_dict.value.get(row[1])  # None when the userID has no mapping
    return Rating(user_int, row[0], row[2])


ratings = books_rdd.map(_to_rating)

In [15]:
ratings.first()

                                                                                

Rating(user=7, product=1713353, rating=5.0)

The data has now been formatted appropriately for the collaborative filtering algorithm:

rows of `Rating(int(user), int(product), float(rating))`

## Build Recommendation Model using ALS

In [None]:
# Model settings: these are the rank and iteration-count arguments of the
# (currently disabled) ALS.train call below
rank, numIterations = 10, 10

# Training is switched off for now:
# model = ALS.train(ratings, rank, numIterations)

23/11/16 22:57:50 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE