In [1]:
from pyspark.sql import *
import pyspark.sql.functions as F

In [2]:
# Spark application name and master URL, consumed by the SparkSession builder.
appName, master = "training", "local"

In [3]:
# Build the session from the configured application name and master URL;
# getOrCreate() hands back an already-active session when one exists.
spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

### Read the processed train, validation, and testing splits

In [4]:
# Load the three processed splits, in order: train, validation, testing.
train, validation, testing = (
    spark.read.parquet(split_path)
    for split_path in (
        "data/processed/train.parquet/",
        "data/processed/validation.parquet/",
        "data/processed/testing.parquet/",
    )
)

In [5]:
# Expose each split to Spark SQL under a view named after the split.
for view_name, frame in (
    ("train", train),
    ("validation", validation),
    ("testing", testing),
):
    frame.createOrReplaceTempView(view_name)

In [6]:
# One row per distinct user_id in the "train" temp view.
distinct_users_query = "SELECT DISTINCT user_id FROM train"
userids = spark.sql(distinct_users_query)

In [7]:
# Peek at the first five distinct user ids.
userids.show(n=5)

+-------+
|user_id|
+-------+
| 457311|
| 555859|
| 748190|
|  36583|
| 796717|
+-------+
only showing top 5 rows



### Training: fit and save the ALS model

In [8]:
from pyspark.ml.recommendation import ALS, ALSModel

In [9]:
# Regularization parameter passed to ALS(regParam=...) in the next cell; it is
# also embedded in the directory name the estimator and model are saved under.
regParam = 0.1

In [10]:
# ALS estimator: column mapping first, then hyper-parameters.
# coldStartStrategy="drop" drops rows whose prediction would be NaN
# (users/items unseen during fitting); seed is fixed for repeatable fits.
als = ALS(
    userCol="user_id",
    itemCol="book_id",
    ratingCol="rating",
    rank=10,
    maxIter=5,
    regParam=regParam,
    coldStartStrategy="drop",
    seed=42,
)

In [11]:
# Fit the ALS estimator on the training DataFrame. `model` is the fitted
# result; later cells read its rank and persist it to disk.
model = als.fit(train)

In [12]:
# Base directory for this run's saved artifacts, named after the fitted
# model's rank and the regularization parameter
# (rank 10 with regParam 0.1 gives "models/ALS_10_0.1").
TEMP_PATH = "models/ALS_{}_{}".format(model.rank, regParam)

In [13]:
# Destination for the ALS estimator (`als`), inside the run directory.
ALS_PATH = TEMP_PATH + "/als"

In [14]:
# Persist the ALS estimator to ALS_PATH.
# `als.save(path)` is shorthand for `als.write().save(path)` and raises when
# the path already exists, which breaks re-running the notebook. Overwrite
# explicitly so "Restart & Run All" stays repeatable.
als.write().overwrite().save(ALS_PATH)

In [15]:
# Destination for the fitted model (`model`), inside the run directory.
MODEL_PATH = TEMP_PATH + "/als_model"

In [16]:
# Persist the fitted ALS model to MODEL_PATH.
# `model.save(path)` raises when the path already exists, so a second run of
# the notebook would fail here. Overwrite explicitly to keep re-runs working.
model.write().overwrite().save(MODEL_PATH)