# ALS Recommender

Referenced ALS recommender created by Kevin Liao:
https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-2-alternating-least-square-als-matrix-4a76c58714a1

In [61]:
import re
import csv
import time
import gc
import math
import numpy as np

import findspark
# Find Spark Locally
location = findspark.find()
findspark.init(location, edit_rc=True)

import pyspark as ps    # for the pyspark suite
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, StringType, FloatType, DateType, TimestampType
import pyspark.sql.functions as F

spark = ps.sql.SparkSession.builder \
            .master("local[4]") \
            .appName("anime recommender") \
            .getOrCreate()

sc = spark.sparkContext

from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, lower
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

### Load data and initializing ALS model
---

In [62]:
path_anime = '../data/anime.csv'
path_ratings = '../data/rating.csv'

In [63]:
# anime_raw = sc.textFile(anime_filename)
# ratings_raw = sc.textFile(ratings_filename)
anime_raw = spark.read.load(path_anime,format='csv',header=True,inferSchema=True)
ratings_raw = spark.read.load(path_ratings,format='csv',header=True,inferSchema=True)

In [64]:
anime_df = anime_raw.select(['anime_id','name'])

In [65]:
ratings_df = ratings_raw.select(['user_id','anime_id','rating'])

In [66]:
# Define ALS model
model = ALS(
    userCol = 'user_id',
    itemCol = 'anime_id',
    ratingCol = 'rating',
    coldStartStrategy = 'drop'
)

In [67]:
# Split data for cross validation
train,val,test = ratings_df.randomSplit((0.6,0.2,0.2))

### Tuning Model

In [74]:
maxIter = 10
ranks = np.arange(5, 11, 1).tolist()
regParams = np.arange(.1,0.6,0.1).tolist()

In [75]:
tune_ALS(train,val,maxIter,regParams,ranks)

5 latent factors and regularization = 0.1: validation RMSE is 2.0776982112905453
5 latent factors and regularization = 0.2: validation RMSE is 2.0749591117108217
5 latent factors and regularization = 0.30000000000000004: validation RMSE is 2.0961722198129173
5 latent factors and regularization = 0.4: validation RMSE is 2.1367131855578982
5 latent factors and regularization = 0.5: validation RMSE is 2.1887355460098923
6 latent factors and regularization = 0.1: validation RMSE is 2.0785174797064765
6 latent factors and regularization = 0.2: validation RMSE is 2.073323400418736
6 latent factors and regularization = 0.30000000000000004: validation RMSE is 2.095198273667546
6 latent factors and regularization = 0.4: validation RMSE is 2.137606408015988
6 latent factors and regularization = 0.5: validation RMSE is 2.1891873357248675
7 latent factors and regularization = 0.1: validation RMSE is 2.0705172637251166
7 latent factors and regularization = 0.2: validation RMSE is 2.0623287078832218

ALSModel: uid=ALS_74b2afdffe87, rank=10

In [18]:
predictions = best_model.transform(test)

In [19]:
predictions = predictions.na.drop()
evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print('The out-of-sample RMSE of the best tuned model is:', rmse)

The out-of-sample RMSE of the best tuned model is: 2.055884801114346


Set model parameters:

In [9]:
max_iter = 10
reg = 0.05
rank = 10

In [10]:
model = ALS(userCol='user_id', itemCol='anime_id', rank=rank, maxIter=max_iter, regParam=reg)

### Prepare Inference Data

In [26]:
uid = ratings_df.agg({"user_id":"max"}).collect()[0][0]+1

In [27]:
uid

73517

In [12]:
fav_anime = "Naruto"

In [13]:
anime_df

DataFrame[anime_id: int, name: string]

In [14]:
# Match anime title w/ spark df query
matches = anime_df \
            .filter(
            lower(
                col('name')
            ).like('%{}%'.format(fav_anime.lower()))
        ) \
            .select('anime_id', 'name')

In [15]:
matches.collect()

[Row(anime_id=28755, name='Boruto: Naruto the Movie'),
 Row(anime_id=1735, name='Naruto: Shippuuden'),
 Row(anime_id=16870, name='The Last: Naruto the Movie'),
 Row(anime_id=13667, name='Naruto: Shippuuden Movie 6 - Road to Ninja'),
 Row(anime_id=20, name='Naruto'),
 Row(anime_id=32365, name='Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi'),
 Row(anime_id=10589, name='Naruto: Shippuuden Movie 5 - Blood Prison'),
 Row(anime_id=10075, name='Naruto x UT'),
 Row(anime_id=8246, name='Naruto: Shippuuden Movie 4 - The Lost Tower'),
 Row(anime_id=6325, name='Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono'),
 Row(anime_id=2472, name='Naruto: Shippuuden Movie 1'),
 Row(anime_id=4437, name='Naruto: Shippuuden Movie 2 - Kizuna'),
 Row(anime_id=4134, name='Naruto Shippuuden: Shippuu! &quot;Konoha Gakuen&quot; Den'),
 Row(anime_id=10686, name='Naruto: Honoo no Chuunin Shiken! Naruto vs. Konohamaru!!'),
 Row(anime_id=12979, name='Naruto SD: Rock Lee no Seishun Full-Power Ninden'),
 Ro

In [16]:
matches[0]

Column<b'anime_id'>

In [17]:
# Grab all anime ids in wildcard matches query
ids = matches.rdd.map(lambda r: r[0]).collect()

In [18]:
ids

[28755,
 1735,
 16870,
 13667,
 20,
 32365,
 10589,
 10075,
 8246,
 6325,
 2472,
 4437,
 4134,
 10686,
 12979,
 19511,
 442,
 10659,
 936,
 2248,
 2144,
 7367,
 1074,
 594,
 761]

In [19]:
matched_titles = matches.rdd.map(lambda r: r[1]).collect()

In [20]:
matched_titles

['Boruto: Naruto the Movie',
 'Naruto: Shippuuden',
 'The Last: Naruto the Movie',
 'Naruto: Shippuuden Movie 6 - Road to Ninja',
 'Naruto',
 'Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi',
 'Naruto: Shippuuden Movie 5 - Blood Prison',
 'Naruto x UT',
 'Naruto: Shippuuden Movie 4 - The Lost Tower',
 'Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono',
 'Naruto: Shippuuden Movie 1',
 'Naruto: Shippuuden Movie 2 - Kizuna',
 'Naruto Shippuuden: Shippuu! &quot;Konoha Gakuen&quot; Den',
 'Naruto: Honoo no Chuunin Shiken! Naruto vs. Konohamaru!!',
 'Naruto SD: Rock Lee no Seishun Full-Power Ninden',
 'Naruto Shippuuden: Sunny Side Battle',
 'Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!',
 'Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!',
 'Naruto Movie 2: Dai Gekitotsu! Maboroshi no Chiteiiseki Dattebayo!',
 'Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo! Special: Konoha Annual Sports Festival',
 'Naruto Movie 3: D

In [22]:
# Grab all anime ids not matched by input

other_ids = anime_df \
            .filter(~col('anime_id').isin(ids)) \
            .select(['anime_id']) \
            .rdd.map(lambda r: r[0]) \
            .collect()

In [23]:
len(other_ids)

12269

In [28]:
# Create inference data with anime data not matched
inference_raw = sc.parallelize(
            [(uid, anime_id) for anime_id in other_ids]
        ).map(
            lambda x: Row(
                user_id=int(x[0]),
                anime_id=int(x[1]),
            )
        )

In [29]:
inference_raw.collect()

[Row(user_id=73517, anime_id=32281),
 Row(user_id=73517, anime_id=5114),
 Row(user_id=73517, anime_id=28977),
 Row(user_id=73517, anime_id=9253),
 Row(user_id=73517, anime_id=9969),
 Row(user_id=73517, anime_id=32935),
 Row(user_id=73517, anime_id=11061),
 Row(user_id=73517, anime_id=820),
 Row(user_id=73517, anime_id=15335),
 Row(user_id=73517, anime_id=15417),
 Row(user_id=73517, anime_id=4181),
 Row(user_id=73517, anime_id=28851),
 Row(user_id=73517, anime_id=918),
 Row(user_id=73517, anime_id=2904),
 Row(user_id=73517, anime_id=28891),
 Row(user_id=73517, anime_id=199),
 Row(user_id=73517, anime_id=23273),
 Row(user_id=73517, anime_id=24701),
 Row(user_id=73517, anime_id=12355),
 Row(user_id=73517, anime_id=1575),
 Row(user_id=73517, anime_id=263),
 Row(user_id=73517, anime_id=44),
 Row(user_id=73517, anime_id=1),
 Row(user_id=73517, anime_id=30276),
 Row(user_id=73517, anime_id=164),
 Row(user_id=73517, anime_id=7311),
 Row(user_id=73517, anime_id=17074),
 Row(user_id=73517, anime

In [31]:
# Transform to spark df
inference_df = spark.createDataFrame(inference_raw) \
            .select(['user_id', 'anime_id'])

In [32]:
inference_df.collect()

[Row(user_id=73517, anime_id=32281),
 Row(user_id=73517, anime_id=5114),
 Row(user_id=73517, anime_id=28977),
 Row(user_id=73517, anime_id=9253),
 Row(user_id=73517, anime_id=9969),
 Row(user_id=73517, anime_id=32935),
 Row(user_id=73517, anime_id=11061),
 Row(user_id=73517, anime_id=820),
 Row(user_id=73517, anime_id=15335),
 Row(user_id=73517, anime_id=15417),
 Row(user_id=73517, anime_id=4181),
 Row(user_id=73517, anime_id=28851),
 Row(user_id=73517, anime_id=918),
 Row(user_id=73517, anime_id=2904),
 Row(user_id=73517, anime_id=28891),
 Row(user_id=73517, anime_id=199),
 Row(user_id=73517, anime_id=23273),
 Row(user_id=73517, anime_id=24701),
 Row(user_id=73517, anime_id=12355),
 Row(user_id=73517, anime_id=1575),
 Row(user_id=73517, anime_id=263),
 Row(user_id=73517, anime_id=44),
 Row(user_id=73517, anime_id=1),
 Row(user_id=73517, anime_id=30276),
 Row(user_id=73517, anime_id=164),
 Row(user_id=73517, anime_id=7311),
 Row(user_id=73517, anime_id=17074),
 Row(user_id=73517, anime

---

## Create new user profile with favorite anime

In [35]:
uid

73517

In [37]:
user_df = sc.parallelize([(uid,anime_id,10.0) for anime_id in ids])

In [38]:
user_df.collect()

[(73517, 28755, 10.0),
 (73517, 1735, 10.0),
 (73517, 16870, 10.0),
 (73517, 13667, 10.0),
 (73517, 20, 10.0),
 (73517, 32365, 10.0),
 (73517, 10589, 10.0),
 (73517, 10075, 10.0),
 (73517, 8246, 10.0),
 (73517, 6325, 10.0),
 (73517, 2472, 10.0),
 (73517, 4437, 10.0),
 (73517, 4134, 10.0),
 (73517, 10686, 10.0),
 (73517, 12979, 10.0),
 (73517, 19511, 10.0),
 (73517, 442, 10.0),
 (73517, 10659, 10.0),
 (73517, 936, 10.0),
 (73517, 2248, 10.0),
 (73517, 2144, 10.0),
 (73517, 7367, 10.0),
 (73517, 1074, 10.0),
 (73517, 594, 10.0),
 (73517, 761, 10.0)]

In [39]:
user_rows = user_df.map(
lambda x: Row(
                user_id=int(x[0]),
                anime_id=int(x[1]),
                rating=float(x[2])
            )
        )

In [41]:
# create user rows to add modified ratings for new user to main ratings df
user_rows.collect()

[Row(user_id=73517, anime_id=28755, rating=10.0),
 Row(user_id=73517, anime_id=1735, rating=10.0),
 Row(user_id=73517, anime_id=16870, rating=10.0),
 Row(user_id=73517, anime_id=13667, rating=10.0),
 Row(user_id=73517, anime_id=20, rating=10.0),
 Row(user_id=73517, anime_id=32365, rating=10.0),
 Row(user_id=73517, anime_id=10589, rating=10.0),
 Row(user_id=73517, anime_id=10075, rating=10.0),
 Row(user_id=73517, anime_id=8246, rating=10.0),
 Row(user_id=73517, anime_id=6325, rating=10.0),
 Row(user_id=73517, anime_id=2472, rating=10.0),
 Row(user_id=73517, anime_id=4437, rating=10.0),
 Row(user_id=73517, anime_id=4134, rating=10.0),
 Row(user_id=73517, anime_id=10686, rating=10.0),
 Row(user_id=73517, anime_id=12979, rating=10.0),
 Row(user_id=73517, anime_id=19511, rating=10.0),
 Row(user_id=73517, anime_id=442, rating=10.0),
 Row(user_id=73517, anime_id=10659, rating=10.0),
 Row(user_id=73517, anime_id=936, rating=10.0),
 Row(user_id=73517, anime_id=2248, rating=10.0),
 Row(user_id=7

In [42]:
user_df = spark.createDataFrame(user_rows) \
            .select(ratings_df.columns)

In [44]:
user_df.collect()

[Row(user_id=73517, anime_id=28755, rating=10.0),
 Row(user_id=73517, anime_id=1735, rating=10.0),
 Row(user_id=73517, anime_id=16870, rating=10.0),
 Row(user_id=73517, anime_id=13667, rating=10.0),
 Row(user_id=73517, anime_id=20, rating=10.0),
 Row(user_id=73517, anime_id=32365, rating=10.0),
 Row(user_id=73517, anime_id=10589, rating=10.0),
 Row(user_id=73517, anime_id=10075, rating=10.0),
 Row(user_id=73517, anime_id=8246, rating=10.0),
 Row(user_id=73517, anime_id=6325, rating=10.0),
 Row(user_id=73517, anime_id=2472, rating=10.0),
 Row(user_id=73517, anime_id=4437, rating=10.0),
 Row(user_id=73517, anime_id=4134, rating=10.0),
 Row(user_id=73517, anime_id=10686, rating=10.0),
 Row(user_id=73517, anime_id=12979, rating=10.0),
 Row(user_id=73517, anime_id=19511, rating=10.0),
 Row(user_id=73517, anime_id=442, rating=10.0),
 Row(user_id=73517, anime_id=10659, rating=10.0),
 Row(user_id=73517, anime_id=936, rating=10.0),
 Row(user_id=73517, anime_id=2248, rating=10.0),
 Row(user_id=7

In [45]:
ratings_df = ratings_df.union(user_df)

### Make Inference

In [33]:
model

ALS_1500b1c87346

In [46]:
als_model = model.fit(ratings_df)

In [49]:
results = als_model.transform(inference_df).select(['anime_id','prediction']).na.drop()

In [51]:
results.collect()

[Row(anime_id=148, prediction=5.782068252563477),
 Row(anime_id=463, prediction=7.337718963623047),
 Row(anime_id=471, prediction=6.77683687210083),
 Row(anime_id=496, prediction=4.129523277282715),
 Row(anime_id=833, prediction=6.650075435638428),
 Row(anime_id=1088, prediction=6.281349182128906),
 Row(anime_id=1238, prediction=6.970877647399902),
 Row(anime_id=1342, prediction=2.870173454284668),
 Row(anime_id=1580, prediction=5.034300804138184),
 Row(anime_id=1591, prediction=5.940197467803955),
 Row(anime_id=1645, prediction=7.795317649841309),
 Row(anime_id=1829, prediction=7.0644850730896),
 Row(anime_id=1959, prediction=2.4522228240966797),
 Row(anime_id=2122, prediction=7.493022441864014),
 Row(anime_id=2142, prediction=7.880992889404297),
 Row(anime_id=2366, prediction=5.646252155303955),
 Row(anime_id=2659, prediction=9.700439453125),
 Row(anime_id=2866, prediction=6.196117401123047),
 Row(anime_id=3175, prediction=8.665765762329102),
 Row(anime_id=3918, prediction=4.96647882

In [52]:
ordered_results = results \
            .orderBy('prediction', ascending=False) \
            .rdd.map(lambda r: (r[0], r[1])) \
            .take(10)

In [54]:
ordered_results

[(8375, 21.55990982055664),
 (30066, 18.51128387451172),
 (4331, 17.3941593170166),
 (31919, 17.26032257080078),
 (2769, 16.39604949951172),
 (2913, 15.29642105102539),
 (3252, 15.038765907287598),
 (3345, 14.7606782913208),
 (2634, 14.714110374450684),
 (30845, 14.701107025146484)]

In [56]:
anime_ids = [r[0] for r in ordered_results]
anime_ids

[8375, 30066, 4331, 31919, 2769, 2913, 3252, 3345, 2634, 30845]

In [58]:
rating_pred = [r[1] for r in ordered_results]
rating_pred

[21.55990982055664,
 18.51128387451172,
 17.3941593170166,
 17.26032257080078,
 16.39604949951172,
 15.29642105102539,
 15.038765907287598,
 14.7606782913208,
 14.714110374450684,
 14.701107025146484]

In [59]:
anime_titles = anime_df \
            .filter(col('anime_id').isin(anime_ids)) \
            .select('name') \
            .rdd.map(lambda r: r[0]) \
            .collect()

In [60]:
anime_titles

['Doraemon: Doraemon Comes Back',
 'Mikan Enikki',
 'Aikatsu! Music Award: Minna de Shou wo MoracchaimaShow!',
 'PriPara Movie: Mi~nna no Akogare♪ Let&#039;s Go☆Prix Paris',
 'Daisougen no Chiisana Tenshi: Bush Baby',
 'Galactic Patrol Lensman',
 'Chogattai Majutsu Robot Ginguiser',
 'Super Express Mazinger 7',
 'L/R: Licensed by Royal Special',
 'Penguin no Mondai']

In [70]:
def tune_ALS(train_data, validation_data, maxIter, regParams, ranks):
    """
    grid search function to select the best model based on RMSE of
    validation data
    Parameters
    ----------
    model: spark ML model, ALS
    train_data: spark DF with columns ['userId', 'anime_id', 'rating']
    validation_data: spark DF with columns ['userId', 'anime_id', 'rating']
    maxIter: int, max number of learning iterations
    regParams: list of float, one dimension of hyper-param tuning grid
    ranks: list of float, one dimension of hyper-param tuning grid
    Return
    ------
    The best fitted ALS model with lowest RMSE score on validation data
    """
    # initial
    min_error = math.inf
    best_rank = -1
    best_regularization = 0
    best_model = None

    for rank in ranks:
        for reg in regParams:
            # get ALS model
            # als = model.setMaxIter(maxIter).setRank(rank).setRegParam(reg)
            als = ALS(userCol='user_id', itemCol='anime_id', rank=rank, maxIter=maxIter, regParam=reg)
            # train ALS model
            model = als.fit(train_data)
            # evaluate the model by computing the RMSE on the validation data
            predictions = model.transform(validation_data)
            # drop na predictions
            predictions = predictions.na.drop()
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            print('{} latent factors and regularization = {}: validation RMSE is {}'.format(rank, reg, rmse))
            if rmse < min_error:
                min_error = rmse
                best_rank = rank
                best_regularization = reg
                best_model = model

    print('\n The best model has {} latent factors and regularization = {}'.format(best_rank, best_regularization))
    return best_model

---