In [1]:
import re
import csv
import findspark
# Find Spark Locally
location = findspark.find()
findspark.init(location, edit_rc=True)

import pyspark as ps    # for the pyspark suite
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, StringType, FloatType, DateType, TimestampType
import pyspark.sql.functions as F

# Build (or reuse, via getOrCreate) a Spark session with master "local[4]".
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("anime recommender")
    .getOrCreate()
)

# SparkContext handle for the RDD-level operations used further down.
sc = spark.sparkContext

from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RegressionMetrics
import gc
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, lower
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

## Load data
---

In [2]:
# Relative locations of the two input CSV files.
anime_filename, ratings_filename = '../data/anime.csv', '../data/rating.csv'

In [3]:
# Read both CSV files as DataFrames: the first line supplies the column names
# and Spark infers the column types from the data.
anime_raw = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load(anime_filename)
)
ratings_raw = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load(ratings_filename)
)

In [4]:
# Keep only the identifier and title columns of the anime catalogue.
anime_df = anime_raw.select('anime_id', 'name')

In [5]:
# Keep the (user, item, rating) triple used by the recommender.
ratings_df = ratings_raw.select('user_id', 'anime_id', 'rating')

In [6]:
# Unfitted ALS estimator wired to the ratings columns. With the 'drop'
# cold-start strategy, rows whose prediction would be NaN are removed from
# the transform() output.
model = ALS(
    coldStartStrategy='drop',
    ratingCol='rating',
    itemCol='anime_id',
    userCol='user_id',
)

In [7]:
# Split the ratings into roughly 60% train / 20% validation / 20% test.
# The fixed seed keeps the partition stable across kernel restarts so the
# RMSE values reported below can be reproduced.
train, val, test = ratings_df.randomSplit([0.6, 0.2, 0.2], seed=42)

## Tuning ALS model
---

In [None]:
# Initial "best so far" bookkeeping for a manual hyper-parameter search.
min_error = float('inf')
best_rank, best_regularization = -1, 0
best_model = None

In [None]:
# Hyper-parameters for the single trial fit below.
maxIter, rank, reg = 10, 1, 1

In [None]:
# Apply the trial hyper-parameters to the estimator through chained setters.
als = (
    model
    .setMaxIter(maxIter)
    .setRank(rank)
    .setRegParam(reg)
)

In [None]:
# Fit on the training split.
# NOTE(review): this rebinds `model` from the estimator to the result of
# fit(); re-running the earlier `model.setMaxIter(...)` cell afterwards will
# presumably fail — consider a separate name for the fitted model.
model = als.fit(train)

In [None]:
# Score the validation split with the fitted model.
predictions = model.transform(val)

In [None]:
# RMSE evaluator comparing the 'prediction' column against the 'rating' label.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [None]:
# Evaluate the current `predictions` with the RMSE evaluator.
rmse = evaluator.evaluate(predictions)

In [None]:
# Peek at the first four prediction rows.
predictions.take(4)

In [None]:
# Display the most recently computed RMSE.
rmse

In [None]:
# Score the held-out test split (this overwrites the earlier `predictions`).
predictions = model.transform(test)

In [None]:
# Rebuild the RMSE evaluator and score the current `predictions`.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

### Running Methods / ALS Fitting

#### Methods

In [12]:
def tune_ALS(model,training_data, validation_data, maxIter, regParams, ranks):
    """Grid-search ALS hyper-parameters and return the best fitted model.

    Every (rank, regParam) combination is fitted on ``training_data`` and
    scored by RMSE on ``validation_data``; the fitted model with the lowest
    RMSE wins.

    Args:
        model: unfitted ALS estimator. Its maxIter/rank/regParam are set in
            place by this function for every grid point.
        training_data: DataFrame used for fitting.
        validation_data: DataFrame used for scoring each candidate.
        maxIter: iteration count applied to every candidate.
        regParams: iterable of regularization values to try.
        ranks: iterable of latent-factor counts to try.

    Returns:
        The fitted model with the lowest validation RMSE, or None when no
        candidate produced an RMSE below infinity (e.g. an empty grid).
    """
    # The evaluator does not depend on the grid point, so build it once.
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")

    min_error = float('inf')
    best_rank = -1
    best_regularization = 0
    best_model = None

    for rank in ranks:
        for reg in regParams:
            # configure the estimator for this grid point
            als = model.setMaxIter(maxIter).setRank(rank).setRegParam(reg)
            # Keep the fitted model under its own name: rebinding `model`
            # would replace the estimator and break the next iteration.
            fitted = als.fit(training_data)
            # evaluate the candidate by its RMSE on the validation data
            predictions = fitted.transform(validation_data)
            rmse = evaluator.evaluate(predictions)
            print('{} latent factors and regularization = {}: '
                  'validation RMSE is {}'.format(rank, reg, rmse))
            if rmse < min_error:
                min_error = rmse
                best_rank = rank
                best_regularization = reg
                best_model = fitted
    print('\nThe best model has {} latent factors and '
          'regularization = {}'.format(best_rank, best_regularization))
    return best_model

In [None]:
def tune_model(maxIter,regParams,ranks,split_ratio=(0.6,0.2,0.2),estimator=None,seed=None):
    """Split ``ratings_df``, grid-search ALS, and report the test RMSE.

    Args:
        maxIter: iteration count passed to every candidate fit.
        regParams: iterable of regularization values to try.
        ranks: iterable of latent-factor counts to try.
        split_ratio: train/validation/test weights for ``randomSplit``.
        estimator: optional unfitted ALS estimator to tune. When omitted a
            fresh one is built on the user_id/anime_id/rating columns, so the
            function does not depend on the notebook-level ``model`` name
            (which other cells rebind to a fitted model).
        seed: optional seed for the random split; None keeps the split
            non-deterministic.

    Returns:
        The best fitted model found by ``tune_ALS``.

    Raises:
        ValueError: if the grid search produced no model.
    """
    if estimator is None:
        estimator = ALS(
            userCol='user_id',
            itemCol='anime_id',
            ratingCol='rating',
            coldStartStrategy='drop'
        )

    train, val, test = ratings_df.randomSplit(list(split_ratio), seed=seed)
    # tune to get the best model; a distinct local name avoids shadowing the
    # estimator (the previous `model = tune_ALS(model, ...)` read an unbound local)
    best_model = tune_ALS(estimator, train, val, maxIter, regParams, ranks)
    if best_model is None:
        raise ValueError('grid search returned no model; check regParams and ranks')

    # test the best model on the held-out split
    predictions = best_model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print('The out-of-sample RMSE of the best tuned model is:', rmse)
    # clean up
    del train, val, test, predictions, evaluator
    gc.collect()
    return best_model

In [8]:
# Regex matching closest name to animes
def regex_matching(fav_anime):
    """Find anime whose name contains ``fav_anime`` (case-insensitive).

    Despite the name, matching uses a SQL ``LIKE '%...%'`` pattern on the
    lower-cased ``name`` column of ``anime_df``, so ``%`` and ``_`` inside
    ``fav_anime`` behave as wildcards.

    Args:
        fav_anime: text to look for inside anime names.

    Returns:
        List of matching ``anime_id`` values; an empty list when nothing
        matches.
    """
    print('You have input anime:', fav_anime)
    pattern = '%{}%'.format(fav_anime.lower())
    # Collect once: the previous take(1) plus two collect() calls ran the
    # same filter as three separate Spark jobs.
    matches = anime_df \
        .filter(lower(col('name')).like(pattern)) \
        .select('anime_id', 'name') \
        .collect()
    if not matches:
        print('Oops! No match is found')
        return []
    anime_ids = [row[0] for row in matches]
    names = [row[1] for row in matches]
    print('Found possible matches in our database: '
          '{0}\n'.format(names))
    return anime_ids

In [9]:
# Append a user's anime ratings to ratings_df
def append_ratings(user_id,anime_ids):
    """Append 5.0 ratings by ``user_id`` for each anime to ``ratings_df``.

    The notebook-level ``ratings_df`` is rebound to the extended DataFrame
    (callers such as ``make_inference`` rely on that side effect) and the
    extended DataFrame is also returned.

    Args:
        user_id: id of the (new) user; converted with ``int``.
        anime_ids: iterable of anime ids the user likes; may be empty or None.

    Returns:
        The updated ``ratings_df``; unchanged when ``anime_ids`` is empty.
    """
    # Without this declaration the assignment below makes `ratings_df` a
    # local name and reading `ratings_df.columns` raises UnboundLocalError.
    global ratings_df

    if not anime_ids:
        # Nothing to add; also avoids schema inference on an empty dataset.
        return ratings_df

    user_rows = [(int(user_id), int(anime_id), 5.0) for anime_id in anime_ids]
    # build the new user's ratings and align the column order with ratings_df
    user_df = spark.createDataFrame(user_rows, ['user_id', 'anime_id', 'rating']) \
        .select(ratings_df.columns)
    # append to the ratings DataFrame
    ratings_df = ratings_df.union(user_df)
    return ratings_df

In [10]:
def create_inference_data(user_id, anime_ids):
    """Build the (user, anime) pairs to score for one user.

    Args:
        user_id: int, the user to generate candidates for.
        anime_ids: list of anime ids to leave out of the candidates.

    Returns:
        DataFrame with columns ``user_id`` and ``anime_id`` holding one row
        per anime in ``anime_df`` whose id is not in ``anime_ids``.
    """
    # ids of every catalogue entry that is not in the exclusion list
    is_excluded = col('anime_id').isin(anime_ids)
    candidates_df = anime_df.filter(~is_excluded).select(['anime_id'])
    other_anime_ids = candidates_df.rdd.map(lambda record: record[0]).collect()

    def to_row(pair):
        # pair is (user_id, anime_id)
        return Row(
            user_id=int(pair[0]),
            anime_id=int(pair[1]),
        )

    # pair the user with each candidate and convert to Rows
    pairs = [(user_id, candidate) for candidate in other_anime_ids]
    inference_rdd = sc.parallelize(pairs).map(to_row)

    # Rows -> DataFrame with a fixed column order
    return spark.createDataFrame(inference_rdd).select(['user_id', 'anime_id'])

In [11]:
def make_inference(model,fav_anime,n_recommendations):
    """Recommend anime for a new user who likes ``fav_anime``.

    A new user id is created, every anime matching ``fav_anime`` is added to
    ``ratings_df`` for that user via ``append_ratings``, the estimator is
    refitted on the extended ratings, and the remaining anime are ranked by
    predicted rating.

    Args:
        model: unfitted estimator exposing ``fit`` (e.g. the ALS estimator).
        fav_anime: text used to look up the user's favourite anime.
        n_recommendations: maximum number of recommendations to return.

    Returns:
        List of ``(anime_id, prediction)`` tuples, highest prediction first;
        an empty list when ``fav_anime`` matches nothing.
    """
    # get anime ids of the favourite anime
    anime_ids = regex_matching(fav_anime)
    if not anime_ids:
        # No match (None or empty): nothing to learn from for this user.
        return []
    # create a new user id; the ratings column is named `user_id`
    # (aggregating on "userId" fails because no such column exists)
    user_id = ratings_df.agg({"user_id": "max"}).collect()[0][0] + 1
    # append the new user with his/her ratings to the data
    append_ratings(user_id, anime_ids)
    # matrix factorization on the extended ratings
    fitted_model = model.fit(ratings_df)
    # get data for inferencing
    inference_df = create_inference_data(user_id, anime_ids)
    # make inference; take() on the sorted DataFrame returns the top rows
    top_rows = fitted_model.transform(inference_df) \
        .select(['anime_id', 'prediction']) \
        .orderBy('prediction', ascending=False) \
        .take(n_recommendations)
    return [(row[0], row[1]) for row in top_rows]

---

In [None]:
# Parse the ratings CSV as text into (user_id, anime_id, rating) tuples.
# `ratings_raw` is a DataFrame, which cannot be filtered with a Python lambda
# and has no map(), so the file is read line by line as an RDD instead.
ratings_lines = sc.textFile(ratings_filename)
header = ratings_lines.first()  # column-name line; must not be parsed as data
new_ratings = ratings_lines.filter(lambda line: line != header) \
            .map(lambda line: line.split(",")) \
            .map(lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2])))

In [None]:
# Split one raw CSV line into seven fields with the csv module, so commas
# inside double-quoted values are not treated as separators.
# NOTE(review): `input_string` is not defined in any visible cell, and the
# target name `type` shadows the builtin — confirm this cell is still needed.
anime_id, name, genre, type, episodes, rating, members = [ '{}'.format(x) for x in list(csv.reader([input_string], delimiter=',', quotechar='"'))[0] ]

In [None]:
def parse_ratings(ratings_RDD):
    """Turn an RDD of raw CSV lines into (user_id, anime_id, rating) tuples.

    The original cell was a function body without its ``def`` line (an
    indented ``return`` at top level), which is a syntax error; it is wrapped
    here with the free variable ``ratings_RDD`` as the parameter.

    Args:
        ratings_RDD: RDD of text lines whose first element is the header line.

    Returns:
        RDD of ``(int, int, float)`` tuples built from the first three
        comma-separated fields of every non-header line.
    """
    # every line equal to the first one is treated as a header and dropped
    header = ratings_RDD.take(1)[0]
    return ratings_RDD \
        .filter(lambda line: line != header) \
        .map(lambda line: line.split(",")) \
        .map(lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2])))

In [None]:
def clean_anime_data(input_string):
    """Parse one anime CSV line into one record per genre.

    Args:
        input_string: a single CSV line with the seven fields
            anime_id, name, genre, type, episodes, rating, members.

    Returns:
        List of ``(anime_id, name, type, rating, members, genre)`` tuples,
        one per comma-separated genre token (whitespace-stripped). ``rating``
        is None when the field is not a number. Lines that do not have seven
        fields, or whose anime_id/members are not integers (for example the
        header line), yield an empty list so they vanish under ``flatMap``.
    """
    # csv handles commas inside double-quoted fields (names, genre lists)
    fields = next(csv.reader([input_string], delimiter=',', quotechar='"'), [])
    if len(fields) != 7:
        return []
    # `anime_type` instead of `type` to avoid shadowing the builtin;
    # `episodes` is not part of the output, so it is left unparsed
    anime_id, name, genre, anime_type, episodes, rating, members = fields

    try:
        anime_id = int(anime_id)
        members = int(members)
    except ValueError:
        # header line or malformed row
        return []

    try:
        rating = float(rating)
    except ValueError:
        # missing rating
        rating = None

    return [
        (anime_id, name, anime_type, rating, members, token.strip())
        for token in genre.split(',')
    ]

In [None]:
# Build the cleaned anime records from the raw CSV text. `anime_raw` is a
# DataFrame, which has no flatMap(), so the file is read line by line as an RDD.
anime_lines = sc.textFile(anime_filename)
anime_header = anime_lines.first()  # column-name line; must not be parsed as data
anime_clean = anime_lines \
    .filter(lambda line: line != anime_header) \
    .flatMap(clean_anime_data)

In [None]:
# Print the first ten cleaned records as a sanity check.
print(anime_clean.take(10))

In [None]:
# Schema for the cleaned anime records: nullable fields listed in the same
# order as the tuples in `anime_clean`.
anime_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('anime_id', IntegerType()),
        ('name', StringType()),
        ('type', StringType()),
        ('rating', FloatType()),
        ('members', IntegerType()),
        ('genre', StringType()),
    )
])

anime = spark.createDataFrame(anime_clean, schema=anime_schema)

In [None]:
# Display the DataFrame's repr.
anime

In [None]:
# Pivot anime genres: one row per anime and one column per genre value,
# holding the count of rows with that genre; missing combinations become 0.
# NOTE(review): `anime` is overwritten with the pivoted frame, so re-running
# this cell presumably fails because the `genre` column no longer exists.
anime = (
    anime
    .groupBy("anime_id", "name", "type", "rating", "members")
    .pivot("genre")
    .agg(F.count(F.col('genre')))
    .na.fill(0)
)

anime.show(5)
anime.printSchema()