## Loading and indexing the data for training

In [83]:
# Load the gzipped JSON review file into a Spark DataFrame.
reviews = spark.read.json('./data/raw_data/reviews_Musical_Instruments_5.json.gz')

In [84]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Index the string-valued reviewer and product (asin) identifiers into the
# numeric columns `reviewerIndex` and `asinIndex`.
reviewer_indexer = StringIndexer(inputCol="reviewerID", outputCol="reviewerIndex")
asin_indexer = StringIndexer(inputCol="asin", outputCol="asinIndex")
indexing_pipeline = Pipeline(stages=[reviewer_indexer, asin_indexer])

# Fit the indexers on the full review set, then apply them to that same set.
indexing_model = indexing_pipeline.fit(reviews)
indexed_reviews = indexing_model.transform(reviews)

In [110]:
# 60/20/20 split with a fixed seed for reproducibility; the middle 20% is
# discarded here and only the train and test portions are used below.
# The seed is a plain integer literal: the Python 2-only `L` suffix is a
# SyntaxError on Python 3 and is unnecessary on Python 2.
(train, _, test) = indexed_reviews.randomSplit([.6, .2, .2], seed=1800009193)

## Evaluator

In [122]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the `overall` star rating (label) and the `prediction` column;
# shared by the benchmark and the recommender evaluation.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall", predictionCol="prediction")

## Benchmark: always predict the average training rating

In [123]:
from pyspark.sql.functions import lit

# Mean `overall` rating over the whole training split (a single global aggregate).
average_rating = train.agg({'overall': 'avg'}).first()[0]

# Baseline: attach that constant as the prediction for every test row and score it.
average_rating_prediction = test.withColumn('prediction', lit(average_rating))
average_rating_evaluation = evaluator.evaluate(average_rating_prediction)

print('The RMSE of always predicting {0} stars is {1}'.format(
    average_rating, average_rating_evaluation))

The RMSE of always predicting 4.4999187124 stars is 0.894400260161


## Recommender system

In [111]:
from pyspark.ml.recommendation import ALS

# ALS collaborative-filtering estimator over the indexed reviewer/product
# columns, using the `overall` star rating as the explicit rating.
# The seed is a plain integer literal: the Python 2-only `L` suffix is a
# SyntaxError on Python 3 and is unnecessary on Python 2.
als = ALS(
        maxIter=5,
        regParam=.1,
        userCol='reviewerIndex',
        itemCol='asinIndex',
        ratingCol='overall',
        rank=24,
        seed=1800009193)

## Evaluating the model

In [112]:
# Train the ALS factorization on the training split only.
model = als.fit(dataset=train)

In [113]:
# Score the held-out test split; adds a `prediction` column to each row.
predictions = model.transform(dataset=test)

In [125]:
# Imported here so this cell runs on a fresh kernel; `col` was previously
# used without being imported anywhere in the notebook.
from pyspark.sql.functions import col, isnan

# Drop rows whose prediction is NaN before scoring; a single NaN would make
# the RMSE itself NaN. (ALS yields NaN for reviewers/products it has no
# factors for, i.e. ones that did not land in the training split.)
# `isnan` states the intent explicitly instead of comparing against
# float('nan'), which only works because of Spark's NaN-equality semantics.
scorable_predictions = predictions.filter(~isnan(col('prediction')))
evaluation = evaluator.evaluate(scorable_predictions)

print('The RMSE of the recommender system is {0}'.format(evaluation))

The RMSE of the recommender system is 1.98658845486
