In [292]:
import pandas as pd

import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.sql import SQLContext

from pyspark.sql.types import StringType, IntegerType

In [4]:
spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

### Loading comic and movie reviews

In [5]:
comic_reviews = spark.read.json('data/all_comic_reviews_in_mtv.json')

In [6]:
movie_reviews = spark.read.json('data/all_movietv_jsons/*.json')

In [8]:
comic_reviews.show(1)

+----------+-------+--------------------+--------------+
|      asin|overall|          reviewText|    reviewerID|
+----------+-------+--------------------+--------------+
|0316107255|    4.0|PENGUIN DREAMS AN...|A3NQU1649SH0Q4|
+----------+-------+--------------------+--------------+
only showing top 1 row



### Creating new ids
Since the Amazon user and item ids contain both letters and numbers, I needed to give them new values for a couple reasons:
* The Spark ALS model will only take labels that are numeric and will cause an error otherwise.
* I can create ids for the comic books and movies/tv that are easily identifiable and easy to filter on.

In [46]:
def new_id_dictionary(df, column, suffix_val):
    """Take in column with unique indexes, return dictionary with new index values. This is done to
     remove the default ASIN and user ID from Amazon reviews and create better unique ids.
    Args:
        df: source dataframe
        column: name of column with ids to replace
        suffix_val: new suffix value for unique codes. Example: all new user_ids could end
        with '00000'
    Returns:
        new_id_dict: New Spark dataframe with column of new unique ids
    
    """
    unique_vals = list(set([old_id[0] for old_id in df.select(column).collect()]))
    new_ids = [(str(i) + suffix_val) for i in range(1,len(unique_vals)+1)]
    new_id_dict = {k:v for k,v in zip(unique_vals, new_ids)}
    return new_id_dict

In [82]:
new_user_ids = new_id_dictionary(comic_reviews, 'reviewerID', '0000')
new_comic_asins = new_id_dictionary(comic_reviews, 'asin', '2222')
new_mtv_asins = new_id_dictionary(movie_reviews, 'asin', '4444')

In [231]:
all_items_asins = new_comic_asins
all_items_asins = all_items_asins.update(new_mtv_asins)

In [36]:
def add_new_id(old_id, new_id_dict):
    new_id = id_dict[old_id]
    return new_id

In [96]:
udfUserId = F.udf(lambda x: new_user_ids[x], StringType())

udfComicId = F.udf(lambda x: new_comic_asins[x], StringType())
udfMovieId = F.udf(lambda x: new_mtv_asins[x], StringType())

In [97]:
comic_reviews_updated = comic_reviews.withColumn("item_id", udfComicId("asin"))
comic_reviews_updated = comic_reviews_updated.withColumn("user_id", udfUserId("reviewerID"))

In [98]:
comic_r_final = comic_reviews_updated.select(['user_id','item_id', 'overall'])

In [99]:
comic_r_final.show(1)

+--------+--------+-------+
| user_id| item_id|overall|
+--------+--------+-------+
|34510000|40102222|    4.0|
+--------+--------+-------+
only showing top 1 row



In [106]:
movie_reviews_updated = movie_reviews.withColumn("user_id", udfUserId("reviewerID"))
movie_reviews_updated_2 = movie_reviews_updated.withColumn("item_id", udfMovieId("asin"))

In [107]:
movies_r_final = movie_reviews_updated_2.select(['user_id', 'item_id', 'overall'])

In [108]:
movies_r_final.show(1)

+--------+---------+-------+
| user_id|  item_id|overall|
+--------+---------+-------+
|79890000|392924444|    4.0|
+--------+---------+-------+
only showing top 1 row



### Narrowing my dataset
To help improve the accuracy of my model, I wanted to look through removing some of the items and users that may not offer much insight because of their sparsity. Here I explored some of these features to help me decide on what to do.

In [197]:
query = """
SELECT 
    user_id, 
    COUNT(*) as count 
FROM 
    table 
GROUP BY user_id
ORDER BY count desc"""

In [198]:
comic_r_final.createOrReplaceTempView('table')
temp_user_view = spark.sql(query).toPandas()

In [202]:
temp_user_view.shape

(10366, 2)

In [201]:
temp_user_view[temp_user_view['count'] < temp_user_view['count'].mean()].shape

(8283, 2)

In [109]:
all_reviews = comic_r_final.union(movies_r_final)

In [117]:
all_reviews_als = all_reviews.select([col("user_id").cast(IntegerType()), col("item_id").cast(IntegerType()), col("overall")])

In [118]:
all_reviews_als.persist()

DataFrame[user_id: int, item_id: int, overall: double]

In [119]:
(train, test) = all_reviews_als.randomSplit([.8,.2])

In [206]:
# Build the recommendation model using ALS
als = ALS(userCol='user_id', itemCol='item_id', ratingCol='overall', nonnegative=True)

als_model = als.fit(train)

In [211]:
train_pred = als_model.transform(train)
test_pred = als_model.transform(test)
train_pred_dropna = train_pred.dropna()
test_pred_dropna = test_pred.dropna()

In [212]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall",
                                predictionCol="prediction")

evaluator_2 = RegressionEvaluator(metricName="mae", labelCol="overall",
                                predictionCol="prediction")

rmse_train = evaluator.evaluate(train_pred_dropna)
rmse_test = evaluator.evaluate(test_pred_dropna)
mae_train = evaluator_2.evaluate(train_pred_dropna)
mae_test = evaluator_2.evaluate(test_pred_dropna)
print(f"Train RMSE: {rmse_train}")
print(f"Test RMSE: {rmse_test}")
print(f"Train MAE: {mae_train}")
print(f"Test MAE: {mae_test}")

Train RMSE: 0.34925654622891644
Test RMSE: 1.2748690218854049
Train MAE: 0.23288830390420345
Test MAE: 0.9887515321072159


In [209]:
pred.show(10)

+---------+--------+-------+----------+
|  user_id| item_id|overall|prediction|
+---------+--------+-------+----------+
| 71860000| 2732222|    4.0| 1.4231931|
|  6190000| 2732222|    5.0|  3.699666|
| 98160000| 2732222|    5.0| 3.7960944|
|101490000| 2732222|    4.0|  4.645656|
| 21300000| 2922222|    5.0|  4.544185|
| 48820000| 2922222|    4.0|  3.974867|
| 88340000| 2922222|    5.0|  4.616192|
|100430000| 3202222|    5.0| 3.2581654|
| 67620000| 7324444|    4.0|       NaN|
| 36350000|10144444|    3.0|       NaN|
+---------+--------+-------+----------+
only showing top 10 rows



In [316]:
users = all_reviews_als.select(als.getUserCol()).distinct().limit(30)
userSubsetRecs = als_model.recommendForUserSubset(users, 50).toPandas()

In [317]:
userSubsetRecs.head()

Unnamed: 0,user_id,recommendations
0,8960000,"[(1264444, 7.6324334144592285), (226814444, 6...."
1,9240000,"[(35644444, 5.674941062927246), (146214444, 5...."
2,15530000,"[(242514444, 7.326582908630371), (120494444, 7..."
3,18270000,"[(130964444, 6.113489151000977), (388304444, 6..."
4,19790000,"[(109504444, 6.8123064041137695), (169934444, ..."


In [318]:
comic_user_test = [i[0] for i in userSubsetRecs.loc[1][1]]
comic_user_test = [i for i in comic_user_test if str(i).endswith('22') == True]
comic_user_test

[6122222,
 35962222,
 6542222,
 22302222,
 26002222,
 49082222,
 50172222,
 6252222,
 44212222,
 20342222]

In [319]:
query_reviews = """
SELECT
    user_id,
    item_id,
    overall
FROM 
    table
WHERE user_id == 9240000"""

In [320]:
all_reviews_als.createOrReplaceTempView('table')
temp_user_view = spark.sql(query_reviews).toPandas()

In [321]:
temp_user_view

Unnamed: 0,user_id,item_id,overall
0,9240000,35952222,5.0
1,9240000,345724444,4.0


In [322]:
reviewed = temp_user_view.item_id.tolist()

In [323]:
# Reviewed & recomended
recommended_asin = [key for key, val in new_comic_asins.items() if int(val) in comic_user_test]
reviewed_asin = [key for key, val in new_comic_asins.items() if int(val) in reviewed]

In [326]:
comic_names = pd.read_csv('data/all_comic_asin.csv', index_col=0)
comic_names.loc[comic_names['asin'].isin(recommended_asin), 'title']

292                                   Fables Encyclopedia
279     Thor Visionaries - Walt Simonson, Vol. 2 (Marv...
553     Essential Tomb of Dracula, Vol. 3 (Marvel Esse...
1247                      Ultimate Fantastic Four, Vol. 3
2975                   Black Widow: The Itsy-Bitsy Spider
3064                          Deadpool Classic - Volume 6
3567    S.H.I.E.L.D. by Jim Steranko: The Complete Col...
78                                                 Empire
516                            Penguin Revolution: VOL 02
669                          Penguin Revolution: Volume 3
Name: title, dtype: object

In [327]:
comic_names.loc[comic_names['asin'].isin(reviewed_asin), 'title']

1147    Amazing Spider-Man Omnibus, Vol. 1 (v. 1)
Name: title, dtype: object