In [1]:
import pandas as pd

import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.sql import SQLContext

from pyspark.sql.types import StringType, IntegerType

In [2]:
spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

### Loading comic and movie reviews

In [3]:
comic_reviews = spark.read.json('data/all_comic_reviews_in_mtv.json')

In [4]:
movie_reviews = spark.read.json('data/all_movietv_jsons/*.json')

In [5]:
comic_reviews.show(1)

+----------+-------+--------------------+--------------+
|      asin|overall|          reviewText|    reviewerID|
+----------+-------+--------------------+--------------+
|0316107255|    4.0|PENGUIN DREAMS AN...|A3NQU1649SH0Q4|
+----------+-------+--------------------+--------------+
only showing top 1 row



### Creating new ids
Since the Amazon user and item ids contain both letters and numbers, I needed to give them new values for a couple reasons:
* The Spark ALS model will only take labels that are numeric and will cause an error otherwise.
* I can create ids for the comic books and movies/tv that are easily identifiable and easy to filter on.

In [6]:
def new_id_dictionary(df, column, suffix_val):
    """Take in column with unique indexes, return dictionary with new index values. This is done to
     remove the default ASIN and user ID from Amazon reviews and create better unique ids.
    Args:
        df: source dataframe
        column: name of column with ids to replace
        suffix_val: new suffix value for unique codes. Example: all new user_ids could end
        with '00000'
    Returns:
        new_id_dict: New Spark dataframe with column of new unique ids
    
    """
    unique_vals = list(set([old_id[0] for old_id in df.select(column).collect()]))
    new_ids = [(str(i) + suffix_val) for i in range(1,len(unique_vals)+1)]
    new_id_dict = {k:v for k,v in zip(unique_vals, new_ids)}
    return new_id_dict

In [7]:
new_user_ids = new_id_dictionary(comic_reviews, 'reviewerID', '0000')
new_comic_asins = new_id_dictionary(comic_reviews, 'asin', '2222')
new_mtv_asins = new_id_dictionary(movie_reviews, 'asin', '4444')

In [8]:
all_items_asins = new_comic_asins
all_items_asins = all_items_asins.update(new_mtv_asins)

In [9]:
def add_new_id(old_id, new_id_dict):
    new_id = id_dict[old_id]
    return new_id

In [10]:
udfUserId = F.udf(lambda x: new_user_ids[x], StringType())

udfComicId = F.udf(lambda x: new_comic_asins[x], StringType())
udfMovieId = F.udf(lambda x: new_mtv_asins[x], StringType())

In [11]:
comic_reviews_updated = comic_reviews.withColumn("item_id", udfComicId("asin"))
comic_reviews_updated = comic_reviews_updated.withColumn("user_id", udfUserId("reviewerID"))

In [12]:
comic_r_final = comic_reviews_updated.select(['user_id','item_id', 'overall'])

In [13]:
comic_r_final.show(1)

+--------+--------+-------+
| user_id| item_id|overall|
+--------+--------+-------+
|11920000|49362222|    4.0|
+--------+--------+-------+
only showing top 1 row



In [14]:
movie_reviews_updated = movie_reviews.withColumn("user_id", udfUserId("reviewerID"))
movie_reviews_updated_2 = movie_reviews_updated.withColumn("item_id", udfMovieId("asin"))

In [15]:
movies_r_final = movie_reviews_updated_2.select(['user_id', 'item_id', 'overall'])

In [16]:
movies_r_final.show(1)

+--------+---------+-------+
| user_id|  item_id|overall|
+--------+---------+-------+
|93150000|259104444|    4.0|
+--------+---------+-------+
only showing top 1 row



### Narrowing my dataset
To help improve the accuracy of my model, I wanted to look through removing some of the items and users that may not offer much insight because of their sparsity. Here I explored some of these features to help me decide on what to do.

#### Removing users who have rated less than average in either comics or movies

In [68]:
query = """
SELECT 
    item_id 
,   COUNT(*) as count 
FROM 
    table 
GROUP BY item_id
ORDER BY count desc"""

In [69]:
comic_r_final.createOrReplaceTempView('table')
comic_low_reviewers = spark.sql(query).toPandas()

In [70]:
low_reviews = comic_low_reviewers[comic_low_reviewers['count'] \
                                  <= comic_low_reviewers['count'].mean()]
print(f"Amount of items with less than average amount of comic reviews: {low_reviews.shape[0]}")

Amount of items with less than average amount of comic reviews: 4113


In [71]:
movies_r_final.createOrReplaceTempView('table')
mtv_low_reviewers = spark.sql(query).toPandas()

In [72]:
low_mtv_reviews = mtv_low_reviewers[mtv_low_reviewers['count'] \
                                  <= mtv_low_reviewers['count'].mean()]
print(f"Amount of items with less than average amount of comic reviews: {low_mtv_reviews.shape[0]}")

Amount of items with less than average amount of comic reviews: 33977


In [73]:
low_item = list(set(low_reviews['item_id'].tolist() + low_mtv_reviews['item_id'].tolist()))
len(low_user)

9864

In [74]:
all_reviews = comic_r_final.union(movies_r_final)

In [75]:
all_rev_ready = all_reviews.filter(F.col('item_id').isin(low_user) == False)

In [76]:
all_rev_ready.count()

179817

In [77]:
als_ready = all_rev_ready.select([col("user_id").cast(IntegerType()),\
                                  col("item_id").cast(IntegerType()),\
                                  col("overall")])

In [78]:
als_ready.persist()

DataFrame[user_id: int, item_id: int, overall: double]

In [79]:
(train, test) = als_ready.randomSplit([.8,.2])

In [80]:
# Build the recommendation model using ALS
als = ALS(userCol='user_id', itemCol='item_id', ratingCol='overall', nonnegative=True)

als_model = als.fit(train)

In [81]:
train_pred = als_model.transform(train)
test_pred = als_model.transform(test)
train_pred_dropna = train_pred.dropna()
test_pred_dropna = test_pred.dropna()

In [82]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall",
                                predictionCol="prediction")

evaluator_2 = RegressionEvaluator(metricName="mae", labelCol="overall",
                                predictionCol="prediction")

rmse_train = evaluator.evaluate(train_pred_dropna)
rmse_test = evaluator.evaluate(test_pred_dropna)
mae_train = evaluator_2.evaluate(train_pred_dropna)
mae_test = evaluator_2.evaluate(test_pred_dropna)
print(f"Train RMSE: {rmse_train}")
print(f"Test RMSE: {rmse_test}")
print(f"Train MAE: {mae_train}")
print(f"Test MAE: {mae_test}")

Train RMSE: 0.3527473004287336
Test RMSE: 1.2474849713824216
Train MAE: 0.234743665326749
Test MAE: 0.9672643011355103


In [84]:
test_pred.show(10)

+---------+--------+-------+-----------+
|  user_id| item_id|overall| prediction|
+---------+--------+-------+-----------+
| 81990000| 2922222|    4.0|   3.611361|
| 68200000| 9762222|    5.0|   4.811065|
|101710000| 9762222|    5.0|  5.4884887|
| 22760000| 9762222|    5.0|  4.9215364|
| 41760000| 9762222|    5.0|   4.251208|
|100510000|10374444|    2.0|  1.7457337|
| 79080000|10374444|    3.0|0.015979275|
| 52380000|11934444|    4.0|  3.1411307|
| 26730000|11934444|    3.0|  3.4213014|
| 68070000|11934444|    5.0|  3.5709882|
+---------+--------+-------+-----------+
only showing top 10 rows



In [333]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

als = ALS(userCol='user_id', itemCol='item_id', ratingCol='overall', nonnegative=True)

reg_test = RegressionEvaluator(predictionCol='prediction', labelCol='overall')

# create the parameter grid              
params = ParamGridBuilder().addGrid(als.regParam, [0.01,0.001,0.1])\
                           .addGrid(als.rank, [4,10,50])
                           .addGrid(als.maxIter, [5,10,15,20]).build()
             
## instantiating crossvalidator estimator
cv = CrossValidator(estimator=als, estimatorParamMaps=params,evaluator=reg_test,parallelism=4)
best_model = cv.fit(train)

KeyboardInterrupt: 

In [85]:
users = als_ready.select(als.getUserCol()).distinct().limit(5)
userSubsetRecs = als_model.recommendForUserSubset(users, 50).toPandas()

In [87]:
one_fella = als_model.recommendForUserSubset(59720000, 50).toPandas()

Py4JError: An error occurred while calling o78473.recommendForUserSubset. Trace:
py4j.Py4JException: Method recommendForUserSubset([class java.lang.Integer, class java.lang.Integer]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



In [88]:
recommendations = als_model.recommendForAllUsers(30)

In [86]:
userSubsetRecs.head()

Unnamed: 0,user_id,recommendations
0,41640000,"[(202884444, 5.505423545837402), (85914444, 5...."
1,55810000,"[(176574444, 5.911088943481445), (224904444, 5..."
2,59720000,"[(202884444, 6.963602542877197), (113704444, 6..."
3,100310000,"[(202884444, 4.817611217498779), (183424444, 4..."
4,102120000,"[(132014444, 5.877918243408203), (202474444, 5..."


In [90]:
#def get_comic_recommendations(user_id):


In [106]:
def get_comic_recommendations(user_id):
    recs_for_user = recommendations.where(recommendations.user_id == user_id).take(1)
    all_recs = [i[0] for i in recs_for_user[0][1]]
    all_comic_recs = [i for i in all_recs if str(i).endswith('22') == True]
    for rec in all_comic_recs:
        query_reviews = f"""
                        SELECT
                            asin
                        FROM 
                            table
                        WHERE item_id == {rec}""" 
        comic_reviews_updated.createOrReplaceTempView('table')
        temp_df = spark.sql(query_reviews).toPandas()
        asin = temp_df.loc[0]['asin']
        print(comic_names.loc[comic_names['asin'] == asin, 'title'])

In [107]:
get_comic_recommendations(59720000)

1557    Astro City Shining Stars (Kurt Busiek's Astro ...
Name: title, dtype: object
413    The Legend of Luther Strode Volume 2 TP
Name: title, dtype: object
1160    Amazing Fantasy Omnibus (v. 1)
Name: title, dtype: object
285    Fables: The Deluxe Edition Book Eight
Name: title, dtype: object
2022    Incredible Hulk: Planet Skaar
Name: title, dtype: object
661    Marvel Masterworks: Incredible Hulk - Volume 2
Name: title, dtype: object


In [103]:
comic_names = pd.read_csv('data/all_comic_asin.csv', index_col=0)

In [100]:
value = ''

In [101]:
query_reviews = f"""
SELECT
    asin
FROM 
    table
WHERE item_id == {value}"""

In [320]:


all_reviews_als.createOrReplaceTempView('table')
temp_user_view = spark.sql(query_reviews).toPandas()

In [321]:
temp_user_view

Unnamed: 0,user_id,item_id,overall
0,9240000,35952222,5.0
1,9240000,345724444,4.0


In [322]:
reviewed = temp_user_view.item_id.tolist()

In [323]:
# Reviewed & recomended
recommended_asin = [key for key, val in new_comic_asins.items() if int(val) in comic_user_test]
reviewed_asin = [key for key, val in new_comic_asins.items() if int(val) in reviewed]

In [326]:
comic_names = pd.read_csv('data/all_comic_asin.csv', index_col=0)
comic_names.loc[comic_names['asin'].isin(recommended_asin), 'title']

292                                   Fables Encyclopedia
279     Thor Visionaries - Walt Simonson, Vol. 2 (Marv...
553     Essential Tomb of Dracula, Vol. 3 (Marvel Esse...
1247                      Ultimate Fantastic Four, Vol. 3
2975                   Black Widow: The Itsy-Bitsy Spider
3064                          Deadpool Classic - Volume 6
3567    S.H.I.E.L.D. by Jim Steranko: The Complete Col...
78                                                 Empire
516                            Penguin Revolution: VOL 02
669                          Penguin Revolution: Volume 3
Name: title, dtype: object

In [327]:
comic_names.loc[comic_names['asin'].isin(reviewed_asin), 'title']

1147    Amazing Spider-Man Omnibus, Vol. 1 (v. 1)
Name: title, dtype: object

In [None]:
def new_user_recs(user_id,new_ratings,rating_df,movie_title_df,num_recs):
    # turn the new_recommendations list into a spark DataFrame
    new_user_ratings = spark.createDataFrame(new_ratings,rating_df.columns)
    
    # combine the new ratings df with the rating_df
    movie_ratings_combined = rating_df.union(new_user_ratings)
    
    # split the dataframe into a train and test set
#     (training, test) = movie_ratings_combined.randomSplit([0.8, 0.2],seed=0)
    
    # create an ALS model and fit it
    als = ALS(maxIter=5,rank=50, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
    model = als.fit(movie_ratings_combined)
    
    # make recommendations for all users using the recommendForAllUsers method
    recommendations = model.recommendForAllUsers(num_recs)
    
    # get recommendations specifically for the new user that has been added to the DataFrame
    recs_for_user = recommendations.where(recommendations.userId == user_id).take(1)
    
    for ranking, (movie_id, rating) in enumerate(recs_for_user[0]['recommendations']):
        movie_string = name_retriever(movie_id,movie_title_df)
        print('Recommendation {}: {}  | predicted score :{}'.format(ranking+1,movie_string,rating))