In [15]:
import pandas as pd

import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType, IntegerType

from batcave import new_id_dictionary

In [16]:
spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

### Loading comic and movie reviews

In [17]:
comic_reviews = spark.read.json('data/comic_reviews_wtitle.json')

In [18]:
movie_reviews = spark.read.json('data/movie_reviews_wtitle.json')

In [19]:
comic_reviews.show(5)

+----------+--------------------+-------+--------------+--------------------+
|      asin|               imUrl|overall|    reviewerID|               title|
+----------+--------------------+-------+--------------+--------------------+
|0345507460|http://ecx.images...|    5.0| ACO26JQ366659|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A34C35QFA4DC5J|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A3TII4RKU0ZVT4|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A1LR4Z5Z0MPYIF|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A16L43DIFSHGMQ|The Dresden Files...|
+----------+--------------------+-------+--------------+--------------------+
only showing top 5 rows



### Creating new ids
Since the Amazon user and item ids contain both letters and numbers, I needed to give them new values for a couple reasons:
* The Spark ALS model will only take labels that are numeric and will cause an error otherwise.
* I can create ids for the comic books and movies/tv that are easily identifiable and easy to filter on.

In [20]:
new_user_ids = new_id_dictionary(comic_reviews, 'reviewerID', '00')
new_comic_asins = new_id_dictionary(comic_reviews, 'asin', '22')
new_mtv_asins = new_id_dictionary(movie_reviews, 'asin', '44')

In [21]:
all_items_asins = new_comic_asins
all_items_asins.update(new_mtv_asins)

In [22]:
udfUserId = F.udf(lambda x: new_user_ids[x], StringType())
udfItemId = F.udf(lambda x: all_items_asins[x], StringType())

In [23]:
comic_reviews_updated = comic_reviews.withColumn("item_id", udfItemId("asin"))
comic_reviews_updated = comic_reviews_updated.withColumn("user_id", udfUserId("reviewerID"))

In [24]:
comic_reviews_updated.show(1)

+----------+--------------------+-------+-------------+--------------------+-------+-------+
|      asin|               imUrl|overall|   reviewerID|               title|item_id|user_id|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
|0345507460|http://ecx.images...|    5.0|ACO26JQ366659|The Dresden Files...| 509122| 125700|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
only showing top 1 row



In [25]:
movie_reviews_updated = movie_reviews.withColumn("user_id", udfUserId("reviewerID"))
movie_reviews_updated = movie_reviews_updated.withColumn("item_id", udfItemId("asin"))

In [26]:
movie_reviews_updated.show(1)

+----------+--------------------+-------+-------------+--------------------+-------+-------+
|      asin|               imUrl|overall|   reviewerID|               title|user_id|item_id|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
|0767807693|http://ecx.images...|    3.0|ADENUJJYKNHPO|Requiem for a Hea...| 215000|2707744|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
only showing top 1 row



### Narrowing my dataset
To help improve the accuracy of my model, I wanted to look through removing some of the items and users that may not offer much insight because of their sparsity. Here I explored some of these features to help me decide on what to do.

#### Removing users who have rated less than average in either comics or movies

In [28]:
query = """
SELECT 
    item_id 
,   COUNT(*) as count 
FROM 
    table 
GROUP BY item_id
ORDER BY count desc"""

In [30]:
comic_reviews_updated.createOrReplaceTempView('table')
comic_low_reviewers = spark.sql(query).toPandas()

In [31]:
low_reviews = comic_low_reviewers[comic_low_reviewers['count'] \
                                  <= comic_low_reviewers['count'].mean()]
print(f"Amount of items with less than average amount of comic reviews: {low_reviews.shape[0]}")

Amount of items with less than average amount of comic reviews: 4129


In [33]:
movie_reviews_updated.createOrReplaceTempView('table')
mtv_low_reviewers = spark.sql(query).toPandas()

In [34]:
low_mtv_reviews = mtv_low_reviewers[mtv_low_reviewers['count'] \
                                  <= mtv_low_reviewers['count'].mean()]
print(f"Amount of items with less than average amount of comic reviews: {low_mtv_reviews.shape[0]}")

Amount of items with less than average amount of comic reviews: 26036


In [36]:
mtv_low_reviewers.head()

Unnamed: 0,item_id,count
0,234044,170
1,579744,163
2,1900344,141
3,531144,140
4,902244,133


In [37]:
low_item = list(set(low_reviews['item_id'].tolist() + low_mtv_reviews['item_id'].tolist()))
len(low_item)

30165

In [38]:
all_reviews = comic_reviews_updated.union(movie_reviews_updated)

In [39]:
all_reviews_ready = all_reviews.filter(F.col('item_id').isin(low_item) == False)

In [40]:
all_reviews_ready.count()

122015

In [49]:
all_reviews_ready.repartition(1).write.json("data/all_reviews")

In [43]:
# Seeing the unique count of users and items
all_reviews_ready.agg(*(F.countDistinct(col(c)).alias(c) for c in all_reviews_ready.columns)).show()

+-----+-----+-------+----------+-----+-------+-------+
| asin|imUrl|overall|reviewerID|title|item_id|user_id|
+-----+-----+-------+----------+-----+-------+-------+
|33557|32593|      5|      8926|19577|  10244|  39507|
+-----+-----+-------+----------+-----+-------+-------+



In [79]:
als_ready = all_reviews_ready.select([col("user_id").cast(IntegerType()),
                                  col("item_id").cast(IntegerType()),
                                  col("overall"), col("title"), col("imUrl")])

In [80]:
als_ready.persist()

DataFrame[user_id: int, item_id: int, overall: double, title: string, imUrl: string]

In [82]:
(train, test) = als_ready.randomSplit([.8,.2])

In [83]:
# Build the recommendation model using ALS
als = ALS(rank=5, userCol='user_id', itemCol='item_id', ratingCol='overall', nonnegative=True)

als_model = als.fit(train)

In [104]:
comic_reviews_updated.select(F.avg('overall')).show()

+-----------------+
|     avg(overall)|
+-----------------+
|4.087415946205572|
+-----------------+



In [105]:
movie_reviews_updated.select(F.avg('overall')).show()

+-----------------+
|     avg(overall)|
+-----------------+
|3.929546922135374|
+-----------------+



In [108]:
test_pred = als_model.transform(test)

test_pred_df = test_pred.toPandas()
test_pred_df['prediction'].fillna(4, inplace=True)

In [None]:
test_pred = spark.

In [75]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall",
                                predictionCol="prediction")

evaluator_2 = RegressionEvaluator(metricName="mae", labelCol="overall",
                                predictionCol="prediction")

rmse_test = evaluator.evaluate(test_pred_dropna)
mae_test = evaluator_2.evaluate(test_pred_dropna)
print(f"Test RMSE: {rmse_test}")
print(f"Test MAE: {mae_test}")

Test RMSE: 1.3026667039204765
Test MAE: 1.002017881741057


In [121]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

als = ALS(userCol='user_id', itemCol='item_id', ratingCol='overall', nonnegative=True)

reg_test = RegressionEvaluator(predictionCol='prediction', labelCol='overall')

# create the parameter grid              
params = ParamGridBuilder().addGrid(als.regParam, [0.01,0.001,0.1])\
                           .addGrid(als.rank, [4,10,50])\
                           .addGrid(als.maxIter, [5,10,15,20]).build()
             
## instantiating crossvalidator estimator
cv = CrossValidator(estimator=als, estimatorParamMaps=params,evaluator=reg_test,parallelism=4)
best_model = cv.fit(train)

KeyboardInterrupt: 

In [76]:
users = als_ready.select(als.getUserCol()).distinct().limit(5)
userSubsetRecs = als_model.recommendForUserSubset(users, 50).toPandas()

In [77]:
userSubsetRecs.head()

Unnamed: 0,user_id,recommendations
0,271000,"[(860400, 6.818143844604492), (181400, 6.65950..."
1,652700,"[(837500, 7.0491814613342285), (765000, 6.9428..."
2,741900,"[(569000, 5.813330173492432), (63400, 5.584647..."
3,795200,"[(477100, 7.910429000854492), (304600, 7.74447..."


In [78]:
als_ready.show(1)

+-------+-------+-------+
|user_id|item_id|overall|
+-------+-------+-------+
| 125700| 509122|    5.0|
+-------+-------+-------+
only showing top 1 row



In [87]:
one_fella = als_model.recommendForUserSubset(59720000, 50).toPandas()

Py4JError: An error occurred while calling o78473.recommendForUserSubset. Trace:
py4j.Py4JException: Method recommendForUserSubset([class java.lang.Integer, class java.lang.Integer]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



In [88]:
recommendations = als_model.recommendForAllUsers(30)

In [86]:
userSubsetRecs.head()

Unnamed: 0,user_id,recommendations
0,41640000,"[(202884444, 5.505423545837402), (85914444, 5...."
1,55810000,"[(176574444, 5.911088943481445), (224904444, 5..."
2,59720000,"[(202884444, 6.963602542877197), (113704444, 6..."
3,100310000,"[(202884444, 4.817611217498779), (183424444, 4..."
4,102120000,"[(132014444, 5.877918243408203), (202474444, 5..."


In [90]:
#def get_comic_recommendations(user_id):


In [131]:
def create_review(user_id, item_id, rating):
    """Return dictionary for review terms"""
    return {'user_id': user_id, 'item_id': item_id, 'overall': rating}

def create_review_dataframe(user_id, item_list, rating_list):
    rating_data = []
    for i in range(len(item_list)):
        rating_data.append(create_review(user_id, item_list[i], rating_list[i]))
    return pd.DataFrame(rating_data)


In [134]:
items_list = [161824444, 367894444, 383864444, 121504444, 291694444]
rating_list = [5,5,5,5,5]

In [136]:
movies_dd = spark.read.json('data/all_moviestv_asins.json')

In [137]:
movies_dd.show(5)

+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+------------+--------------------+
|      asin|brand|          categories|         description|               imUrl|price|             related|   salesRank|               title|
+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+------------+--------------------+
|0005048524|     | [[Movies & TV, TV]]|            VHS TAPE|http://ecx.images...|16.35|                null|[, 816828,,]|Dinosaurs and the...|
|0005119367|     | [[Movies & TV, TV]]|Ben Kingsley(star...|http://ecx.images...| 6.88|[[6303257828, 000...|[, 408295,,]|        Joseph [VHS]|
|0005019281|     |[[Movies & TV, Mo...|In Depression-era...|http://ecx.images...|34.88|[[0780630084, 078...|[, 508334,,]|An American Chris...|
|030714142X| null|[[CDs & Vinyl, Ch...|                null|http://ecx.images...| null|[[0307142418, 630...|[, 427298,,]|Encyclopedia Brow...|

In [139]:
titles

['B0034G4P80', '0793906091', 'B002VPE1AW', 'B004LWZWFQ', 'B008JFUQZ2']

In [106]:
def get_comic_recommendations(user_id):
    recs_for_user = recommendations.where(recommendations.user_id == user_id).take(1)
    all_recs = [i[0] for i in recs_for_user[0][1]]
    all_comic_recs = [i for i in all_recs if str(i).endswith('22') == True]
    for rec in all_comic_recs:
        query_reviews = f"""
                        SELECT
                            asin
                        FROM 
                            table
                        WHERE item_id == {rec}""" 
        comic_reviews_updated.createOrReplaceTempView('table')
        temp_df = spark.sql(query_reviews).toPandas()
        asin = temp_df.loc[0]['asin']
        print(comic_names.loc[comic_names['asin'] == asin, 'title'])

In [142]:
get_comic_recommendations(102120000)

1160    Amazing Fantasy Omnibus (v. 1)
Name: title, dtype: object
270    Essential Spider-Man Vol. 1
Name: title, dtype: object
661    Marvel Masterworks: Incredible Hulk - Volume 2
Name: title, dtype: object
413    The Legend of Luther Strode Volume 2 TP
Name: title, dtype: object
326    Wrath of the Spectre
Name: title, dtype: object
339    Morning Glories, Vol. 3: P.E.
Name: title, dtype: object
2070    Totally Useless MAD
Name: title, dtype: object


In [103]:
comic_names = pd.read_csv('data/all_comic_asin.csv', index_col=0)

In [108]:
movie_data = spark.read.json('data/all_moviestv_asins.json')

In [116]:
movie_data.select('title').show(10)

+--------------------+
|               title|
+--------------------+
|Dinosaurs and the...|
|        Joseph [VHS]|
|An American Chris...|
|Encyclopedia Brow...|
|Frosty the Snowma...|
|Mouse on the Mayf...|
|The Little Drumme...|
|Mad Mad Mad Monst...|
|Santa Claus Is Co...|
|Frog and Toad Are...|
+--------------------+
only showing top 10 rows



In [101]:
query_reviews = f"""
SELECT
    item_id
FROM 
    table
WHERE item_id == {value}"""

In [320]:


all_reviews_als.createOrReplaceTempView('table')
temp_user_view = spark.sql(query_reviews).toPandas()

In [321]:
temp_user_view

Unnamed: 0,user_id,item_id,overall
0,9240000,35952222,5.0
1,9240000,345724444,4.0


In [322]:
reviewed = temp_user_view.item_id.tolist()

In [323]:
# Reviewed & recomended
recommended_asin = [key for key, val in new_comic_asins.items() if int(val) in comic_user_test]
reviewed_asin = [key for key, val in new_comic_asins.items() if int(val) in reviewed]

In [326]:
comic_names = pd.read_csv('data/all_comic_asin.csv', index_col=0)
comic_names.loc[comic_names['asin'].isin(recommended_asin), 'title']

292                                   Fables Encyclopedia
279     Thor Visionaries - Walt Simonson, Vol. 2 (Marv...
553     Essential Tomb of Dracula, Vol. 3 (Marvel Esse...
1247                      Ultimate Fantastic Four, Vol. 3
2975                   Black Widow: The Itsy-Bitsy Spider
3064                          Deadpool Classic - Volume 6
3567    S.H.I.E.L.D. by Jim Steranko: The Complete Col...
78                                                 Empire
516                            Penguin Revolution: VOL 02
669                          Penguin Revolution: Volume 3
Name: title, dtype: object

In [327]:
comic_names.loc[comic_names['asin'].isin(reviewed_asin), 'title']

1147    Amazing Spider-Man Omnibus, Vol. 1 (v. 1)
Name: title, dtype: object

In [None]:
def new_user_recs(user_id,new_ratings,rating_df,movie_title_df,num_recs):
    # turn the new_recommendations list into a spark DataFrame
    new_user_ratings = spark.createDataFrame(new_ratings,rating_df.columns)
    
    # combine the new ratings df with the rating_df
    movie_ratings_combined = rating_df.union(new_user_ratings)
    
    # split the dataframe into a train and test set
#     (training, test) = movie_ratings_combined.randomSplit([0.8, 0.2],seed=0)
    
    # create an ALS model and fit it
    als = ALS(maxIter=5,rank=50, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
    model = als.fit(movie_ratings_combined)
    
    # make recommendations for all users using the recommendForAllUsers method
    recommendations = model.recommendForAllUsers(num_recs)
    
    # get recommendations specifically for the new user that has been added to the DataFrame
    recs_for_user = recommendations.where(recommendations.userId == user_id).take(1)
    
    for ranking, (movie_id, rating) in enumerate(recs_for_user[0]['recommendations']):
        movie_string = name_retriever(movie_id,movie_title_df)
        print('Recommendation {}: {}  | predicted score :{}'.format(ranking+1,movie_string,rating))

In [None]:
titles = ['B0034G4P80', '0793906091', 'B002VPE1AW', 'B004LWZWFQ', 'B008JFUQZ2']