In [111]:
import pandas as pd
import re

import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import StringType, IntegerType

from batcave import new_id_dictionary, get_missing_titles, get_recommendations, get_user_reviews

In [9]:
spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

### Loading comic and movie reviews

In [17]:
comic_reviews = spark.read.json('data/comic_reviews_wtitle.json')

In [18]:
movie_reviews = spark.read.json('data/movie_reviews_wtitle.json')

In [19]:
comic_reviews.show(5)

+----------+--------------------+-------+--------------+--------------------+
|      asin|               imUrl|overall|    reviewerID|               title|
+----------+--------------------+-------+--------------+--------------------+
|0345507460|http://ecx.images...|    5.0| ACO26JQ366659|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A34C35QFA4DC5J|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A3TII4RKU0ZVT4|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A1LR4Z5Z0MPYIF|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A16L43DIFSHGMQ|The Dresden Files...|
+----------+--------------------+-------+--------------+--------------------+
only showing top 5 rows



### Creating new ids
Since the Amazon user and item ids contain both letters and numbers, I needed to give them new values for a couple reasons:
* The Spark ALS model will only take labels that are numeric and will cause an error otherwise.
* I can create ids for the comic books and movies/tv that are easily identifiable and easy to filter on.

In [20]:
new_user_ids = new_id_dictionary(comic_reviews, 'reviewerID', '00')
new_comic_asins = new_id_dictionary(comic_reviews, 'asin', '22')
new_mtv_asins = new_id_dictionary(movie_reviews, 'asin', '44')

In [21]:
all_items_asins = new_comic_asins
all_items_asins.update(new_mtv_asins)

In [22]:
udfUserId = F.udf(lambda x: new_user_ids[x], StringType())
udfItemId = F.udf(lambda x: all_items_asins[x], StringType())

In [23]:
comic_reviews_updated = comic_reviews.withColumn("item_id", udfItemId("asin"))
comic_reviews_updated = comic_reviews_updated.withColumn("user_id", udfUserId("reviewerID"))

In [24]:
comic_reviews_updated.show(1)

+----------+--------------------+-------+-------------+--------------------+-------+-------+
|      asin|               imUrl|overall|   reviewerID|               title|item_id|user_id|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
|0345507460|http://ecx.images...|    5.0|ACO26JQ366659|The Dresden Files...| 509122| 125700|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
only showing top 1 row



In [143]:
movie_reviews_updated = movie_reviews.withColumn("item_id", udfItemId("asin"))
movie_reviews_updated = movie_reviews_updated.withColumn("user_id", udfUserId("reviewerID"))

In [144]:
movie_reviews_updated.show(1)

+----------+--------------------+-------+-------------+--------------------+-------+-------+
|      asin|               imUrl|overall|   reviewerID|               title|item_id|user_id|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
|0767807693|http://ecx.images...|    3.0|ADENUJJYKNHPO|Requiem for a Hea...|2707744| 215000|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
only showing top 1 row



### Narrowing my dataset
To help improve the accuracy of my model, I wanted to look through removing some of the items and users that may not offer much insight because of their sparsity. Here I explored some of these features to help me decide on what to do.

#### Removing users who have rated less than average in either comics or movies

In [145]:
query = """
SELECT 
    item_id 
,   COUNT(*) as count 
FROM 
    table 
GROUP BY item_id
ORDER BY count desc"""

In [146]:
comic_reviews_updated.createOrReplaceTempView('table')
comic_low_reviewers = spark.sql(query).toPandas()

In [147]:
low_reviews = comic_low_reviewers[comic_low_reviewers['count'] \
                                  <= comic_low_reviewers['count'].mean()]
print(f"Amount of items with less than average amount of comic reviews: {low_reviews.shape[0]}")

Amount of items with less than average amount of comic reviews: 4129


In [148]:
movie_reviews_updated.createOrReplaceTempView('table')
mtv_low_reviewers = spark.sql(query).toPandas()

In [149]:
low_mtv_reviews = mtv_low_reviewers[mtv_low_reviewers['count'] \
                                  <= mtv_low_reviewers['count'].mean()]
print(f"Amount of items with less than average amount of movie reviews: {low_mtv_reviews.shape[0]}")

Amount of items with less than average amount of movie reviews: 26036


In [150]:
mtv_low_reviewers.head()

Unnamed: 0,item_id,count
0,234044,170
1,579744,163
2,1900344,141
3,531144,140
4,902244,133


In [151]:
low_item = list(set(low_reviews['item_id'].tolist() + low_mtv_reviews['item_id'].tolist()))
len(low_item)

30165

In [152]:
# Joining and adding review counts
all_review_counts_df = pd.concat([mtv_low_reviewers,comic_low_reviewers])
all_review_counts = spark.createDataFrame(all_review_counts_df)

# Joining comic & movie data
all_reviews = comic_reviews_updated.union(movie_reviews_updated)

# Adding the count column
all_reviews = all_reviews.join(all_review_counts, on='item_id', how='left')

In [153]:
all_reviews.show(5)

+-------+----------+--------------------+-------+--------------+--------------------+-------+-----+
|item_id|      asin|               imUrl|overall|    reviewerID|               title|user_id|count|
+-------+----------+--------------------+-------+--------------+--------------------+-------+-----+
|1003644|B0000ADXG8|http://ecx.images...|    1.0| A9XKE4OE48BNK|Doctor Who - The ...| 460400|    6|
|1003644|B0000ADXG8|http://ecx.images...|    4.0| AN9J46667D80O|Doctor Who - The ...| 450600|    6|
|1003644|B0000ADXG8|http://ecx.images...|    5.0|A2P49WD75WHAG5|Doctor Who - The ...| 564200|    6|
|1003644|B0000ADXG8|http://ecx.images...|    3.0| A9ZAL2YHXSMFF|Doctor Who - The ...| 805400|    6|
|1003644|B0000ADXG8|http://ecx.images...|    4.0|A27P0MW8TE1JQP|Doctor Who - The ...| 384700|    6|
|1003644|B0000ADXG8|http://ecx.images...|    4.0|A3TRXPRUYVOLSM|Doctor Who - The ...| 859500|    6|
| 101122|1401219349|http://ecx.images...|    2.0|A1ZAJCZHFV7OZD|Superman: Past an...| 314300|    1|


In [154]:
all_reviews_ready = all_reviews.filter(F.col('item_id').isin(low_item) == False)

In [157]:
# Seeing the unique count of users and items
all_reviews_ready.agg(*(F.countDistinct(col(c)).alias(c) for c in all_reviews_ready.columns)).show()

+-------+----+-----+-------+----------+-----+-------+-----+
|item_id|asin|imUrl|overall|reviewerID|title|user_id|count|
+-------+----+-----+-------+----------+-----+-------+-----+
|   7521|7521| 7361|      5|      8545| 5293|   8545|  105|
+-------+----+-----+-------+----------+-----+-------+-----+



In [156]:
# Exporting to temporarily preserve
all_reviews_ready.repartition(1).write.json("data/all_reviews")

### Case of the missing titles
Even though I had limited my set to movies/tv that have metadata, I found that some of the metadata is missing the title for movies, which is a pretty important piece for my further development!  Since I do have the ASINs, I wrote a quick function to find those missing titles by querying Amazon and then returning the first only results title. My process was as follows:
* Get the ASINs for data missing titles
* Run function on those ASINs to return a title
* Do some clean up of titles
* Add titles back to original data and drop old listing
* Export all_reviews again

In [9]:
all_reviews = spark.read.json('data/all_reviews.json')

In [10]:
# Getting the ASINS from products missing titles
missing_titles = all_reviews.select(['asin','title']).toPandas()

missing_asins = list(set(missing_titles.loc[missing_titles['title'].isna(), 'asin'].tolist()))

In [11]:
print(f" Records missing name: {len(missing_asins)}")

 Records missing name: 2205


In [20]:
# Using a function to scrape Amazon for the title - Takes a few hours to complete
missing_title_info = get_missing_titles(missing_asins[:250])

In [22]:
# Drop these into a dataframe to inspect
missing_df = pd.DataFrame(missing_title_info)

In [40]:
missing_df.to_csv('data/missing_titles.csv')

In [5]:
missing_df = pd.read_csv('data/missing_titles.csv')

In [6]:
# Still had some that returned no name, so ignoring for now and will drop from all reviews
all_the_good = missing_df[missing_df['title'] != 'None found']

In [12]:
#Setting ASIN as index to do replace with original set
all_the_good.set_index('asin', inplace=True)
missing_titles.set_index('asin', inplace=True)
all_the_good.head()

Unnamed: 0_level_0,Unnamed: 0,title
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
B0023BZ65S,0,Big Man Japan
B00404ME2E,1,Space Precinct 2040: The Complete Series
B000WZEZFY,3,Superbad
B009VL28W2,4,Quebec Magnetic
B00BN3ED8I,5,Movie 43


In [13]:
# Combining to replace missing titles
missing_titles = missing_titles.combine_first(all_the_good)

# Resetting the index and changing title column name 
missing_titles.reset_index(inplace=True)

In [15]:
# Getting a list of all without a title still
check_missing_asins = list(set(missing_df.loc[missing_titles['title'].isna(), 'asin'].tolist()))

# Dropping rows with reviews that have no title from original dataframe
all_reviews_less_missing_titles = all_reviews.filter(F.col('asin').isin(check_missing_asins)==False)

# Dropping rows from my temporary dataframe with correct names
fix_titles = missing_titles.drop(missing_titles[missing_titles['title'].isna()].index)

In [17]:
# Some text cleanup on titles. There may be more later, but these are the initial examples I found:
potential_regs = """\[VHS\]|\[DVD\]|Collector\'s Edition|\: Season \d+
                    |\: The Complete Series|\: Complete Series|\(.*\)"""

fix_titles['title'] = fix_titles['title']\
                         .apply(lambda x: re.sub(potential_regs, '', x))

In [20]:
fix_titles.drop('Unnamed: 0', axis=1, inplace=True)
fix_titles.head()

Unnamed: 0,asin,title
0,5119367,Joseph
1,5119367,Joseph
2,5119367,Joseph
3,5119367,Joseph
4,307142469,Frosty the Snowman


In [21]:
# Create Spark dataframe from dataframe with corrected titles
fixed_titles_spark = spark.createDataFrame(fix_titles)

In [22]:
# Combining & exporting
all_reviews_no_title = all_reviews_less_missing_titles.select(['asin',
                                                               'count',
                                                               'imUrl',
                                                               'item_id',
                                                               'overall',
                                                               'reviewerID',
                                                               'user_id'])

all_reviews_with_fixed_titles = all_reviews_no_title.join(fixed_titles_spark,
                                                                     on='asin',
                                                                     how='left')

In [23]:
all_reviews_with_fixed_titles.show(10)

+----------+-----+--------------------+-------+-------+-------------+-------+--------------------+
|      asin|count|               imUrl|item_id|overall|   reviewerID|user_id|               title|
+----------+-----+--------------------+-------+-------+-------------+-------+--------------------+
|0345507460|   57|http://ecx.images...| 509122|    5.0|ACO26JQ366659| 125700|The Dresden Files...|
|0345507460|   57|http://ecx.images...| 509122|    5.0|ACO26JQ366659| 125700|The Dresden Files...|
|0345507460|   57|http://ecx.images...| 509122|    5.0|ACO26JQ366659| 125700|The Dresden Files...|
|0345507460|   57|http://ecx.images...| 509122|    5.0|ACO26JQ366659| 125700|The Dresden Files...|
|0345507460|   57|http://ecx.images...| 509122|    5.0|ACO26JQ366659| 125700|The Dresden Files...|
|0345507460|   57|http://ecx.images...| 509122|    5.0|ACO26JQ366659| 125700|The Dresden Files...|
|0345507460|   57|http://ecx.images...| 509122|    5.0|ACO26JQ366659| 125700|The Dresden Files...|
|034550746

In [24]:
all_reviews_with_fixed_titles.repartition(1).write.json('data/all_reviews_corrections')

### Modeling and testing

In [64]:
all_reviews  = spark.read.json('data/all_reviews_fixed_titles.json')

In [20]:
all_reviews.persist()

DataFrame[asin: string, count: bigint, imUrl: string, item_id: string, overall: double, reviewerID: string, title: string, user_id: string]

In [24]:
als_ready = all_reviews.select([F.col("user_id").cast(IntegerType()),
                                  F.col("item_id").cast(IntegerType()),
                                  F.col("overall"), F.col("title"), F.col("imUrl")])

In [25]:
(train, test) = als_ready.randomSplit([.8,.2])

In [36]:
# Build the recommendation model using ALS
als = ALS(rank=5, regParam=0.1, maxIter=10, 
          userCol='user_id', itemCol='item_id', 
          ratingCol='overall', nonnegative=True)

als_model = als.fit(train)

In [37]:
# Getting predictions for test split
test_pred = als_model.transform(test)

# Filling in NaN values with average score
test_pred_df = test_pred.toPandas()
test_pred_df['prediction'].fillna(3, inplace=True)
test_pred = spark.createDataFrame(test_pred_df)

In [38]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall",
                                predictionCol="prediction")

evaluator_2 = RegressionEvaluator(metricName="mae", labelCol="overall",
                                predictionCol="prediction")

rmse_test = evaluator.evaluate(test_pred)
mae_test = evaluator_2.evaluate(test_pred)
print(f"Test RMSE: {rmse_test}")
print(f"Test MAE: {mae_test}")

Test RMSE: 1.225749805772019
Test MAE: 0.9468618307601508


In [61]:
new_user = get_user_reviews()

In [63]:
new_user

Unnamed: 0,count,item_id,overall,title,user_id
0,42,3167844,3,Charlie and the Chocolate Factory,101
1,133,902244,4,The Lord of the Rings: The Motion Picture Tril...,101
2,25,889244,2,Bridesmaids,101
3,53,1789744,4,Harry Potter and the Goblet of Fire [HD DVD],101
4,36,2785144,1,Alice in Wonderland,101
5,27,990644,5,Sherlock: Season 3,101
6,43,1114344,3,Harry Potter and the Chamber of Secrets,101
7,27,2148644,1,The Great Gatsby,101
8,49,1353544,2,The Wizard of Oz,101
9,29,1621744,1,Little Miss Sunshine [Region 2],101


In [62]:
get_recommendations(new_user)

FBP: Federal Bureau of Physics Vol. 1: The Paradigm Shift
Elfquest: The Discovery
The Green Lantern Chronicles Vol. 1


In [132]:
top_movie_url_query = """
SELECT
    DISTINCT asin
,   imUrl
,   count
FROM
    all_reviews
WHERE 
    item_id LIKE "%44"
ORDER BY 
    count DESC
LIMIT 1000
"""

In [133]:
all_reviews.createOrReplaceTempView('all_reviews')
image_urls = spark.sql(top_movie_url_query).toPandas()

In [134]:
image_urls.head()

Unnamed: 0,asin,imUrl,count
0,B001KVZ6HK,http://ecx.images-amazon.com/images/I/617wvf8S...,170
1,B008JFUPFI,http://ecx.images-amazon.com/images/I/51vg6U6f...,163
2,B00005JPY0,http://ecx.images-amazon.com/images/I/51pGxSkk...,141
3,B009934S5M,http://ecx.images-amazon.com/images/I/51K7eJ6I...,140
4,B0001VL0K2,http://ecx.images-amazon.com/images/I/51z4rwl-...,133


In [135]:
missing_urls = image_urls[image_urls['imUrl'].isna()]
active_urls = image_urls[image_urls['imUrl'].isna() == False]

In [137]:
import urllib

In [143]:
active_urls.loc[0,'imUrl'][-4:]

'.jpg'

In [154]:
active_urls.head()

Unnamed: 0,asin,imUrl,count
0,B001KVZ6HK,http://ecx.images-amazon.com/images/I/617wvf8S...,170
1,B008JFUPFI,http://ecx.images-amazon.com/images/I/51vg6U6f...,163
2,B00005JPY0,http://ecx.images-amazon.com/images/I/51pGxSkk...,141
3,B009934S5M,http://ecx.images-amazon.com/images/I/51K7eJ6I...,140
4,B0001VL0K2,http://ecx.images-amazon.com/images/I/51z4rwl-...,133


In [156]:
for index, movie in active_urls.iterrows():
    url = movie['imUrl']
    filename = 'images/' + movie['asin'] + movie['imUrl'][-4:]
    urllib.request.urlretrieve(url, filename)

* Remove any titles I don't want to feature
* Do further name cleaning
* Figure out way to get featured images for both
* Hulk vs?, Iron Man: The Art of Iron Man 2
* Bad name features still in place: (DVD), (Widescreen Edition) - anything in parenthesis! [Blu-ray]

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

als = ALS(userCol='user_id', itemCol='item_id', ratingCol='overall', nonnegative=True)

reg_test = RegressionEvaluator(predictionCol='prediction', labelCol='overall')

# create the parameter grid              
params = ParamGridBuilder().addGrid(als.regParam, [0.01,0.001,0.1])\
                           .addGrid(als.rank, [4,10,50])\
                           .addGrid(als.maxIter, [5,10,15,20]).build()
             
## instantiating crossvalidator estimator
cv = CrossValidator(estimator=als, estimatorParamMaps=params,evaluator=reg_test,parallelism=4)
best_model = cv.fit(train)