In [1]:
import numpy as np
import pandas as pd
import re

import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import StringType, IntegerType
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from batcave.scrape_and_clean import new_id_dictionary, get_missing_titles
from batcave.recommend import get_recommendations, get_user_reviews_testing

In [2]:
spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

### Loading comic and movie reviews

In [3]:
comic_reviews = spark.read.json('data/comic_reviews_wtitle.json')

In [4]:
movie_reviews = spark.read.json('data/movie_and_video_wtitles.json')

In [5]:
comic_reviews.show(5)

+----------+--------------------+-------+--------------+--------------------+
|      asin|               imUrl|overall|    reviewerID|               title|
+----------+--------------------+-------+--------------+--------------------+
|0345507460|http://ecx.images...|    5.0| ACO26JQ366659|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A34C35QFA4DC5J|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A3TII4RKU0ZVT4|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A1LR4Z5Z0MPYIF|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A16L43DIFSHGMQ|The Dresden Files...|
+----------+--------------------+-------+--------------+--------------------+
only showing top 5 rows



### Creating new ids
Since the Amazon user and item ids contain both letters and numbers, I needed to give them new values for a couple reasons:
* The Spark ALS model will only take labels that are numeric and will cause an error otherwise.
* I can create ids for the comic books and movies/tv that are easily identifiable and easy to filter on.

In [6]:
new_user_ids = new_id_dictionary(comic_reviews, 'reviewerID', '00')
new_comic_asins = new_id_dictionary(comic_reviews, 'asin', '22')
new_mtv_asins = new_id_dictionary(movie_reviews, 'asin', '44')

In [7]:
all_items_asins = new_comic_asins
all_items_asins.update(new_mtv_asins)

In [8]:
udfUserId = F.udf(lambda x: new_user_ids[x], StringType())
udfItemId = F.udf(lambda x: all_items_asins[x], StringType())

In [9]:
comic_reviews_updated = comic_reviews.withColumn("item_id", udfItemId("asin"))
comic_reviews_updated = comic_reviews_updated.withColumn("user_id", udfUserId("reviewerID"))

In [10]:
comic_reviews_updated.show(1)

+----------+--------------------+-------+-------------+--------------------+-------+-------+
|      asin|               imUrl|overall|   reviewerID|               title|item_id|user_id|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
|0345507460|http://ecx.images...|    5.0|ACO26JQ366659|The Dresden Files...|   1022| 170700|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
only showing top 1 row



In [11]:
movie_reviews_updated = movie_reviews.withColumn("item_id", udfItemId("asin"))
movie_reviews_updated = movie_reviews_updated.withColumn("user_id", udfUserId("reviewerID"))

In [12]:
movie_reviews_updated.show(1)

+----------+--------------------+-------+-------------+--------------------+-------+-------+
|      asin|               imUrl|overall|   reviewerID|               title|item_id|user_id|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
|0767807693|http://ecx.images...|    3.0|ADENUJJYKNHPO|Requiem for a Hea...|3308844| 857000|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
only showing top 1 row



### Narrowing my dataset
To help improve the accuracy of my model, I wanted to look through removing some of the items and users that may not offer much insight because of their sparsity. Here I explored some of these features to help me decide on what to do.

#### Removing users who have rated less than average in either comics or movies

In [13]:
query = """
SELECT 
    item_id 
,   COUNT(*) as count 
FROM 
    table 
GROUP BY item_id
ORDER BY count desc"""

In [14]:
comic_reviews_updated.createOrReplaceTempView('table')
comic_low_reviewers = spark.sql(query).toPandas()

In [15]:
low_reviews = comic_low_reviewers[comic_low_reviewers['count'] \
                                  <= comic_low_reviewers['count'].mean()]
print(f"Amount of items with less than average amount of comic reviews: {low_reviews.shape[0]}")

Amount of items with less than average amount of comic reviews: 4129


In [16]:
movie_reviews_updated.createOrReplaceTempView('table')
mtv_low_reviewers = spark.sql(query).toPandas()

In [17]:
low_mtv_reviews = mtv_low_reviewers[mtv_low_reviewers['count'] \
                                  <= mtv_low_reviewers['count'].mean()]
print(f"Amount of items with less than average amount of movie reviews: {low_mtv_reviews.shape[0]}")

Amount of items with less than average amount of movie reviews: 28082


In [18]:
mtv_low_reviewers.head()

Unnamed: 0,item_id,count
0,1384944,170
1,262544,163
2,1181344,141
3,2024944,140
4,3394544,133


In [19]:
low_item = list(set(low_reviews['item_id'].tolist() + low_mtv_reviews['item_id'].tolist()))
len(low_item)

32211

In [20]:
# Joining and adding review counts
all_review_counts_df = pd.concat([mtv_low_reviewers,comic_low_reviewers])
all_review_counts = spark.createDataFrame(all_review_counts_df)

# Joining comic & movie data
all_reviews = comic_reviews_updated.union(movie_reviews_updated)

# Adding the count column
all_reviews = all_reviews.join(all_review_counts, on='item_id', how='left')

In [21]:
all_reviews.show(5)

+-------+----------+--------------------+-------+--------------+--------------------+-------+-----+
|item_id|      asin|               imUrl|overall|    reviewerID|               title|user_id|count|
+-------+----------+--------------------+-------+--------------+--------------------+-------+-----+
|1003644|6302241235|http://ecx.images...|    5.0|A11MJ9C49FAVE7|Watch the Birdie ...| 832700|    1|
| 101122|0785102205|http://ecx.images...|    3.0|A29LH2W58UBG04| Best of Marvel 1996|  67900|    1|
|1027644|6303215653|http://ecx.images...|    5.0|A2KWQ64TRHB3YH|Inspector Morse -...| 365200|    1|
|1028644|6302593093|http://ecx.images...|    5.0|A31TVT9DSU5HL1|A Very Brady Chri...| 212100|    1|
| 102944|B00AY7B7EM|http://ecx.images...|    5.0|A31TVT9DSU5HL1|                null| 212100|    2|
+-------+----------+--------------------+-------+--------------+--------------------+-------+-----+
only showing top 5 rows



In [22]:
all_reviews_ready = all_reviews.filter(F.col('item_id').isin(low_item) == False)

In [23]:
# Seeing the unique count of users and items
all_reviews_ready.agg(*(F.countDistinct(F.col(c)).alias(c) for c in all_reviews_ready.columns)).show()

+-------+----+-----+-------+----------+-----+-------+-----+
|item_id|asin|imUrl|overall|reviewerID|title|user_id|count|
+-------+----+-----+-------+----------+-----+-------+-----+
|   7872|7872| 7362|      5|      8565| 5294|   8565|  105|
+-------+----+-----+-------+----------+-----+-------+-----+



In [24]:
# Exporting to temporarily preserve
all_reviews_ready.repartition(1).write.json("data/all_reviews")

### Case of the missing titles
Even though I had limited my set to movies/tv that have metadata, I found that some of the metadata is missing the title for movies, which is a pretty important piece for my further development!  Since I do have the ASINs, I wrote a quick function to find those missing titles by querying Amazon and then returning the first only results title. My process was as follows:
* Get the ASINs for data missing titles
* Run function on those ASINs to return a title
* Do some clean up of titles
* Add titles back to original data and drop old listing
* Export all_reviews again

In [25]:
all_reviews = spark.read.json('data/all_reviews.json')

In [26]:
# Getting the ASINS from products missing titles
missing_titles = all_reviews.select(['asin','title']).toPandas()

missing_asins = list(set(missing_titles.loc[missing_titles['title'].isna(), 'asin'].tolist()))

In [27]:
print(f" Records missing name: {len(missing_asins)}")

 Records missing name: 2205


In [28]:
# Using a function to scrape Amazon for the title - Takes a few hours to complete
missing_title_info = get_missing_titles(missing_asins)

In [29]:
# Drop these into a dataframe to inspect
missing_df = pd.DataFrame(missing_title_info)

In [30]:
missing_df.head()

Unnamed: 0,asin,title
0,B0055UA1KO,Beast Wars: Transformers - The Complete Series
1,B00G7QPXAI,Austenland
2,B006GHA9QQ,Andrew Lloyd Webber's Love Never Dies
3,B000UNYJVI,Welcome to the Jungle
4,B0034G4P94,Warrior


In [31]:
missing_df.to_csv('data/missing_titles.csv')

In [32]:
missing_df = pd.read_csv('data/missing_titles.csv')

In [33]:
# Still had some that returned no name, so ignoring for now and will drop from all reviews
all_the_good = missing_df[missing_df['title'] != 'None found']

In [34]:
#Setting ASIN as index to do replace with original set
all_the_good.set_index('asin', inplace=True)
missing_titles.set_index('asin', inplace=True)
all_the_good.head()

Unnamed: 0_level_0,Unnamed: 0,title
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
B0055UA1KO,0,Beast Wars: Transformers - The Complete Series
B00G7QPXAI,1,Austenland
B006GHA9QQ,2,Andrew Lloyd Webber's Love Never Dies
B000UNYJVI,3,Welcome to the Jungle
B0034G4P94,4,Warrior


In [35]:
# Combining to replace missing titles
missing_titles = missing_titles.combine_first(all_the_good)

# Resetting the index and changing title column name 
missing_titles.reset_index(inplace=True)

In [36]:
# Getting a list of all without a title still
check_missing_asins = list(set(missing_df.loc[missing_titles['title'].isna(), 'asin'].tolist()))

# Dropping rows with reviews that have no title from original dataframe
all_reviews_less_missing_titles = all_reviews.filter(F.col('asin').isin(check_missing_asins)==False)

# Dropping rows from my temporary dataframe with correct names
fix_titles = missing_titles.drop(missing_titles[missing_titles['title'].isna()].index)

In [37]:
# Some text cleanup on titles. There may be more later, but these are the initial examples I found:
potential_regs = """\[VHS\]|\[DVD\]|Collector\'s Edition|\: Season \d+
                    |\: The Complete Series|\: Complete Series|\(.*\)"""

fix_titles['title'] = fix_titles['title']\
                         .apply(lambda x: re.sub(potential_regs, '', x))

In [38]:
fix_titles.drop('Unnamed: 0', axis=1, inplace=True)
fix_titles.head()

Unnamed: 0,asin,title
0,5119367,Joseph
1,5119367,Joseph
2,5119367,Joseph
3,5119367,Joseph
4,307142469,Frosty the Snowman


In [39]:
# Create Spark dataframe from dataframe with corrected titles
fixed_titles_spark = spark.createDataFrame(fix_titles)

In [42]:
# Combining & exporting
all_reviews_no_title = all_reviews_less_missing_titles.select(['asin',
                                                               'count',
                                                               'imUrl',
                                                               'item_id',
                                                               'overall',
                                                               'reviewerID',
                                                               'user_id'])

all_reviews_with_fixed_titles = all_reviews_no_title.join(fixed_titles_spark,
                                                                     on='asin',
                                                                     how='left').dropDuplicates()

In [43]:
all_reviews_with_fixed_titles.show(10)

+----------+-----+--------------------+-------+-------+--------------+-------+--------------------+
|      asin|count|               imUrl|item_id|overall|    reviewerID|user_id|               title|
+----------+-----+--------------------+-------+-------+--------------+-------+--------------------+
|0345507460|   57|http://ecx.images...|   1022|    5.0| ACO26JQ366659| 170700|The Dresden Files...|
|0345507460|   57|http://ecx.images...|   1022|    5.0|A34C35QFA4DC5J| 399200|The Dresden Files...|
|0345507460|   57|http://ecx.images...|   1022|    5.0|A3TII4RKU0ZVT4| 440000|The Dresden Files...|
|0345507460|   57|http://ecx.images...|   1022|    5.0|A1LR4Z5Z0MPYIF| 578400|The Dresden Files...|
|0345507460|   57|http://ecx.images...|   1022|    5.0|A16L43DIFSHGMQ| 542700|The Dresden Files...|
|0345507460|   57|http://ecx.images...|   1022|    5.0| ANRS196NKFVUU|  81300|The Dresden Files...|
|0345507460|   57|http://ecx.images...|   1022|    1.0|A30BEBBQ3UYI7O| 446000|The Dresden Files...|


In [44]:
all_reviews_with_fixed_titles.repartition(1).write.json('data/all_reviews_fixed_titles')

### Modeling and testing

In [45]:
all_reviews  = spark.read.json('data/all_reviews_fixed_titles.json')

In [49]:
als_ready = all_reviews.select([F.col("user_id").cast(IntegerType()),
                                  F.col("item_id").cast(IntegerType()),
                                  F.col("overall"), F.col("title"), F.col("imUrl")])

In [50]:
(train, test) = als_ready.randomSplit([.8,.2])

In [51]:
# Build the recommendation model using ALS
als = ALS(rank=50, regParam=.1, maxIter=20,
          userCol='user_id', itemCol='item_id', 
          ratingCol='overall', nonnegative=True)

als_model = als.fit(train)

In [52]:
test_pred_df = als_model.transform(test).toPandas()

# Filling in NaN values with average score for metric review
test_pred_df['prediction'].fillna(4, inplace=True)
test_pred = spark.createDataFrame(test_pred_df)

# Get RMSE & MAE for model
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall",
                                predictionCol="prediction")

evaluator_2 = RegressionEvaluator(metricName="mae", labelCol="overall",
                                predictionCol="prediction")

rmse_test = evaluator.evaluate(test_pred)
mae_test = evaluator_2.evaluate(test_pred)
print(f"Test RMSE: {rmse_test}")
print(f"Test MAE: {mae_test}")

Test RMSE: 1.1770851592752343
Test MAE: 0.9311017282244214


### Parameter tuning for optimization
I kept running into errors with my parameter grid to cross validate below, so ran several tests as well varying parameters and landed on my best model using the configuration found in the above model test.  Below is an example of the parameters this was based on.

In [None]:
als = ALS(userCol='user_id', itemCol='item_id', ratingCol='overall', nonnegative=True)

reg_test = RegressionEvaluator(predictionCol='prediction', labelCol='overall')

# Parameter grid              
params = ParamGridBuilder().addGrid(als.regParam, [0.01,0.001,0.1])\
                           .addGrid(als.rank, [4,10,50])\
                           .addGrid(als.maxIter, [5,10,15,20]).build()
             
## Calling and checking evaluator
cv = CrossValidator(estimator=als, estimatorParamMaps=params,evaluator=reg_test,parallelism=4)
best_model = cv.fit(train)

### Returning recommendations
To get recommendations for new users, I first need to get the item features from my ALS model, save them with titles, and then can use that file to create and build recommendations.
Below is an example of the two functions created to perform the recommendation operation, just on the Notebook level:
* ```get_user_reviews_testing```: gives an individual a selection of movies to choose from, once they have rated at least 10, it creates and returns a dataframe with their scores.
* ```get_recommedations```: returns the top 10 comic books recommended to the user based on their input.

These functions will be used as the basis for creating a functional web application for users to explore.

In [49]:
# Build the recommendation model using ALS
als = ALS(rank=50, regParam=.1, maxIter=20,
          userCol='user_id', itemCol='item_id', 
          ratingCol='overall', nonnegative=True)

als_model = als.fit(als_ready)

In [50]:
# Get item factors from model
item_factors = als_model.itemFactors.toPandas()
item_factors.rename(columns={'id': 'item_id'}, inplace=True)
item_factors['item_id'] = item_factors['item_id'].astype(str)

In [51]:
# Get titles from original dataset
item_titles = all_reviews.select(['item_id', 'title', 'count', 'asin']).distinct().toPandas()
item_titles['item_id'] = item_titles['item_id'].astype(str)

In [52]:
# Merge into one dataframe & making the ids string for exploration later
item_details = item_factors.merge(item_titles, on='item_id', how='left')

In [None]:
# Exporting to preserve item factor details
item_details.to_json('data/als_item_factor_details.json')

In [53]:
item_df = pd.read_json('data/als_item_factor_details.json')

In [54]:
user_test = get_user_reviews_testing(item_df)

In [55]:
user_test.head()

Unnamed: 0,item_id,rating
0,715744,4
1,110544,2
2,2732444,3
3,2544244,4
4,1584644,1


In [57]:
recs = get_recommendations(item_factors_df=item_df, new_user_df=user_test)

In [58]:
recs

Unnamed: 0,item_id,title,asin,new_user_predictions
624,263422,Roots of the Swamp Thing,1401222366,4.117454
1256,524022,"Marvel Masterworks: The Sub-Mariner, Vol. 1",785108750,4.042103
173,74722,Spider-Man: Death of the Stacys,785125043,3.972947
1278,533422,Unwritten Vol. 1: Tommy Taylor and the Bogus I...,1401225659,3.941805
798,336622,Batman: Arkham Asylum A Serious House on Seri...,1401204244,3.886486


Looking good! This finishes my data prep and modeling phase.

### Understanding how to fold a new user into the model
Below is a walkthrough of getting recommendations for a user by using the existing item features and the ratings from the new user. 

In [59]:
# Getting item features from the model and setting the index to the item_id
item_factors = als_model.itemFactors.toPandas()
item_factors.index = item_factors['id']
item_factors['id'] = item_factors['id'].astype(str)

In [60]:
# Looking for some movies to rate, all movies id ends in 44
item_factors.loc[item_factors['id'].str.endswith('44')].head()

Unnamed: 0_level_0,id,features
id,Unnamed: 1_level_1,Unnamed: 2_level_1
244,244,"[0.3117918372154236, 0.3253779411315918, 0.140..."
1844,1844,"[0.1884763538837433, 0.4734590947628021, 0.523..."
3344,3344,"[0.0886940136551857, 0.354856938123703, 0.3612..."
3744,3744,"[0.0, 0.18895338475704193, 0.4126276671886444,..."
3844,3844,"[0.3835531771183014, 0.5762662291526794, 0.316..."


In [61]:
# Creating a random user with a set of ratings
user = [{'id':244, 'rating':4},{'id': 1844, 'rating': 3}, {'id': 3344, 'rating': 3}, {'id': 3744, 'rating': 3}]
user_df = pd.DataFrame(user)
user_df

Unnamed: 0,id,rating
0,244,4
1,1844,3
2,3344,3
3,3744,3


In [62]:
# Create lists of each column to calculate user matrix
item_ids = user_df.id.tolist()
user_rating = user_df.rating.tolist()

In [63]:
# User ratings 
all_ratings_array = np.array((user_rating,)).T

# Get item features for these specific movies
all_items_array = np.zeros(shape=(len(item_ids), 50))

for index, item in enumerate(item_ids):
    all_items_array[index, :] = np.array(item_factors.loc[item, 'features'])

In [64]:
# Checking the shape of each to make sure things are looking right
all_ratings_array.shape, all_items_array.shape

((4, 1), (4, 50))

In [65]:
# Least squares solution to get user features
new_user_matrix = np.linalg.lstsq(all_items_array, all_ratings_array, rcond=None)

# New users matrix!
new_user_matrix = new_user_matrix[0].reshape((50,))
new_user_matrix.shape

(50,)

In [66]:
# Checking based on known score
known_item = np.array(item_factors.loc[244,'features'])

score = np.dot(new_user_matrix, known_item)
score

4.000000000000001

Nice! Now I am going to user that factor dataframe to create predictions for this user

In [67]:
item_factors['new_user_predictions'] = item_factors['features'].apply(lambda x: np.dot(x, new_user_matrix))

In [68]:
top_five_comics = item_factors.loc[item_factors['id']\
                              .str.endswith('22'), 'new_user_predictions']\
                              .sort_values(ascending=False)[:5]

In [69]:
top_five_comics

id
217822    4.390951
261622    4.166066
404822    4.159146
330822    4.155326
500622    4.112653
Name: new_user_predictions, dtype: float64

Perfect! I can now functionize this principle and apply to getting speedy recommendations for my new users.