In [100]:
import pandas as pd

import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import StringType, IntegerType

from IPython.display import clear_output

from batcave import new_id_dictionary, get_missing_titles

In [2]:
# Create (or reuse) a Spark session that runs with the "local" master.
spark = pyspark.sql.SparkSession.builder.master("local").getOrCreate()

### Loading comic and movie reviews

In [17]:
# Read the comic review records from a JSON file into a Spark DataFrame.
comic_reviews = spark.read.json('data/comic_reviews_wtitle.json')

In [18]:
# Read the movie/TV review records from a JSON file into a Spark DataFrame.
movie_reviews = spark.read.json('data/movie_reviews_wtitle.json')

In [19]:
# Preview the first five comic review rows.
comic_reviews.show(5)

+----------+--------------------+-------+--------------+--------------------+
|      asin|               imUrl|overall|    reviewerID|               title|
+----------+--------------------+-------+--------------+--------------------+
|0345507460|http://ecx.images...|    5.0| ACO26JQ366659|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A34C35QFA4DC5J|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A3TII4RKU0ZVT4|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A1LR4Z5Z0MPYIF|The Dresden Files...|
|0345507460|http://ecx.images...|    5.0|A16L43DIFSHGMQ|The Dresden Files...|
+----------+--------------------+-------+--------------+--------------------+
only showing top 5 rows



### Creating new ids
Since the Amazon user and item ids contain both letters and numbers, I needed to give them new values for a couple of reasons:
* The Spark ALS model will only take labels that are numeric and will cause an error otherwise.
* I can create ids for the comic books and movies/tv that are easily identifiable and easy to filter on.

In [20]:
# Build replacement ids with the project helper `new_id_dictionary` (imported from batcave).
# Judging by the ids displayed further down (users end in 00, comics in 22, movies/TV
# in 44), the third argument appears to become the last two digits of each new id
# -- TODO confirm against the helper's implementation.
# NOTE(review): user ids are built from comic_reviews only, yet the same mapping is
# later applied to movie_reviews; this presumably assumes every movie reviewer also
# appears in the comic reviews -- verify.
new_user_ids = new_id_dictionary(comic_reviews, 'reviewerID', '00')
new_comic_asins = new_id_dictionary(comic_reviews, 'asin', '22')
new_mtv_asins = new_id_dictionary(movie_reviews, 'asin', '44')

In [21]:
# Merge the comic and movie/TV id mappings into a single lookup table.
# A new dict is built on purpose: plain assignment followed by .update() would make
# all_items_asins an alias of new_comic_asins and silently add the movie ids to it.
# (Assumes new_id_dictionary returns a dict-like mapping -- TODO confirm.)
all_items_asins = {**new_comic_asins, **new_mtv_asins}

In [22]:
# Spark UDFs that translate an original Amazon id into its new id by looking it up
# in the dictionaries built above; both are declared to return a string column.
# NOTE(review): an id that is absent from the dictionary will presumably raise
# KeyError inside the lambda when the column is evaluated.
udfUserId = F.udf(lambda x: new_user_ids[x], StringType())
udfItemId = F.udf(lambda x: all_items_asins[x], StringType())

In [23]:
# Attach the new item and user ids to every comic review.
comic_reviews_updated = (
    comic_reviews
    .withColumn("item_id", udfItemId("asin"))
    .withColumn("user_id", udfUserId("reviewerID"))
)

In [24]:
# Check one row to confirm the item_id / user_id columns were added.
comic_reviews_updated.show(1)

+----------+--------------------+-------+-------------+--------------------+-------+-------+
|      asin|               imUrl|overall|   reviewerID|               title|item_id|user_id|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
|0345507460|http://ecx.images...|    5.0|ACO26JQ366659|The Dresden Files...| 509122| 125700|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
only showing top 1 row



In [143]:
# Attach the new item and user ids to every movie/TV review.
movie_reviews_updated = (
    movie_reviews
    .withColumn("item_id", udfItemId("asin"))
    .withColumn("user_id", udfUserId("reviewerID"))
)

In [144]:
# Check one row to confirm the item_id / user_id columns were added.
movie_reviews_updated.show(1)

+----------+--------------------+-------+-------------+--------------------+-------+-------+
|      asin|               imUrl|overall|   reviewerID|               title|item_id|user_id|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
|0767807693|http://ecx.images...|    3.0|ADENUJJYKNHPO|Requiem for a Hea...|2707744| 215000|
+----------+--------------------+-------+-------------+--------------------+-------+-------+
only showing top 1 row



### Narrowing my dataset
To help improve the accuracy of my model, I wanted to look into removing some of the items and users that may not offer much insight because of their sparsity. Here I explored some of these features to help me decide what to do.

#### Removing items that have fewer reviews than average in either comics or movies

In [145]:
# SQL that counts the rows per item_id in whatever DataFrame is currently registered
# as the temp view named 'table', ordered from most to least rows.
query = """
SELECT 
    item_id 
,   COUNT(*) as count 
FROM 
    table 
GROUP BY item_id
ORDER BY count desc"""

In [146]:
# Register the comic reviews as the 'table' view and pull the per-item row counts
# into pandas. Despite the variable name, each row here is an item, not a reviewer.
comic_reviews_updated.createOrReplaceTempView('table')
comic_low_reviewers = spark.sql(query).toPandas()

In [147]:
# Keep only the comic items whose review count is at or below the mean count.
comic_count_mean = comic_low_reviewers['count'].mean()
is_low_comic = comic_low_reviewers['count'] <= comic_count_mean
low_reviews = comic_low_reviewers[is_low_comic]
print(f"Amount of items with less than average amount of comic reviews: {low_reviews.shape[0]}")

Amount of items with less than average amount of comic reviews: 4129


In [148]:
# Re-point the 'table' view at the movie/TV reviews and pull the per-item row counts
# into pandas. Despite the variable name, each row here is an item, not a reviewer.
movie_reviews_updated.createOrReplaceTempView('table')
mtv_low_reviewers = spark.sql(query).toPandas()

In [149]:
# Keep only the movie/TV items whose review count is at or below the mean count.
mtv_count_mean = mtv_low_reviewers['count'].mean()
is_low_mtv = mtv_low_reviewers['count'] <= mtv_count_mean
low_mtv_reviews = mtv_low_reviewers[is_low_mtv]
print(f"Amount of items with less than average amount of movie reviews: {low_mtv_reviews.shape[0]}")

Amount of items with less than average amount of movie reviews: 26036


In [150]:
# Show the movie/TV items with the highest review counts (the query sorts descending).
mtv_low_reviewers.head()

Unnamed: 0,item_id,count
0,234044,170
1,579744,163
2,1900344,141
3,531144,140
4,902244,133


In [151]:
# De-duplicated list of every item id (comic or movie/TV) flagged as low-review.
low_comic_ids = set(low_reviews['item_id'].tolist())
low_item = list(low_comic_ids.union(low_mtv_reviews['item_id'].tolist()))
len(low_item)

30165

In [152]:
# Stack the per-item review counts of both categories and hand them back to Spark.
all_review_counts_df = pd.concat([mtv_low_reviewers, comic_low_reviewers])
all_review_counts = spark.createDataFrame(all_review_counts_df)

# Combine comic and movie/TV reviews, then attach each item's review count
# (left join, so every review row is kept).
all_reviews = (
    comic_reviews_updated
    .union(movie_reviews_updated)
    .join(all_review_counts, on='item_id', how='left')
)

In [153]:
# Preview the combined reviews with the joined count column.
all_reviews.show(5)

+-------+----------+--------------------+-------+--------------+--------------------+-------+-----+
|item_id|      asin|               imUrl|overall|    reviewerID|               title|user_id|count|
+-------+----------+--------------------+-------+--------------+--------------------+-------+-----+
|1003644|B0000ADXG8|http://ecx.images...|    1.0| A9XKE4OE48BNK|Doctor Who - The ...| 460400|    6|
|1003644|B0000ADXG8|http://ecx.images...|    4.0| AN9J46667D80O|Doctor Who - The ...| 450600|    6|
|1003644|B0000ADXG8|http://ecx.images...|    5.0|A2P49WD75WHAG5|Doctor Who - The ...| 564200|    6|
|1003644|B0000ADXG8|http://ecx.images...|    3.0| A9ZAL2YHXSMFF|Doctor Who - The ...| 805400|    6|
|1003644|B0000ADXG8|http://ecx.images...|    4.0|A27P0MW8TE1JQP|Doctor Who - The ...| 384700|    6|
|1003644|B0000ADXG8|http://ecx.images...|    4.0|A3TRXPRUYVOLSM|Doctor Who - The ...| 859500|    6|
| 101122|1401219349|http://ecx.images...|    2.0|A1ZAJCZHFV7OZD|Superman: Past an...| 314300|    1|


In [154]:
# Drop every review whose item is in the low-review list.
is_low_item = F.col('item_id').isin(low_item)
all_reviews_ready = all_reviews.filter(~is_low_item)

In [157]:
# Seeing the unique count of users and items (one distinct-count per column).
# F.col is used because a bare `col` is never imported in this notebook.
distinct_counts = [F.countDistinct(F.col(c)).alias(c) for c in all_reviews_ready.columns]
all_reviews_ready.agg(*distinct_counts).show()

+-------+----+-----+-------+----------+-----+-------+-----+
|item_id|asin|imUrl|overall|reviewerID|title|user_id|count|
+-------+----+-----+-------+----------+-----+-------+-----+
|   7521|7521| 7361|      5|      8545| 5293|   8545|  105|
+-------+----+-----+-------+----------+-----+-------+-----+



In [156]:
# Exporting to temporarily preserve. The frame is collapsed to one partition before
# being written as JSON under the path "data/all_reviews".
# NOTE(review): a later cell reads 'data/all_reviews.json', which is a different
# path -- presumably the written file was renamed or moved by hand; confirm before
# relying on Restart & Run All.
all_reviews_ready.repartition(1).write.json("data/all_reviews")

### Case of the missing titles
Even though I had limited my set to movies/tv that have metadata, I found that some of the metadata is missing the title for movies, which is a pretty important piece for my further development!  Since I do have the ASINs, I wrote a quick function to find those missing titles by querying Amazon and returning the title of the first result.

In [3]:
# Reload the previously exported reviews (this rebinds `all_reviews`).
# NOTE(review): the export cell writes to "data/all_reviews" (no .json suffix) --
# confirm this path really points at that export.
all_reviews = spark.read.json('data/all_reviews.json')

In [4]:
# Collect asin/title pairs into pandas, then list each ASIN whose title is missing once.
missing_titles = all_reviews.select(['asin', 'title']).toPandas()
title_is_missing = missing_titles['title'].isna()
asins_without_title = missing_titles.loc[title_is_missing, 'asin'].tolist()
missing_asins = list(set(asins_without_title))

In [5]:
# Using a function to scrape Amazon for the title (project helper from batcave).
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so
# missing_title_info was presumably never populated in the recorded run.
missing_title_info = get_missing_titles(missing_asins)

KeyboardInterrupt: 

### Modeling and testing

In [8]:
# Ask Spark to keep all_reviews cached, since it is queried repeatedly below.
all_reviews.persist()

DataFrame[asin: string, count: bigint, imUrl: string, item_id: string, overall: double, reviewerID: string, title: string, user_id: string]

In [10]:
# The id columns are read back as strings, so cast them to integers for ALS;
# the rating, title and image URL are carried along unchanged.
# F.col is used because a bare `col` is never imported in this notebook.
als_ready = all_reviews.select(
    F.col("user_id").cast(IntegerType()),
    F.col("item_id").cast(IntegerType()),
    F.col("overall"),
    F.col("title"),
    F.col("imUrl"),
)

In [11]:
# 80/20 train/test split; the seed is fixed so the split (and therefore the reported
# metrics' input data) is the same on every Restart & Run All.
(train, test) = als_ready.randomSplit([.8, .2], seed=42)

In [12]:
# Configure the ALS recommender (rank-3 factors, non-negative) and fit it on the
# training split.
als_params = dict(
    rank=3,
    regParam=0.1,
    userCol='user_id',
    itemCol='item_id',
    ratingCol='overall',
    nonnegative=True,
)
als = ALS(**als_params)

als_model = als.fit(train)

In [13]:
# Getting predictions for test split
test_pred = als_model.transform(test)

# Filling in NaN values with average score.
# The filled column is assigned back explicitly: calling fillna(..., inplace=True) on
# a column selection is chained assignment and is not guaranteed to update the frame
# (it does not under pandas copy-on-write).
test_pred_df = test_pred.toPandas()
test_pred_df['prediction'] = test_pred_df['prediction'].fillna(4)
test_pred = spark.createDataFrame(test_pred_df)

In [14]:
# Both evaluators compare the "prediction" column against the "overall" rating;
# they differ only in the metric (RMSE vs. MAE).
eval_columns = dict(labelCol="overall", predictionCol="prediction")

evaluator = RegressionEvaluator(metricName="rmse", **eval_columns)
evaluator_2 = RegressionEvaluator(metricName="mae", **eval_columns)

rmse_test, mae_test = (ev.evaluate(test_pred) for ev in (evaluator, evaluator_2))
print(f"Test RMSE: {rmse_test}")
print(f"Test MAE: {mae_test}")

Test RMSE: 1.201889423210076
Test MAE: 0.909753553975692


In [152]:
def get_recommendations(new_user, new_user_df):
    """Fit ALS on all reviews plus a new user's ratings and print up to 3 comic titles.

    Parameters
    ----------
    new_user : int
        The user_id used in ``new_user_df``; recommendations are looked up for it.
    new_user_df : pandas.DataFrame
        The new user's ratings. Must contain at least the columns ``user_id``,
        ``item_id`` and ``overall``.

    Returns
    -------
    None
        Titles are printed; a message is printed instead when the fitted model
        produces no recommendation row for ``new_user``.

    Notes
    -----
    Reads the notebook-level ``spark`` session and ``all_reviews`` DataFrame.
    """

    def _als_columns(df):
        # Select by NAME and cast, so both frames have identical column order and
        # types. Spark's union() matches columns by position, and the pandas frame's
        # column order depends on how it was built, so a plain union could pair
        # user_id with title, item_id with overall, and so on.
        return df.select(F.col("user_id").cast(IntegerType()),
                         F.col("item_id").cast(IntegerType()),
                         F.col("overall").cast("double"))

    new_user_spark = spark.createDataFrame(new_user_df)
    als_ready = _als_columns(all_reviews).union(_als_columns(new_user_spark))

    als = ALS(rank=3, regParam=0.1,
              userCol='user_id', itemCol='item_id',
              ratingCol='overall', nonnegative=True)
    als_model = als.fit(als_ready)

    user_recommend = als_model.recommendForAllUsers(30)
    recs_for_user = user_recommend.where(user_recommend.user_id == new_user).take(1)
    if not recs_for_user:
        # take(1) returns an empty list when the user has no recommendation row.
        print(f"No recommendations available for user {new_user}")
        return

    # Keep only the recommended ids whose digits end in '22' (the comic-book ids).
    all_comics = [reco[0] for reco in recs_for_user[0]['recommendations']
                  if str(reco[0]).endswith('22')]

    # De-duplicate titles on the Spark side and fetch at most three of them.
    comic_titles = (all_reviews.filter(F.col('item_id').isin(all_comics))
                               .select('title')
                               .distinct()
                               .limit(3)
                               .collect())
    for comic in comic_titles:
        print(comic[0])

In [None]:
# Distinct (item_id, asin, title, count) rows of the filtered reviews, pulled into pandas.
# Note: this rebinds the notebook-level `query` variable and re-points the temp view
# 'table' at all_reviews_ready.
query = "SELECT DISTINCT item_id, asin, title, count FROM table"
all_reviews_ready.createOrReplaceTempView('table')
distinct_items = spark.sql(query).toPandas()

In [31]:
# Items that have no title, most-reviewed first.
distinct_items[distinct_items['title'].isna()].sort_values('count', ascending=False).head()

Unnamed: 0,item_id,asin,title,count
768,234044,B001KVZ6HK,,170
3708,579744,B008JFUPFI,,163
1027,531144,B009934S5M,,140
5212,1027144,B005LAIHXQ,,128
4786,817844,B0049P1VHS,,116


In [145]:
def _prompt_for_rating():
    """Ask for a whole-number rating from 0 to 5; return None if the user types 'skip'."""
    while True:
        answer = input("How would you rate this movie? (0-5): ").strip()
        if answer == 'skip':
            return None
        try:
            value = int(answer)
        except ValueError:
            value = None
        if value is not None and 0 <= value <= 5:
            return value
        # Re-prompt instead of crashing and losing the ratings entered so far.
        print("Please enter a whole number from 0 to 5, or 'skip'.")


def get_user_reviews(user_id=101, n_movies=10, top_n=500):
    """Interactively collect a user's movie ratings and return them as a DataFrame.

    A random sample of ``n_movies`` titles is drawn from the ``top_n`` most-reviewed
    movie/TV items (ids ending in '44') that have a title. For each one the user
    types a rating from 0 to 5, or 'skip' to leave it out.

    Parameters
    ----------
    user_id : int, default 101
        The user_id stored on every returned rating.
    n_movies : int, default 10
        How many titles to ask about (capped at the number available).
    top_n : int, default 500
        Size of the most-reviewed pool the sample is drawn from.

    Returns
    -------
    pandas.DataFrame
        One row per rated movie with the columns user_id, overall, item_id, count
        and title. Empty when every movie was skipped.

    Notes
    -----
    Reads the notebook-level ``spark`` session and ``all_reviews`` DataFrame, and
    re-points the temp view 'table' at ``all_reviews``.
    """
    query = """
        SELECT 
            DISTINCT CAST(item_id as string) as item_id
        ,   title
        ,   count
        FROM 
            table
        WHERE 
            item_id LIKE '%44'"""
    all_reviews.createOrReplaceTempView('table')
    get_movies = spark.sql(query).toPandas()

    # Items without a title cannot be shown to the user, so leave them out of the pool
    # before ranking by review count.
    popular_movies = (get_movies.dropna(subset=['title'])
                                .sort_values('count', ascending=False)[:top_n])
    # Never ask for more rows than exist; DataFrame.sample raises otherwise.
    movie_rand_sample = popular_movies.sample(n=min(n_movies, len(popular_movies)))

    reviews = []
    for _, movie in movie_rand_sample.iterrows():
        print(movie['title'])
        rating = _prompt_for_rating()
        clear_output()
        if rating is None:
            continue
        reviews.append({'user_id': user_id, 'overall': rating,
                        'item_id': int(movie['item_id']), 'count': movie['count'],
                        'title': movie['title']})
    return pd.DataFrame(reviews)

In [146]:
# Interactively collect ratings for the demo user (prompts for input).
new_user = get_user_reviews()

In [153]:
# Print comic recommendations for user 101, the id assigned by get_user_reviews.
get_recommendations(101, new_user)

Battle Chasers Anthology HC
ASTRO CITY: THE DARK AGE, BOOK 1
Birds of Prey, Vol. 1: Of Like Minds


In [185]:
def get_comic_recommendations(user_id):
    """Show the rows of ``als_ready`` for the comics recommended to ``user_id``.

    Looks up the user's row in the notebook-level ``recommendations`` DataFrame,
    keeps the recommended item ids whose digits end in '22' (the comic-book ids),
    and displays the matching rows of ``als_ready``.

    NOTE(review): ``recommendations`` is not defined anywhere in the visible
    notebook; it is presumably the output of ``recommendForAllUsers`` from an
    earlier session -- confirm, or define it before calling this function.

    Returns None (the result of ``DataFrame.show()``); prints a message instead
    when the user has no recommendation row.
    """
    recs_for_user = recommendations.where(recommendations.user_id == user_id).take(1)
    if not recs_for_user:
        # take(1) returns an empty list when the user is not in `recommendations`.
        print(f"No recommendations found for user {user_id}")
        return None
    # Second field of the row holds the (item id, score) pairs; keep the ids.
    all_recs = [rec[0] for rec in recs_for_user[0][1]]
    all_comic_recs = [item for item in all_recs if str(item).endswith('22')]
    # F.col is used because a bare `col` is never imported in this notebook.
    return als_ready.filter(F.col('item_id').isin(all_comic_recs)).show()

In [186]:
# Look up comic recommendations for an existing user id.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt.
get_comic_recommendations(125700)

KeyboardInterrupt: 

In [103]:
# Load a CSV of comic ASINs, using its first column as the index.
comic_names = pd.read_csv('data/all_comic_asin.csv', index_col=0)

In [322]:
# List of item ids taken from temp_user_view.
# NOTE(review): temp_user_view is not defined anywhere in the visible notebook, so
# this cell fails on a fresh kernel -- define it or remove the cell.
reviewed = temp_user_view.item_id.tolist()

In [None]:
def new_user_recs(user_id,new_ratings,rating_df,movie_title_df,num_recs):
    """Fit ALS on existing ratings plus a new user's ratings and print their top picks.

    Parameters
    ----------
    user_id :
        Id of the new user, as it appears in ``new_ratings``.
    new_ratings :
        Rows for the new user, laid out in the same column order as ``rating_df``.
    rating_df :
        Spark DataFrame of existing ratings; must contain the columns ``userId``,
        ``movieId`` and ``rating`` that the ALS model is configured with.
    movie_title_df :
        Passed through to ``name_retriever`` to resolve a movie id to a name.
    num_recs : int
        Number of recommendations to generate per user.

    NOTE(review): ``name_retriever`` is not defined anywhere in the visible
    notebook, and the userId/movieId/rating column names differ from the
    user_id/item_id/overall columns used elsewhere here -- this function looks
    like it was carried over from another project; confirm before use.
    """
    # turn the new_recommendations list into a spark DataFrame
    new_user_ratings = spark.createDataFrame(new_ratings,rating_df.columns)

    # combine the new ratings df with the rating_df
    movie_ratings_combined = rating_df.union(new_user_ratings)

    # create an ALS model and fit it on the full combined data (no train/test split)
    als = ALS(maxIter=5,rank=50, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
    model = als.fit(movie_ratings_combined)

    # make recommendations for all users using the recommendForAllUsers method
    recommendations = model.recommendForAllUsers(num_recs)

    # get recommendations specifically for the new user that has been added to the DataFrame
    recs_for_user = recommendations.where(recommendations.userId == user_id).take(1)
    if not recs_for_user:
        # take(1) returns an empty list when the user has no recommendation row.
        print('No recommendations found for user {}'.format(user_id))
        return

    for ranking, (movie_id, rating) in enumerate(recs_for_user[0]['recommendations']):
        movie_string = name_retriever(movie_id,movie_title_df)
        print('Recommendation {}: {}  | predicted score :{}'.format(ranking+1,movie_string,rating))

In [None]:
# Hand-picked list of product codes.
# NOTE(review): these look like Amazon ASINs rather than titles, and the list is
# not used anywhere in the visible notebook -- confirm it is still needed.
titles = ['B0034G4P80', '0793906091', 'B002VPE1AW', 'B004LWZWFQ', 'B008JFUQZ2']

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# coldStartStrategy='drop' matters for cross-validation: with the default ('nan'),
# any user or item that lands only in a validation fold gets a NaN prediction, which
# makes the fold's metric NaN and leaves the parameter combinations incomparable.
als = ALS(userCol='user_id', itemCol='item_id', ratingCol='overall', nonnegative=True,
          coldStartStrategy='drop')

# No metricName is given, so the evaluator's default metric is used.
reg_test = RegressionEvaluator(predictionCol='prediction', labelCol='overall')

# create the parameter grid: 3 regParam x 3 rank x 4 maxIter = 36 combinations
params = ParamGridBuilder().addGrid(als.regParam, [0.01,0.001,0.1])\
                           .addGrid(als.rank, [4,10,50])\
                           .addGrid(als.maxIter, [5,10,15,20]).build()

## instantiating crossvalidator estimator (up to 4 models are fitted in parallel)
cv = CrossValidator(estimator=als, estimatorParamMaps=params,evaluator=reg_test,parallelism=4)
best_model = cv.fit(train)